In [19]:
!pip install --quiet tensorflow opencv-python pandas scikit-learn
!pip install scikit-image scikit-plot

Collecting scikit-image
  Downloading scikit_image-0.21.0-cp38-cp38-win_amd64.whl.metadata (14 kB)
Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting networkx>=2.8 (from scikit-image)
  Downloading networkx-3.1-py3-none-any.whl.metadata (5.3 kB)
Collecting imageio>=2.27 (from scikit-image)
  Downloading imageio-2.35.1-py3-none-any.whl.metadata (4.9 kB)
Collecting tifffile>=2022.8.12 (from scikit-image)
  Downloading tifffile-2023.7.10-py3-none-any.whl.metadata (31 kB)
Collecting PyWavelets>=1.1.1 (from scikit-image)
  Downloading PyWavelets-1.4.1-cp38-cp38-win_amd64.whl.metadata (1.9 kB)
Collecting lazy_loader>=0.2 (from scikit-image)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Downloading scikit_image-0.21.0-cp38-cp38-win_amd64.whl (22.7 MB)
   ---------------------------------------- 0.0/22.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/22.7 MB ? eta -:--:--
   ---------------------------------

1. SETUP AND IMPORTS

In [2]:
#import libraries
import os
import numpy as np
import pandas as pd
import cv2
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from skimage.feature import hog
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, BatchNormalization
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import scikitplot as skplt


2. CONFIGURATION

In [3]:
class Config:
    BASE_PATH = "C:/Users/HP/Desktop/open-ai-x-data-vision-pillar-spring-2025"
    IMG_SIZE = 224
    RANDOM_STATE = 42
    TEST_SIZE = 0.2
    BATCH_SIZE = 32
    EPOCHS = 30
    
    # Feature extraction
    HOG_ORIENTATIONS = 8
    HOG_PIXELS_PER_CELL = (16, 16)
    HIST_BINS = [8, 8, 8]
    
    # Ensemble weights
    NN_WEIGHT = 0.6
    RF_WEIGHT = 0.25
    SVM_WEIGHT = 0.15

3.  DATA LOADER

In [4]:
class DataLoader:
    def __init__(self):
        self.train_df = pd.read_csv(os.path.join(Config.BASE_PATH, "train.csv"))
        self.test_df = pd.read_csv(os.path.join(Config.BASE_PATH, "test.csv"))
        self.le = LabelEncoder()
        
    def _extract_features(self, img):
        """Extract HOG and color histogram features"""
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        hog_features = hog(gray, 
                          orientations=Config.HOG_ORIENTATIONS,
                          pixels_per_cell=Config.HOG_PIXELS_PER_CELL,
                          cells_per_block=(1, 1))
        
        hist = cv2.calcHist([img], [0, 1, 2], None, 
                           Config.HIST_BINS, [0, 256, 0, 256, 0, 256])
        return np.hstack([hog_features, hist.flatten()])
    
    def load_data(self, extract_features=False):
        """Load and preprocess images"""
        def _process_df(df, label=True):
            X, X_features, y = [], [], []
            for _, row in df.iterrows():
                img = cv2.imread(os.path.join(Config.BASE_PATH, row['filename']))
                if img is None:
                    continue
                    
                img_resized = cv2.resize(img, (Config.IMG_SIZE, Config.IMG_SIZE))
                X.append(preprocess_input(img_resized))
                
                if extract_features:
                    X_features.append(self._extract_features(img_resized))
                
                if label:
                    y.append(row['class'])
                    
            return (np.array(X), np.array(X_features), np.array(y)) if extract_features else np.array(X)
        
        # Load training data
        if extract_features:
            X_train, X_train_features, y_train = _process_df(self.train_df, label=True)
            X_test, X_test_features, _ = _process_df(self.test_df, label=False)
            
            # Encode labels
            y_encoded = self.le.fit_transform(y_train)
            return {
                'X_train': X_train,
                'X_train_features': X_train_features,
                'y_train': y_encoded,
                'X_test': X_test,
                'X_test_features': X_test_features
            }
        else:
            X_train, y_train = _process_df(self.train_df, label=True)
            X_test = _process_df(self.test_df, label=False)
            y_encoded = self.le.fit_transform(y_train)
            return {
                'X_train': X_train,
                'y_train': y_encoded,
                'X_test': X_test
            }

4.  MODEL DEFINITIONS

In [5]:
class TraditionalML:
    @staticmethod
    def train_random_forest(X_train, y_train):
        print("\n=== Training Random Forest ===")
        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=Config.RANDOM_STATE
        )
        model.fit(X_train, y_train)
        return model
    
    @staticmethod
    def train_svm(X_train, y_train):
        print("\n=== Training SVM ===")
        model = SVC(
            kernel='rbf',
            C=10,
            gamma='scale',
            probability=True,
            random_state=Config.RANDOM_STATE
        )
        model.fit(X_train, y_train)
        return model
    
    @staticmethod
    def run_kmeans(X, n_clusters):
        print("\n=== Running K-Means Clustering ===")
        model = KMeans(
            n_clusters=n_clusters,
            random_state=Config.RANDOM_STATE
        )
        return model.fit_predict(X)

class NeuralNetwork:
    @staticmethod
    def build_model(input_shape, num_classes):
        base_model = EfficientNetB0(
            include_top=False,
            weights='imagenet',
            input_shape=input_shape
        )
        base_model.trainable = True
        
        x = base_model.output
        x = GlobalAveragePooling2D()(x)
        x = BatchNormalization()(x)
        x = Dropout(0.5)(x)
        x = Dense(1200, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(0.5)(x)
        outputs = Dense(num_classes, activation='softmax')(x)
        
        model = Model(inputs=base_model.input, outputs=outputs)
        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )
        return model
    
    @staticmethod
    def get_data_augmenter():
        return ImageDataGenerator(
            rotation_range=40,
            zoom_range=0.25,
            width_shift_range=0.2,
            height_shift_range=0.2,
            horizontal_flip=True,
            brightness_range=[0.7, 1.3],
            shear_range=0.2,
            fill_mode='nearest'
        )

5.  EVALUATION UTILS

In [6]:
class Evaluation:
    @staticmethod
    def evaluate_model(model, X_val, y_val, model_name):
        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)
        print(f"{model_name} Validation Accuracy: {acc:.4f}")
        return acc
    
    @staticmethod
    def plot_clusters(X, cluster_labels):
        skplt.metrics.plot_silhouette(X, cluster_labels)
        plt.title('Cluster Quality Analysis')
        plt.savefig('cluster_analysis.png')
        plt.close()

6.  MAIN PIPELINE

In [7]:
def main():
    # Load data with features
    loader = DataLoader()
    data = loader.load_data(extract_features=True)
    
    # Split data
    X_train_nn = data['X_train']
    y_train = data['y_train']
    X_test_nn = data['X_test']
    
    # Traditional ML split
    X_train_ml, X_val_ml, y_train_ml, y_val_ml = train_test_split(
        data['X_train_features'], 
        y_train,
        test_size=Config.TEST_SIZE,
        stratify=y_train,
        random_state=Config.RANDOM_STATE
    )
    
    # 1. Train traditional models
    rf_model = TraditionalML.train_random_forest(X_train_ml, y_train_ml)
    svm_model = TraditionalML.train_svm(X_train_ml, y_train_ml)
    
    # Evaluate traditional models
    Evaluation.evaluate_model(rf_model, X_val_ml, y_val_ml, "Random Forest")
    Evaluation.evaluate_model(svm_model, X_val_ml, y_val_ml, "SVM")
    
    # 2. Clustering analysis
    cluster_labels = TraditionalML.run_kmeans(data['X_train_features'], len(np.unique(y_train)))
    Evaluation.plot_clusters(data['X_train_features'], cluster_labels)
    
    # 3. Neural Network
    y_cat = to_categorical(y_train)
    X_train_nn, X_val_nn, y_train_nn, y_val_nn = train_test_split(
        X_train_nn, y_cat,
        test_size=Config.TEST_SIZE,
        stratify=y_train,
        random_state=Config.RANDOM_STATE
    )
    
    nn_model = NeuralNetwork.build_model(
        input_shape=(Config.IMG_SIZE, Config.IMG_SIZE, 3),
        num_classes=y_cat.shape[1]
    )
    
    train_gen = NeuralNetwork.get_data_augmenter()
    train_gen.fit(X_train_nn)
    
    history = nn_model.fit(
        train_gen.flow(X_train_nn, y_train_nn, batch_size=Config.BATCH_SIZE),
        epochs=Config.EPOCHS,
        validation_data=(X_val_nn, y_val_nn),
        callbacks=[
            EarlyStopping(patience=10, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)
        ],
        verbose=1
    )
    
    # 4. Ensemble predictions
    nn_probs = nn_model.predict(X_test_nn)
    rf_probs = rf_model.predict_proba(data['X_test_features'])
    svm_probs = svm_model.predict_proba(data['X_test_features'])
    
    ensemble_probs = (
        Config.NN_WEIGHT * nn_probs +
        Config.RF_WEIGHT * rf_probs +
        Config.SVM_WEIGHT * svm_probs
    )
    
    final_preds = loader.le.inverse_transform(np.argmax(ensemble_probs, axis=1))
    
    # Save results
    submission = pd.DataFrame({
        "id": loader.test_df["id"],
        "label": final_preds
    })
    submission.to_csv("final_submission12.csv", index=False)
    print("\n✅ Final ensemble submission saved!")

  
  # Save all models
    os.makedirs(os.path.join(Config.BASE_PATH, 'models'), exist_ok=True)
    
    nn_model.save(os.path.join(Config.BASE_PATH, 'models', 'efficientnet_nn_model.h5'))
   
    
    print("\n✅  model saved to 'models' folder!")


if __name__ == "__main__":
    main()


=== Training Random Forest ===

=== Training SVM ===
Random Forest Validation Accuracy: 1.0000
SVM Validation Accuracy: 1.0000

=== Running K-Means Clustering ===


  super()._check_params_vs_input(X, default_n_init=10)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30

✅ Final ensemble submission saved!


  saving_api.save_model(



✅ All models saved to 'models' folder!
