In [1]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [2]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("GPU available:", tf.config.list_physical_devices('GPU'))


TensorFlow version: 2.18.0
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV,
                                   RandomizedSearchCV, StratifiedKFold)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                           f1_score, precision_recall_fscore_support, make_scorer)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier
from scipy.stats import uniform, randint
import time
import warnings
warnings.filterwarnings('ignore')

In [4]:
np.random.seed(42)
tf.random.set_seed(42)

In [5]:
df = pd.read_csv('/content/insat_hyderabad_cloud_dataset_full_5months.csv')

In [6]:
if df.isna().sum().sum() > 0:
    df=df.dropna()

In [7]:
physical_features = ['BT_3.9', 'BT_6.7', 'BT_10.8', 'BT_12.0', 'BTD_3.9-10.8',
                    'BTD_12-10.8', 'BTD_6.7-10.8', 'BTD_TIR1-TIR2', 'BTD_3channel']

In [8]:
X = df[physical_features].copy()
y = df['cloud_type'].copy()

In [9]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)
n_classes = len(le.classes_)

In [10]:
df.isna().sum()

Unnamed: 0,0
BT_3.9,0
BT_6.7,0
BT_10.8,0
BT_12.0,0
BTD_3.9-10.8,0
BTD_12-10.8,0
BTD_6.7-10.8,0
BTD_TIR1-TIR2,0
BTD_3channel,0
Wind_Gust_Estimate,0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

In [14]:
print(f"Training set shape after SMOTE: {X_train_balanced.shape}")
print(f"Classes: {le.classes_}")

Training set shape after SMOTE: (19555, 9)
Classes: ['Cirrus' 'Clear' 'High_thick' 'Low_thick' 'Partial']


In [15]:
df.head()

Unnamed: 0,BT_3.9,BT_6.7,BT_10.8,BT_12.0,BTD_3.9-10.8,BTD_12-10.8,BTD_6.7-10.8,BTD_TIR1-TIR2,BTD_3channel,Wind_Gust_Estimate,Latitude,Longitude,cloud_label,cloud_mask,timestamp,date,cloud_type
0,294.2,254.79,294.04,293.74,0.16,-0.3,-39.25,0.3,-38.65,11.08,17.38,78.47,0.0,0.0,3SIMG_01JAN2025_1430_L1C_ASIA_MER_V01R00.h5,20250101,Clear
1,293.55,255.12,293.6,293.37,-0.05,-0.23,-38.48,0.23,-38.02,11.18,17.38,78.47,0.0,0.0,3SIMG_01JAN2025_1500_L1C_ASIA_MER_V01R00.h5,20250101,Clear
2,292.72,255.03,293.31,293.64,-0.59,0.33,-38.28,-0.33,-38.94,11.03,17.38,78.47,0.0,0.0,3SIMG_01JAN2025_1530_L1C_ASIA_MER_V01R00.h5,20250101,Clear
3,293.08,254.87,293.24,292.87,-0.16,-0.37,-38.37,0.37,-37.63,11.25,17.38,78.47,0.0,0.0,3SIMG_01JAN2025_1600_L1C_ASIA_MER_V01R00.h5,20250101,Clear
4,292.47,255.02,293.19,293.32,-0.72,0.13,-38.17,-0.13,-38.43,11.12,17.38,78.47,0.0,0.0,3SIMG_01JAN2025_1630_L1C_ASIA_MER_V01R00.h5,20250101,Clear


In [16]:
values, counts = np.unique(y_train_balanced, return_counts=True)
for val, count in zip(values, counts):
    print(f"Class {val}: {count}")

Class 0: 3911
Class 1: 3911
Class 2: 3911
Class 3: 3911
Class 4: 3911


In [17]:
print("\n2. CV SEARCH SETUP")
print("-" * 30)
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = 'f1_weighted'
cv_results = {}
best_models = {}


2. CV SEARCH SETUP
------------------------------


In [18]:
print("\n3.1 XGBoost with GridSearchCV")
print("-" * 30)

start_time = time.time()

xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

xgb_base = xgb.XGBClassifier(
    objective='multi:softprob',
    random_state=42,
    eval_metric='mlogloss',
    n_jobs=1
)

xgb_grid = GridSearchCV(
    estimator=xgb_base,
    param_grid=xgb_param_grid,
    cv=cv_folds,
    scoring=scoring,
    n_jobs=-1,
    verbose=1
)

xgb_grid.fit(X_train_balanced, y_train_balanced)
xgb_best = xgb_grid.best_estimator_


xgb_pred = xgb_best.predict(X_test_scaled)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
xgb_f1 = f1_score(y_test, xgb_pred, average='weighted')

xgb_time = time.time() - start_time

cv_results['XGBoost'] = {
    'best_params': xgb_grid.best_params_,
    'best_cv_score': xgb_grid.best_score_,
    'test_accuracy': xgb_accuracy,
    'test_f1': xgb_f1,
    'training_time': xgb_time,
    'predictions': xgb_pred
}
best_models['XGBoost'] = xgb_best

print(f"XGBoost Best Parameters: {xgb_grid.best_params_}")
print(f"XGBoost Best CV Score: {xgb_grid.best_score_:.4f}")
print(f"XGBoost Test Accuracy: {xgb_accuracy:.4f}")
print(f"XGBoost Test F1-Score: {xgb_f1:.4f}")
print(f"XGBoost Training Time: {xgb_time:.2f} seconds")


3.1 XGBoost with GridSearchCV
------------------------------
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
XGBoost Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 200, 'subsample': 0.8}
XGBoost Best CV Score: 0.9154
XGBoost Test Accuracy: 0.7595
XGBoost Test F1-Score: 0.7948
XGBoost Training Time: 2035.11 seconds


In [None]:
print("\n3.2 Random Forest with RandomizedSearchCV")
print("-" * 30)

start_time = time.time()

rf_param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_base = RandomForestClassifier(random_state=42, n_jobs=1)

rf_random = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_param_dist,
    n_iter=50,  # Number of parameter settings sampled
    cv=cv_folds,
    scoring=scoring,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

rf_random.fit(X_train_balanced, y_train_balanced)
rf_best = rf_random.best_estimator_

# Evaluate on test set
rf_pred = rf_best.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred, average='weighted')

rf_time = time.time() - start_time

cv_results['Random Forest'] = {
    'best_params': rf_random.best_params_,
    'best_cv_score': rf_random.best_score_,
    'test_accuracy': rf_accuracy,
    'test_f1': rf_f1,
    'training_time': rf_time,
    'predictions': rf_pred
}
best_models['Random Forest'] = rf_best

print(f"Random Forest Best Parameters: {rf_random.best_params_}")
print(f"Random Forest Best CV Score: {rf_random.best_score_:.4f}")
print(f"Random Forest Test Accuracy: {rf_accuracy:.4f}")
print(f"Random Forest Test F1-Score: {rf_f1:.4f}")
print(f"Random Forest Training Time: {rf_time:.2f} seconds")



3.2 Random Forest with RandomizedSearchCV
------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
print("\n3.3 SVM with GridSearchCV")
print("-" * 30)

start_time = time.time()

svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

svm_base = SVC(random_state=42, probability=True)

svm_grid = GridSearchCV(
    estimator=svm_base,
    param_grid=svm_param_grid,
    cv=cv_folds,
    scoring=scoring,
    n_jobs=-1,
    verbose=1
)

svm_grid.fit(X_train_balanced, y_train_balanced)
svm_best = svm_grid.best_estimator_


svm_pred = svm_best.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred, average='weighted')

svm_time = time.time() - start_time

cv_results['SVM'] = {
    'best_params': svm_grid.best_params_,
    'best_cv_score': svm_grid.best_score_,
    'test_accuracy': svm_accuracy,
    'test_f1': svm_f1,
    'training_time': svm_time,
    'predictions': svm_pred
}
best_models['SVM'] = svm_best

print(f"SVM Best Parameters: {svm_grid.best_params_}")
print(f"SVM Best CV Score: {svm_grid.best_score_:.4f}")
print(f"SVM Test Accuracy: {svm_accuracy:.4f}")
print(f"SVM Test F1-Score: {svm_f1:.4f}")
print(f"SVM Training Time: {svm_time:.2f} seconds")

In [None]:
print("\n3.4 Logistic Regression with GridSearchCV")
print("-" * 30)

start_time = time.time()

lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000, 2000, 5000]
}

# Handle incompatible combinations
lr_param_grid_filtered = []
for C in lr_param_grid['C']:
    for penalty in lr_param_grid['penalty']:
        for solver in lr_param_grid['solver']:
            for max_iter in lr_param_grid['max_iter']:
                # Filter incompatible combinations
                if penalty == 'elasticnet' and solver != 'saga':
                    continue
                if penalty == 'l1' and solver == 'liblinear':
                    lr_param_grid_filtered.append({
                        'C': C, 'penalty': penalty, 'solver': solver, 'max_iter': max_iter
                    })
                elif penalty != 'l1':
                    lr_param_grid_filtered.append({
                        'C': C, 'penalty': penalty, 'solver': solver, 'max_iter': max_iter
                    })

lr_base = LogisticRegression(random_state=42, multi_class='ovr')

lr_grid = GridSearchCV(
    estimator=lr_base,
    param_grid=lr_param_grid_filtered,
    cv=cv_folds,
    scoring=scoring,
    n_jobs=-1,
    verbose=1
)

lr_grid.fit(X_train_balanced, y_train_balanced)
lr_best = lr_grid.best_estimator_

# Evaluate on test set
lr_pred = lr_best.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred, average='weighted')

lr_time = time.time() - start_time

cv_results['Logistic Regression'] = {
    'best_params': lr_grid.best_params_,
    'best_cv_score': lr_grid.best_score_,
    'test_accuracy': lr_accuracy,
    'test_f1': lr_f1,
    'training_time': lr_time,
    'predictions': lr_pred
}
best_models['Logistic Regression'] = lr_best

print(f"Logistic Regression Best Parameters: {lr_grid.best_params_}")
print(f"Logistic Regression Best CV Score: {lr_grid.best_score_:.4f}")
print(f"Logistic Regression Test Accuracy: {lr_accuracy:.4f}")
print(f"Logistic Regression Test F1-Score: {lr_f1:.4f}")
print(f"Logistic Regression Training Time: {lr_time:.2f} seconds")

In [None]:
print("\n3.5 K-Nearest Neighbors with GridSearchCV")
print("-" * 30)

start_time = time.time()

knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2]  # For minkowski distance
}

knn_base = KNeighborsClassifier()

knn_grid = GridSearchCV(
    estimator=knn_base,
    param_grid=knn_param_grid,
    cv=cv_folds,
    scoring=scoring,
    n_jobs=-1,
    verbose=1
)

knn_grid.fit(X_train_balanced, y_train_balanced)
knn_best = knn_grid.best_estimator_


knn_pred = knn_best.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred, average='weighted')

knn_time = time.time() - start_time

cv_results['KNN'] = {
    'best_params': knn_grid.best_params_,
    'best_cv_score': knn_grid.best_score_,
    'test_accuracy': knn_accuracy,
    'test_f1': knn_f1,
    'training_time': knn_time,
    'predictions': knn_pred
}
best_models['KNN'] = knn_best

print(f"KNN Best Parameters: {knn_grid.best_params_}")
print(f"KNN Best CV Score: {knn_grid.best_score_:.4f}")
print(f"KNN Test Accuracy: {knn_accuracy:.4f}")
print(f"KNN Test F1-Score: {knn_f1:.4f}")
print(f"KNN Training Time: {knn_time:.2f} seconds")


In [None]:
print("\n3.6 Decision Tree with GridSearchCV")
print("-" * 30)

start_time = time.time()

dt_param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None]
}

dt_base = DecisionTreeClassifier(random_state=42)

dt_grid = GridSearchCV(
    estimator=dt_base,
    param_grid=dt_param_grid,
    cv=cv_folds,
    scoring=scoring,
    n_jobs=-1,
    verbose=1
)

dt_grid.fit(X_train_balanced, y_train_balanced)
dt_best = dt_grid.best_estimator_


dt_pred = dt_best.predict(X_test_scaled)
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_f1 = f1_score(y_test, dt_pred, average='weighted')

dt_time = time.time() - start_time

cv_results['Decision Tree'] = {
    'best_params': dt_grid.best_params_,
    'best_cv_score': dt_grid.best_score_,
    'test_accuracy': dt_accuracy,
    'test_f1': dt_f1,
    'training_time': dt_time,
    'predictions': dt_pred
}
best_models['Decision Tree'] = dt_best

print(f"Decision Tree Best Parameters: {dt_grid.best_params_}")
print(f"Decision Tree Best CV Score: {dt_grid.best_score_:.4f}")
print(f"Decision Tree Test Accuracy: {dt_accuracy:.4f}")
print(f"Decision Tree Test F1-Score: {dt_f1:.4f}")
print(f"Decision Tree Training Time: {dt_time:.2f} seconds")

In [None]:
print("\n3.7 Neural Network with Manual Hyperparameter Search")
print("-" * 30)

start_time = time.time()

# Define function to create neural network
def create_nn_model(neurons_1=128, neurons_2=64, neurons_3=32, dropout_1=0.3,
                   dropout_2=0.3, dropout_3=0.2, learning_rate=0.001):
    model = Sequential([
        Dense(neurons_1, activation='relu', input_shape=(X_train_balanced.shape[1],)),
        BatchNormalization(),
        Dropout(dropout_1),

        Dense(neurons_2, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_2),

        Dense(neurons_3, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_3),

        Dense(n_classes, activation='softmax')
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model


y_train_balanced_categorical = to_categorical(y_train_balanced, num_classes=n_classes)
y_test_categorical = to_categorical(y_test, num_classes=n_classes)


nn_configs = [
    {'neurons_1': 64, 'neurons_2': 32, 'neurons_3': 16, 'dropout_1': 0.2, 'dropout_2': 0.2, 'dropout_3': 0.1, 'learning_rate': 0.001},
    {'neurons_1': 128, 'neurons_2': 64, 'neurons_3': 32, 'dropout_1': 0.3, 'dropout_2': 0.3, 'dropout_3': 0.2, 'learning_rate': 0.001},
    {'neurons_1': 256, 'neurons_2': 128, 'neurons_3': 64, 'dropout_1': 0.4, 'dropout_2': 0.4, 'dropout_3': 0.3, 'learning_rate': 0.0005},
    {'neurons_1': 128, 'neurons_2': 64, 'neurons_3': 32, 'dropout_1': 0.2, 'dropout_2': 0.2, 'dropout_3': 0.1, 'learning_rate': 0.01},
]

best_nn_score = 0
best_nn_config = None
best_nn_model = None

for i, config in enumerate(nn_configs):
    print(f"Testing NN Configuration {i+1}/{len(nn_configs)}: {config}")


    model = create_nn_model(**config)

    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.0001)

    history = model.fit(
        X_train_balanced, y_train_balanced_categorical,
        validation_split=0.2,
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr],
        verbose=0
    )


    pred_proba = model.predict(X_test_scaled, verbose=0)
    pred = np.argmax(pred_proba, axis=1)
    f1 = f1_score(y_test, pred, average='weighted')

    print(f"Configuration {i+1} F1-Score: {f1:.4f}")

    if f1 > best_nn_score:
        best_nn_score = f1
        best_nn_config = config
        best_nn_model = model

nn_pred = np.argmax(best_nn_model.predict(X_test_scaled, verbose=0), axis=1)
nn_accuracy = accuracy_score(y_test, nn_pred)
nn_f1 = f1_score(y_test, nn_pred, average='weighted')

nn_time = time.time() - start_time

cv_results['Neural Network'] = {
    'best_params': best_nn_config,
    'best_cv_score': best_nn_score,
    'test_accuracy': nn_accuracy,
    'test_f1': nn_f1,
    'training_time': nn_time,
    'predictions': nn_pred
}
best_models['Neural Network'] = best_nn_model

print(f"Neural Network Best Configuration: {best_nn_config}")
print(f"Neural Network Best Score: {best_nn_score:.4f}")
print(f"Neural Network Test Accuracy: {nn_accuracy:.4f}")
print(f"Neural Network Test F1-Score: {nn_f1:.4f}")
print(f"Neural Network Training Time: {nn_time:.2f} seconds")

In [None]:
print("\n4. COMPREHENSIVE RESULTS ANALYSIS")
print("-" * 40)


detailed_comparison = pd.DataFrame({
    'Model': list(cv_results.keys()),
    'Best_CV_Score': [cv_results[model]['best_cv_score'] for model in cv_results.keys()],
    'Test_Accuracy': [cv_results[model]['test_accuracy'] for model in cv_results.keys()],
    'Test_F1_Score': [cv_results[model]['test_f1'] for model in cv_results.keys()],
    'Training_Time': [cv_results[model]['training_time'] for model in cv_results.keys()]
})


detailed_comparison = detailed_comparison.sort_values('Test_F1_Score', ascending=False)
print("DETAILED MODEL COMPARISON:")
print("=" * 60)
print(detailed_comparison.to_string(index=False, float_format='%.4f'))


best_model_name = detailed_comparison.iloc[0]['Model']
best_model_f1 = detailed_comparison.iloc[0]['Test_F1_Score']
best_model_accuracy = detailed_comparison.iloc[0]['Test_Accuracy']

print(f"\n OVERALL BEST MODEL: {best_model_name}")
print(f" Best Test F1-Score: {best_model_f1:.4f}")
print(f" Best Test Accuracy: {best_model_accuracy:.4f}")


print(f"\n IMPROVEMENT ANALYSIS:")
print("-" * 30)
for model in cv_results.keys():
    print(f"{model}:")
    print(f"  Best Parameters: {cv_results[model]['best_params']}")
    print(f"  CV Score: {cv_results[model]['best_cv_score']:.4f}")
    print(f"  Test F1: {cv_results[model]['test_f1']:.4f}")
    print(f"  Training Time: {cv_results[model]['training_time']:.2f}s")
    print()


print(f"\n DETAILED CLASSIFICATION REPORT - {best_model_name}:")
print("-" * 50)
best_predictions = cv_results[best_model_name]['predictions']
print(classification_report(y_test, best_predictions, target_names=le.classes_))


In [None]:
plt.figure(figsize=(20, 15))


plt.subplot(3, 3, 1)
models = detailed_comparison['Model']
test_f1_scores = detailed_comparison['Test_F1_Score']
colors = ['gold' if model == best_model_name else 'lightblue' for model in models]

bars = plt.bar(models, test_f1_scores, color=colors)
plt.title('Model Performance Comparison (Test F1-Score)', fontsize=14, fontweight='bold')
plt.xlabel('Models')
plt.ylabel('F1-Score')
plt.xticks(rotation=45)
plt.ylim(0, 1)


for i, v in enumerate(test_f1_scores):
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom', fontweight='bold')


plt.subplot(3, 3, 2)
training_times = detailed_comparison['Training_Time']
plt.bar(models, training_times, color='lightcoral')
plt.title('Training Time Comparison', fontsize=14, fontweight='bold')
plt.xlabel('Models')
plt.ylabel('Time (seconds)')
plt.xticks(rotation=45)


plt.subplot(3, 3, 3)
cv_scores = detailed_comparison['Best_CV_Score']
plt.scatter(cv_scores, test_f1_scores, s=100, alpha=0.7)
for i, model in enumerate(models):
    plt.annotate(model, (cv_scores.iloc[i], test_f1_scores.iloc[i]),
                xytext=(5, 5), textcoords='offset points', fontsize=8)
plt.plot([0, 1], [0, 1], 'r--', alpha=0.5)
plt.title('CV Score vs Test Score', fontsize=14, fontweight='bold')
plt.xlabel('Best CV Score')
plt.ylabel('Test F1-Score')


plt.subplot(3, 3, 4)
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
plt.xlabel('Predicted')
plt.ylabel('Actual')


if best_model_name in ['XGBoost', 'Random Forest', 'Decision Tree']:
    plt.subplot(3, 3, 5)
    feature_importance = best_models[best_model_name].feature_importances_
    feature_names = physical_features


    indices = np.argsort(feature_importance)[::-1]

    plt.bar(range(len(feature_importance)), feature_importance[indices])
    plt.title(f'Feature Importance - {best_model_name}', fontsize=14, fontweight='bold')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.xticks(range(len(feature_importance)),
               [feature_names[i] for i in indices], rotation=45)


plt.subplot(3, 3, 6)

complexity_estimates = {
    'Logistic Regression': 1,
    'KNN': 2,
    'Decision Tree': 3,
    'SVM': 4,
    'Random Forest': 5,
    'XGBoost': 6,
    'Neural Network': 7
}

complexities = [complexity_estimates.get(model, 0) for model in models]
plt.scatter(complexities, test_f1_scores, s=100, alpha=0.7, c=colors)
for i, model in enumerate(models):
    plt.annotate(model, (complexities[i], test_f1_scores.iloc[i]),
                xytext=(5, 5), textcoords='offset points', fontsize=8)
plt.title('Model Complexity vs Performance', fontsize=14, fontweight='bold')
plt.xlabel('Model Complexity (Estimated)')
plt.ylabel('Test F1-Score')


plt.subplot(3, 3, 7)
test_accuracies = detailed_comparison['Test_Accuracy']
plt.scatter(test_accuracies, test_f1_scores, s=100, alpha=0.7, c=colors)
for i, model in enumerate(models):
    plt.annotate(model, (test_accuracies.iloc[i], test_f1_scores.iloc[i]),
                xytext=(5, 5), textcoords='offset points', fontsize=8)
plt.plot([0, 1], [0, 1], 'r--', alpha=0.5)
plt.title('Accuracy vs F1-Score', fontsize=14, fontweight='bold')
plt.xlabel('Test Accuracy')
plt.ylabel('Test F1-Score')


plt.subplot(3, 3, 8)
plt.scatter(training_times, test_f1_scores, s=100, alpha=0.7, c=colors)
for i, model in enumerate(models):
    plt.annotate(model, (training_times.iloc[i], test_f1_scores.iloc[i]),
                xytext=(5, 5), textcoords='offset points', fontsize=8)
plt.title('Training Time vs Performance', fontsize=14, fontweight='bold')
plt.xlabel('Training Time (seconds)')
plt.ylabel('Test F1-Score')
plt.xscale('log')


plt.subplot(3, 3, 9)

performance_matrix = np.array([test_f1_scores]).T
sns.heatmap(performance_matrix, annot=True, fmt='.3f', cmap='RdYlGn',
            yticklabels=models, xticklabels=['F1-Score'], cbar_kws={'label': 'Score'})
plt.title('Performance Heatmap', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
print("\n5. HYPERPARAMETER ANALYSIS")
print("-" * 40)

print("🔍 KEY INSIGHTS FROM HYPERPARAMETER TUNING:")
print("=" * 50)


for model_name in cv_results.keys():
    if model_name == 'Neural Network':
        continue

    best_params = cv_results[model_name]['best_params']
    print(f"\n{model_name.upper()}:")
    print("-" * len(model_name))

    if model_name == 'XGBoost':
        print(f"• Optimal n_estimators: {best_params['n_estimators']} (complexity vs performance)")
        print(f"• Optimal max_depth: {best_params['max_depth']} (overfitting control)")
        print(f"• Optimal learning_rate: {best_params['learning_rate']} (convergence speed)")
        print(f"• Optimal subsample: {best_params['subsample']} (regularization)")

    elif model_name == 'Random Forest':
        print(f"• Optimal n_estimators: {best_params['n_estimators']} (ensemble size)")
        print(f"• Optimal max_depth: {best_params.get('max_depth', 'None')} (tree complexity)")
        print(f"• Optimal min_samples_split: {best_params['min_samples_split']} (split control)")
        print(f"• Optimal max_features: {best_params['max_features']} (feature randomness)")

    elif model_name == 'SVM':
        print(f"• Optimal C: {best_params['C']} (regularization strength)")
        print(f"• Optimal gamma: {best_params['gamma']} (kernel coefficient)")
        print(f"• Optimal kernel: {best_params['kernel']} (decision boundary)")

    elif model_name == 'KNN':
        print(f"• Optimal n_neighbors: {best_params['n_neighbors']} (local smoothness)")
        print(f"• Optimal weights: {best_params['weights']} (neighbor weighting)")
        print(f"• Optimal metric: {best_params['metric']} (distance measure)")

In [None]:
print(f"\n6. CROSS-VALIDATION INSIGHTS")
print("-" * 40)

print("📈 CV SCORE vs TEST SCORE ANALYSIS:")
print("=" * 40)

for model_name in cv_results.keys():
    cv_score = cv_results[model_name]['best_cv_score']
    test_score = cv_results[model_name]['test_f1']
    difference = abs(cv_score - test_score)

    if difference < 0.02:
        stability = "🟢 Excellent"
    elif difference < 0.05:
        stability = "🟡 Good"
    else:
        stability = "🔴 Concerning"

    print(f"{model_name}:")
    print(f"  CV Score: {cv_score:.4f}")
    print(f"  Test Score: {test_score:.4f}")
    print(f"  Difference: {difference:.4f} ({stability})")
    print()

In [None]:
print(f"\n8. SAVING ENHANCED RESULTS")
print("-" * 40)


detailed_comparison.to_csv('enhanced_model_comparison_results.csv', index=False)
print("Enhanced model comparison saved to 'enhanced_model_comparison_results.csv'")


hyperparams_df = pd.DataFrame([
    {'Model': model, 'Best_Parameters': str(cv_results[model]['best_params'])}
    for model in cv_results.keys()
])
hyperparams_df.to_csv('best_hyperparameters.csv', index=False)
print("Best hyperparameters saved to 'best_hyperparameters.csv'")

# Save best model
import joblib
if best_model_name not in ['Neural Network']:
    joblib.dump(best_models[best_model_name], f'best_tuned_model_{best_model_name.lower().replace(" ", "_")}.pkl')
    joblib.dump(scaler, 'enhanced_feature_scaler.pkl')
    joblib.dump(le, 'enhanced_label_encoder.pkl')
    print(f"Best tuned model saved as 'best_tuned_model_{best_model_name.lower().replace(' ', '_')}.pkl'")

print(f"\nENHANCED CLOUD CLASSIFICATION PIPELINE COMPLETE!")
print("=" * 60)
print(f"Best Model: {best_model_name}")
print(f"Best F1-Score: {best_model_f1:.4f}")
print(f"Performance Improvement: Achieved through systematic hyperparameter optimization")
print("Ready for production deployment with optimized parameters!")
print("=" * 60)