In [1]:
# --- 1. Load Processed Data ---
import numpy as np
import os
import time
import pandas as pd
import joblib

print("Step 1: Loading processed data...")
results_dir = '../results'
X_train_pca = np.load(os.path.join(results_dir, 'X_train_pca.npy'))
X_test_pca = np.load(os.path.join(results_dir, 'X_test_pca.npy'))
y_train = np.load(os.path.join(results_dir, 'y_train.npy'))
y_test = np.load(os.path.join(results_dir, 'y_test.npy'))
class_names = np.load(os.path.join(results_dir, 'class_names.npy'))

print("Data loaded successfully.")
print(f"Shape of X_train_pca: {X_train_pca.shape}")
print(f"Shape of y_train: {y_train.shape}")


# --- 2. Model Training and Evaluation ---
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

print("\nStep 2: Training and Evaluating Models...")

# (This section is the same as the previous 'Part 4' code)
# I have pasted it here for completeness.

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}
results = {}
for name, model in models.items():
    print(f"--- Training {name} ---")
    start_time = time.time()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[name] = {"Accuracy": accuracy, "F1 Score": f1}
    end_time = time.time()
    print(f"{name} trained in {end_time - start_time:.2f} seconds.")
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}\n")

print("--- Training MLP ---")
start_time = time.time()
y_train_cat = to_categorical(y_train, num_classes=len(class_names))
y_test_cat = to_categorical(y_test, num_classes=len(class_names))
mlp = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_pca.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(class_names), activation='softmax')
])
mlp.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = mlp.fit(X_train_pca, y_train_cat, epochs=30, batch_size=128, validation_split=0.1, verbose=0)
y_pred_probs = mlp.predict(X_test_pca)
y_pred_mlp = np.argmax(y_pred_probs, axis=1)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
f1_mlp = f1_score(y_test, y_pred_mlp, average='weighted')
results["MLP"] = {"Accuracy": accuracy_mlp, "F1 Score": f1_mlp}
end_time = time.time()
print(f"MLP trained in {end_time - start_time:.2f} seconds.")
print(f"Accuracy: {accuracy_mlp:.4f}, F1 Score: {f1_mlp:.4f}\n")

results_df = pd.DataFrame(results).T.sort_values(by="F1 Score", ascending=False)
print("--- Final Model Performance Comparison ---")
print(results_df)

results_df.to_csv(os.path.join(results_dir, 'model_performance.csv'))
print(f"\nResults saved to '{os.path.join(results_dir, 'model_performance.csv')}'")


# --- 3. Identify and Save Champion Model ---
print("\nStep 3: Identifying and saving the champion model...")

# Get the name of the best model from our results dataframe
champion_model_name = results_df.index[0]
print(f"Champion model is: {champion_model_name}")

# Re-initialize the champion model (if it's not the MLP)
if champion_model_name != "MLP":
    champion_model = models[champion_model_name]
    # Re-train it on the full training data
    champion_model.fit(X_train_pca, y_train)
    # Save the scikit-learn model
    joblib.dump(champion_model, '../models/champion_model.joblib')
    print("Champion model (scikit-learn) saved to '../models/champion_model.joblib'")
else:
    # Save the Keras (MLP) model
    mlp.save('../models/champion_model.h5')
    print("Champion model (MLP) saved to '../models/champion_model.h5'")


Step 1: Loading processed data...
Data loaded successfully.
Shape of X_train_pca: (21600, 50)
Shape of y_train: (21600,)

Step 2: Training and Evaluating Models...
--- Training Logistic Regression ---


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression trained in 16.07 seconds.
Accuracy: 0.6419, F1 Score: 0.6313

--- Training SVM ---
SVM trained in 23.89 seconds.
Accuracy: 0.7759, F1 Score: 0.7727

--- Training Random Forest ---
Random Forest trained in 49.91 seconds.
Accuracy: 0.8048, F1 Score: 0.7998

--- Training Gradient Boosting ---
Gradient Boosting trained in 1164.00 seconds.
Accuracy: 0.7644, F1 Score: 0.7608

--- Training MLP ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
MLP trained in 20.83 seconds.
Accuracy: 0.2620, F1 Score: 0.1842

--- Final Model Performance Comparison ---
                     Accuracy  F1 Score
Random Forest        0.804815  0.799797
SVM                  0.775926  0.772716
Gradient Boosting    0.764444  0.760782
Logistic Regression  0.641852  0.631258
MLP                  0.262037  0.184228

Results saved to '../results\model_performance.csv'

Step 3: Identifying and saving the champion model...
Champion model is: Random Forest
Champion model (scikit-learn) saved to '../models/champion_model.joblib'
