In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import GridSearchCV, train_test_split
from joblib import load, dump
import matplotlib.pyplot as plt
import os
import warnings
# Silence only deprecation-style notices. A blanket 'ignore' would also hide
# every other warning category raised while the models below are fitted and
# evaluated, which is more than "deprecation warnings" calls for.
for _category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_category)

# Input (preprocessed arrays) and output (fitted models) locations
processed_data_path = '../data/processed/'
models_path = '../models/'

# Models are dumped into this folder further down, so make sure it exists.
os.makedirs(models_path, exist_ok=True)


def _load_split(filename):
    """Load one .npy array from the processed-data directory."""
    return np.load(os.path.join(processed_data_path, filename))


# Train/test features and labels produced by the preprocessing step
X_train = _load_split('X_train.npy')
X_test = _load_split('X_test.npy')
y_train = _load_split('y_train.npy')
y_test = _load_split('y_test.npy')

# Quick sanity check on the loaded array shapes
print("Shapes - X_train:", X_train.shape, "X_test:", X_test.shape)
print("Shapes - y_train:", y_train.shape, "y_test:", y_test.shape)

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
if hasattr(X_train_res, 'toarray'):
    # A sparse result is converted to a dense array for the models below.
    X_train_res = X_train_res.toarray()
# Labels are cast to float32 (original note: "ensure picklable type").
# NOTE(review): every estimator below is therefore fitted on float labels.
y_train_res = y_train_res.astype(np.float32)
print("Shapes after SMOTE - X_train_res:", X_train_res.shape, "y_train_res:", y_train_res.shape)

# Random Forest hyperparameter search.
# The grid is evaluated on a 10% sample of the resampled training data, and
# the winning configuration is then refitted on all of it and saved.
# NOTE(review): the sample is drawn from the SMOTE output, so the CV folds
# contain synthetic rows -- scores may be optimistic; confirm this is intended.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
}
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

X_train_sub, _, y_train_sub, _ = train_test_split(
    X_train_res,
    y_train_res,
    train_size=0.1,
    random_state=42,
)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=1,
)
grid_search.fit(X_train_sub, y_train_sub)
print("Best parameters:", grid_search.best_params_)

# best_estimator_ was fitted on the 10% sample only; refit it on everything.
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X_train_res, y_train_res)
tuned_rf_file = os.path.join(models_path, 'random_forest_tuned.pkl')
dump(best_rf_model, tuned_rf_file)

# Fit and persist the two fixed-configuration models on the resampled data.
# The Random Forest here uses hard-coded hyperparameters (200 trees, no depth
# limit); they are not read from grid_search.best_params_.
log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
)

for estimator, filename in (
    (log_reg, 'logistic_regression_smote.pkl'),
    (rf_model, 'random_forest_smote_tuned.pkl'),
):
    estimator.fit(X_train_res, y_train_res)
    dump(estimator, os.path.join(models_path, filename))

# Prepare data for Neural Network (one-hot encode y for multi-class)
y_train_cat = to_categorical(y_train_res, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

# Hold out a shuffled, stratified 20% of the resampled data for validation.
# Keras' `validation_split` takes the *last* fraction of the arrays before any
# shuffling, and SMOTE appends its synthetic rows after the original ones, so
# `validation_split=0.2` would validate on synthetic samples only and train on
# a class-skewed remainder. An explicit split avoids both problems.
X_nn_train, X_nn_val, y_nn_train, y_nn_val = train_test_split(
    X_train_res,
    y_train_cat,
    test_size=0.2,
    random_state=42,
    stratify=y_train_res,
)

# Build and train Neural Network: two hidden ReLU layers, softmax over 3 classes
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_res.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # 3 classes
])
nn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
nn_model.fit(
    X_nn_train,
    y_nn_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_nn_val, y_nn_val),
    verbose=1,
)
nn_model.save(os.path.join(models_path, 'neural_net.h5'))

# Evaluate all scikit-learn models on the untouched test split:
# classification report, one-vs-rest ROC-AUC and a confusion-matrix plot each.
models = {'Logistic Regression': log_reg, 'Random Forest': rf_model, 'Tuned Random Forest': best_rf_model}
for name, model in models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    print(f"\nClassification Report for {name}:")
    print(classification_report(y_test, y_pred))

    # Use the same averaging as the Neural Network's ROC-AUC
    # (average='weighted'); roc_auc_score defaults to 'macro', which would make
    # the scores of the different models not directly comparable.
    auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')
    print(f"ROC-AUC Score for {name}: {auc:.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.show()

# Neural Network evaluation on the test split.
# predict() returns one probability per class; the arg-max column is the
# predicted label used for the report and the confusion matrix.
y_pred_nn = nn_model.predict(X_test)
y_pred_nn_classes = y_pred_nn.argmax(axis=1)

print("\nClassification Report for Neural Network:")
nn_report = classification_report(y_test, y_pred_nn_classes)
print(nn_report)

# One-vs-rest ROC-AUC on the class probabilities, weighted by class support
nn_auc = roc_auc_score(
    y_test,
    y_pred_nn,
    multi_class='ovr',
    average='weighted',
)
print(f"ROC-AUC Score for Neural Network: {nn_auc:.4f}")

# Confusion matrix plot
cm_nn = confusion_matrix(y_test, y_pred_nn_classes)
disp_nn = ConfusionMatrixDisplay(confusion_matrix=cm_nn)
disp_nn.plot(cmap='Blues')
plt.title('Confusion Matrix - Neural Network')
plt.show()

Shapes - X_train: (183112, 48) X_test: (78477, 48)
Shapes - y_train: (183112,) y_test: (78477,)
Shapes after SMOTE - X_train_res: (455199, 48) y_train_res: (455199,)
