In [5]:
# 📦 Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# 📢 Ignore only harmless CPU warning
import warnings
warnings.filterwarnings('ignore', message='Could not find the number of physical cores')

# 📂 Load datasets
df_jm1 = pd.read_csv('../data/JM1.csv')
df_kc1 = pd.read_csv('../data/KC1.csv')

# 📋 Prepare datasets

# Prepare JM1
df_jm1['defects'] = df_jm1['defects'].astype(int)
X_jm1 = df_jm1.drop('defects', axis=1)
y_jm1 = df_jm1['defects']

scaler_jm1 = StandardScaler()
X_scaled_jm1 = scaler_jm1.fit_transform(X_jm1)

X_train_jm1, X_test_jm1, y_train_jm1, y_test_jm1 = train_test_split(
    X_scaled_jm1, y_jm1, test_size=0.2, stratify=y_jm1, random_state=42
)

smote = SMOTE(random_state=42)
X_train_smote_jm1, y_train_smote_jm1 = smote.fit_resample(X_train_jm1, y_train_jm1)

# Prepare KC1
df_kc1['defects'] = df_kc1['defects'].astype(int)
X_kc1 = df_kc1.drop('defects', axis=1)
y_kc1 = df_kc1['defects']

scaler_kc1 = StandardScaler()
X_scaled_kc1 = scaler_kc1.fit_transform(X_kc1)

X_train_kc1, X_test_kc1, y_train_kc1, y_test_kc1 = train_test_split(
    X_scaled_kc1, y_kc1, test_size=0.2, stratify=y_kc1, random_state=42
)

X_train_smote_kc1, y_train_smote_kc1 = smote.fit_resample(X_train_kc1, y_train_kc1)

print("✅ All datasets loaded, scaled, split, and SMOTE-balanced (without warnings)!")




In [9]:
# Define parameter grid for Random Forest
param_grid_rf_jm1 = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30]
}

# Create model
rf_jm1 = RandomForestClassifier(random_state=42)

# Grid Search
grid_search_rf_jm1 = GridSearchCV(estimator=rf_jm1, param_grid=param_grid_rf_jm1, 
                                  scoring='f1', cv=5, n_jobs=-1, verbose=2)

grid_search_rf_jm1.fit(X_train_smote_jm1, y_train_smote_jm1)

# Best parameters and score
print(f"Best Random Forest parameters for JM1: {grid_search_rf_jm1.best_params_}")
print(f"Best F1 score from tuning: {grid_search_rf_jm1.best_score_:.4f}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Random Forest parameters for JM1: {'max_depth': 30, 'n_estimators': 300}
Best F1 score from tuning: 0.9067


In [6]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30]
}

# Create model
rf = RandomForestClassifier(random_state=42)

# Grid Search
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, 
                              scoring='f1', cv=5, n_jobs=-1, verbose=2)

grid_search_rf.fit(X_train_smote_kc1, y_train_smote_kc1)

# Best parameters and score
print(f"Best Random Forest parameters for KC1: {grid_search_rf.best_params_}")
print(f"Best F1 score from tuning: {grid_search_rf.best_score_:.4f}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Random Forest parameters for KC1: {'max_depth': 20, 'n_estimators': 300}
Best F1 score from tuning: 0.9071


In [10]:
# Define parameter grid for ANN
param_grid_ann_jm1 = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150, 100, 50)],
    'activation': ['relu', 'tanh']
}

# Create model
ann_jm1 = MLPClassifier(max_iter=500, random_state=42)

# Grid Search
grid_search_ann_jm1 = GridSearchCV(estimator=ann_jm1, param_grid=param_grid_ann_jm1, 
                                   scoring='f1', cv=5, n_jobs=-1, verbose=2)

grid_search_ann_jm1.fit(X_train_smote_jm1, y_train_smote_jm1)

# Best parameters and score
print(f"Best ANN parameters for JM1: {grid_search_ann_jm1.best_params_}")
print(f"Best F1 score from tuning: {grid_search_ann_jm1.best_score_:.4f}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best ANN parameters for JM1: {'activation': 'tanh', 'hidden_layer_sizes': (150, 100, 50)}
Best F1 score from tuning: 0.8821


In [7]:
# Define parameter grid for ANN
param_grid_ann = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150, 100, 50)],
    'activation': ['relu', 'tanh']
}

# Create model
ann = MLPClassifier(max_iter=500, random_state=42)

# Grid Search
grid_search_ann = GridSearchCV(estimator=ann, param_grid=param_grid_ann, 
                               scoring='f1', cv=5, n_jobs=-1, verbose=2)

grid_search_ann.fit(X_train_smote_kc1, y_train_smote_kc1)

# Best parameters and score
print(f"Best ANN parameters for KC1: {grid_search_ann.best_params_}")
print(f"Best F1 score from tuning: {grid_search_ann.best_score_:.4f}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best ANN parameters for KC1: {'activation': 'relu', 'hidden_layer_sizes': (150, 100, 50)}
Best F1 score from tuning: 0.8947
