In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from scipy.stats import loguniform
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV, cross_val_predict
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier

### Read Dataset

In [2]:
# Read the Data Set
path = '/Users/lks/Desktop/UM/Project Report/01_LBNL_FDD_Even/df_faults_even.xlsx'
path2 = '/Users/lks/Desktop/UM/Project Report/01_LBNL_FDD_Even/df_faults_woF.xlsx'

DF_faulty = pd.read_excel(path, index_col=0) # Engineered Feature Set
DF_faulty2 = pd.read_excel(path2, index_col=0) # Original Feature Set


df_filtered = DF_faulty2 # Switch to DF_faulty2 if needed

In [3]:
# Prepare the data
X = df_filtered.drop(columns=['Datetime', 'Fault'])
y = df_filtered['Fault']

# Encode the target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)


### Multiple Classifiers with Cross-Validation

In [4]:
# Define models
models = {
    # Statistical Method
    "LR": LogisticRegression(random_state=42),
    "Ridge": RidgeClassifier(random_state=42),
    # Other Algorithms
    "KNN": KNeighborsClassifier(n_jobs=-1),
    "SVM": SVC(random_state=42),
    "SGD": SGDClassifier(random_state=42),
    # Neural Network-Based Model
    "MLP": MLPClassifier(random_state=42),
    # Tree-Based Model
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42, n_jobs=-1),
    "GB": GradientBoostingClassifier(random_state=42),
    "XGB": XGBClassifier(random_state=42, n_jobs=-1),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0),
    "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop through the models
f1_scores = {}
for model_name, model in models.items():
    # Create a pipeline with scaling and classifier
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])
    
    # Cross-validated predictions
    y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=cv, n_jobs=-1)
    
    # Decode predictions and true labels
    y_pred_decoded = label_encoder.inverse_transform(y_pred_cv)
    y_true_decoded = label_encoder.inverse_transform(y_train)
    
    # Classification Report
    report = classification_report(y_true_decoded, y_pred_decoded)
    print(f"{model_name} Classification Report (CV on Training Set):\n{report}\n")
    
    # F1 score (macro average)
    f1_score_value = f1_score(y_true_decoded, y_pred_decoded, average='macro')
    f1_scores[model_name] = f1_score_value
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(y_true_decoded, y_pred_decoded)
    target_names = label_encoder.classes_
    conf_matrix_df = pd.DataFrame(conf_matrix, index=target_names, columns=target_names)
    print(f"{model_name} Confusion Matrix (CV on Training Set):\n{conf_matrix_df}\n")

# Print F1 scores
print("F1 Scores (Cross-Validation on Training Set):")
for model_name, score in f1_scores.items():
    print(f"{model_name}: {score:.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR Classification Report (CV on Training Set):
              precision    recall  f1-score   support

         BPL       0.58      0.30      0.40      7008
         BPS       0.57      0.70      0.63      7008
         CHS       0.58      0.75      0.65      7008
         CPP       0.39      0.53      0.45      7008
         CTF       0.45      0.54      0.49      7008
       CTPID       0.36      0.28      0.32      7007
         CTS       0.42      0.28      0.33      7008
           N       0.33      0.32      0.32      7007

    accuracy                           0.46     56062
   macro avg       0.46      0.46      0.45     56062
weighted avg       0.46      0.46      0.45     56062


LR Confusion Matrix (CV on Training Set):
        BPL   BPS   CHS   CPP   CTF  CTPID   CTS     N
BPL    2130  3350     5   564   348    133     7   471
BPS    1513  4871   233    55   206     50    80     0
CHS       0     0  5246   625   218    221   373   325
CPP       3    20   708  3736   485    



MLP Classification Report (CV on Training Set):
              precision    recall  f1-score   support

         BPL       0.60      0.38      0.47      7008
         BPS       0.62      0.75      0.68      7008
         CHS       0.99      0.99      0.99      7008
         CPP       0.75      0.78      0.76      7008
         CTF       0.82      0.82      0.82      7008
       CTPID       0.55      0.76      0.64      7007
         CTS       0.99      0.97      0.98      7008
           N       0.51      0.38      0.43      7007

    accuracy                           0.73     56062
   macro avg       0.73      0.73      0.72     56062
weighted avg       0.73      0.73      0.72     56062


MLP Confusion Matrix (CV on Training Set):
        BPL   BPS   CHS   CPP   CTF  CTPID   CTS     N
BPL    2674  3232     0    94    13    831     2   162
BPS    1737  5257     0     3     5      4     2     0
CHS       0     0  6933    18    15      7     6    29
CPP       7     0     7  5447   246  

  y = column_or_1d(y, warn=True)


CatBoost Classification Report (CV on Training Set):
              precision    recall  f1-score   support

         BPL       0.80      0.32      0.46      7008
         BPS       0.61      0.92      0.74      7008
         CHS       1.00      0.99      0.99      7008
         CPP       0.97      0.95      0.96      7008
         CTF       0.81      0.75      0.77      7008
       CTPID       0.69      0.59      0.64      7007
         CTS       0.99      0.96      0.97      7008
           N       0.51      0.69      0.59      7007

    accuracy                           0.77     56062
   macro avg       0.80      0.77      0.77     56062
weighted avg       0.80      0.77      0.77     56062


CatBoost Confusion Matrix (CV on Training Set):
        BPL   BPS   CHS   CPP   CTF  CTPID   CTS     N
BPL    2267  4047     0    23     5    261     0   405
BPS     549  6453     0     2     1      1     1     1
CHS       0     0  6935     3    22     11     6    31
CPP       0     0     8  66

In [6]:
# Baseline Model Classification Report Output!

models = {
    "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop through the models
f1_scores = {}
for model_name, model in models.items():
    # Create a pipeline with scaling and classifier
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])
    
    # Cross-validated predictions
    y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=cv, n_jobs=-1)
    
    # Decode predictions and true labels
    y_pred_decoded = label_encoder.inverse_transform(y_pred_cv)
    y_true_decoded = label_encoder.inverse_transform(y_train)
    
    # Classification Report
    report = classification_report(y_true_decoded, y_pred_decoded)
    print(f"{model_name} Classification Report (CV on Training Set):\n{report}\n")
    
    # F1 score (macro average)
    f1_score_value = f1_score(y_true_decoded, y_pred_decoded, average='macro')
    f1_scores[model_name] = f1_score_value
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(y_true_decoded, y_pred_decoded)
    target_names = label_encoder.classes_
    conf_matrix_df = pd.DataFrame(conf_matrix, index=target_names, columns=target_names)
    print(f"{model_name} Confusion Matrix (CV on Training Set):\n{conf_matrix_df}\n")

# Print F1 scores
print("F1 Scores (Cross-Validation on Training Set):")
for model_name, score in f1_scores.items():
    print(f"{model_name}: {score:.4f}")

# Convert classification report to DataFrame
report_dict = classification_report(y_true_decoded, y_pred_decoded, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()

# Save to Excel
with pd.ExcelWriter("classification_report_Baseline(CV).xlsx") as writer:
    report_df.to_excel(writer, sheet_name="Classification Report")
    conf_matrix_df.to_excel(writer, sheet_name="Confusion Matrix")

print("Saved!")

LightGBM Classification Report (CV on Training Set):
              precision    recall  f1-score   support

         BPL       0.76      0.35      0.48      7008
         BPS       0.61      0.90      0.73      7008
         CHS       1.00      0.99      1.00      7008
         CPP       0.99      0.98      0.99      7008
         CTF       0.81      0.80      0.81      7008
       CTPID       0.76      0.62      0.68      7007
         CTS       0.99      0.96      0.97      7008
           N       0.55      0.72      0.62      7007

    accuracy                           0.79     56062
   macro avg       0.81      0.79      0.78     56062
weighted avg       0.81      0.79      0.78     56062


LightGBM Confusion Matrix (CV on Training Set):
        BPL   BPS   CHS   CPP   CTF  CTPID   CTS     N
BPL    2482  3966     0     2     2    191     0   365
BPS     711  6292     0     1     0      3     0     1
CHS       0     0  6968     1    11      1     0    27
CPP       2     0     0  68

### Hyper-Parameter for Optimal Baseline Model

In [None]:
lgbm_model = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1)

# pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', lgbm_model)
])

param_grid = {
    'model__num_leaves': [31, 50, 100],  # Default: 31
    'model__learning_rate': [0.1, 0.2],  # Default: 0.1
    'model__n_estimators': [100, 200, 300],  # Default: 100
    'model__min_child_samples': [10, 20, 30],  # Default: 20
    'model__subsample': [0.6, 0.8, 1.0],  # Default: 1.0
    'model__colsample_bytree': [0.6, 0.8, 1.0],  # Default: 1.0
    'model__reg_alpha': [0.0, 0.3],  # Default: 0.0
    'model__reg_lambda': [0.0, 0.3],  # Default: 0.0
}

# StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV with pipeline
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1_macro', cv=skf, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

# GridSearch Results in Descending Order
cv_results = grid_search.cv_results_
cv_results_df = pd.DataFrame(cv_results)
cv_results_df_sorted = cv_results_df.sort_values(by='mean_test_score', ascending=False)

cv_results_df_sorted

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits
Best Parameters: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.1, 'model__min_child_samples': 20, 'model__n_estimators': 100, 'model__num_leaves': 31, 'model__reg_alpha': 0.0, 'model__reg_lambda': 0.0, 'model__subsample': 0.6}
Best Cross-Validation Score: 0.793907369770017


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__colsample_bytree,param_model__learning_rate,param_model__min_child_samples,param_model__n_estimators,param_model__num_leaves,param_model__reg_alpha,...,param_model__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1404,30.352405,1.452365,0.309054,0.008984,1.0,0.1,20,100,31,0.0,...,0.6,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785121,0.795698,0.792991,0.795729,0.799997,0.793907,0.004933,1
1406,30.352870,0.958223,0.309782,0.006166,1.0,0.1,20,100,31,0.0,...,1.0,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785121,0.795698,0.792991,0.795729,0.799997,0.793907,0.004933,1
1405,30.491282,1.078479,0.313603,0.002032,1.0,0.1,20,100,31,0.0,...,0.8,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785121,0.795698,0.792991,0.795729,0.799997,0.793907,0.004933,1
1306,31.192203,0.272369,0.306379,0.009647,1.0,0.1,10,100,31,0.3,...,0.8,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785718,0.795699,0.794687,0.795744,0.796635,0.793697,0.004037,4
1307,30.776328,0.670176,0.298626,0.007173,1.0,0.1,10,100,31,0.3,...,1.0,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785718,0.795699,0.794687,0.795744,0.796635,0.793697,0.004037,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1287,124.280025,3.731716,2.832744,0.035685,0.8,0.2,30,300,100,0.0,...,0.6,"{'model__colsample_bytree': 0.8, 'model__learn...",0.770101,0.770286,0.773962,0.776778,0.774352,0.773096,0.002559,1939
1289,125.214385,2.718746,2.791427,0.016798,0.8,0.2,30,300,100,0.0,...,1.0,"{'model__colsample_bytree': 0.8, 'model__learn...",0.770101,0.770286,0.773962,0.776778,0.774352,0.773096,0.002559,1939
1285,115.348016,3.764505,3.531481,1.438094,0.8,0.2,30,300,100,0.0,...,0.8,"{'model__colsample_bytree': 0.8, 'model__learn...",0.771811,0.768889,0.774200,0.776798,0.773344,0.773008,0.002619,1942
1286,114.455811,2.763382,2.328521,0.010040,0.8,0.2,30,300,100,0.0,...,1.0,"{'model__colsample_bytree': 0.8, 'model__learn...",0.771811,0.768889,0.774200,0.776798,0.773344,0.773008,0.002619,1942


### Tuned Optimal Baseline Model for Testing Dataset

In [None]:
# model with hyper-parameter
paras = {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.6}
lgbm_model = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1, **paras)

# pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler for feature scaling
    ('model', lgbm_model)           # LGBMClassifier model
])

# Train on training data
pipeline.fit(X_train, y_train)

# Evaluation on testing dataaet
y_test_pred = pipeline.predict(X_test)

# Classification Report
y_test_pred_decoded = label_encoder.inverse_transform(y_test_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)
report = classification_report(y_test_decoded, y_test_pred_decoded, target_names=label_encoder.classes_, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(f"Classification Report:\n{report_df}\n")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test_decoded, y_test_pred_decoded)
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
print(f"Confusion Matrix:\n{conf_matrix_df}\n")

Classification Report:
              precision    recall  f1-score       support
BPL            0.743652  0.351027  0.476929   1752.000000
BPS            0.605645  0.881849  0.718104   1752.000000
CHS            0.997714  0.996575  0.997144   1752.000000
CPP            0.987371  0.981735  0.984545   1752.000000
CTF            0.859514  0.827626  0.843268   1752.000000
CTPID          0.791170  0.644406  0.710286   1752.000000
CTS            0.987356  0.980594  0.983963   1752.000000
N              0.562827  0.736301  0.637982   1752.000000
accuracy       0.800014  0.800014  0.800014      0.800014
macro avg      0.816906  0.800014  0.794028  14016.000000
weighted avg   0.816906  0.800014  0.794028  14016.000000

Confusion Matrix:
       BPL   BPS   CHS   CPP   CTF  CTPID   CTS     N
BPL    615  1003     0     1     1     49     1    82
BPS    206  1545     0     0     0      0     1     0
CHS      0     0  1746     0     0      1     0     5
CPP      0     0     0  1720     2      1     

### Saving Reports

In [6]:
with pd.ExcelWriter("classification_report_Baseline.xlsx") as writer:
    report_df.to_excel(writer, sheet_name="Classification Report")
    conf_matrix_df.to_excel(writer, sheet_name="Confusion Matrix")

print("Seved!")

Seved!
