In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from scipy.stats import loguniform
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV, cross_val_predict
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier

### Read Dataset

In [2]:
# Read the Data Set
path = '/Users/lks/Desktop/UM/Project Report/01_LBNL_FDD_Even/df_faults_even.xlsx'
path2 = '/Users/lks/Desktop/UM/Project Report/01_LBNL_FDD_Even/df_faults_woF.xlsx'

DF_faulty = pd.read_excel(path, index_col=0) # Engineered Feature Set
DF_faulty2 = pd.read_excel(path2, index_col=0) # Original Feature Set


df_filtered = DF_faulty # Switch to DF_faulty2 if needed

In [4]:
# Prepare the data
X = df_filtered.drop(columns=['Datetime', 'Fault'])
y = df_filtered['Fault']

# Encode the target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)


### Multiple Classifiers with Cross-Validation

In [None]:
# Define models
models = {
    # Statistical Method
    "LR": LogisticRegression(random_state=42),
    "Ridge": RidgeClassifier(random_state=42),
    # Other Algorithms
    "KNN": KNeighborsClassifier(n_jobs=-1),
    "SVM": SVC(random_state=42),
    "SGD": SGDClassifier(random_state=42),
    # Neural Network-Based Model
    "MLP": MLPClassifier(random_state=42),
    # Tree-Based Model
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42, n_jobs=-1),
    "GB": GradientBoostingClassifier(random_state=42),
    "XGB": XGBClassifier(random_state=42, n_jobs=-1),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0),
    "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1),
}

# Loop through the models
f1_scores = {}
for model_name, model in models.items():
    # Create a pipeline that includes scaling and SMOTE
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)
    
    # Decode predictions back to original string classes
    y_pred_decoded = label_encoder.inverse_transform(y_pred)
    y_test_decoded = label_encoder.inverse_transform(y_test)
    
    # Classification Report
    report = classification_report(y_test_decoded, y_pred_decoded)
    print(f"{model_name} Classification Report:\n{report}\n")
    
    # Calculate and store F1-score
    f1_score_value = f1_score(y_test_decoded, y_pred_decoded, average='macro')
    f1_scores[model_name] = f1_score_value
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test_decoded, y_pred_decoded)
    target_names = label_encoder.classes_
    conf_matrix_df = pd.DataFrame(conf_matrix, index=target_names, columns=target_names)
    print(f"{model_name} Confusion Matrix:\n{conf_matrix_df}\n")

# Print F1 scores for each model
print("F1 Scores for each model:")
for model_name, score in f1_scores.items():
    print(f"{model_name}: {score:.4f}")

K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

         BPL       0.42      0.34      0.38      1752
         BPS       0.56      0.64      0.59      1752
         CHS       0.96      0.93      0.94      1752
         CPP       0.61      0.74      0.67      1752
         CTF       0.64      0.62      0.63      1752
       CTPID       0.47      0.56      0.51      1752
         CTS       0.93      0.72      0.81      1752
           N       0.29      0.26      0.27      1752

    accuracy                           0.60     14016
   macro avg       0.61      0.60      0.60     14016
weighted avg       0.61      0.60      0.60     14016


K-Nearest Neighbors Confusion Matrix:
       BPL   BPS   CHS   CPP   CTF  CTPID   CTS    N
BPL    604   894     0    23    12    153     0   66
BPS    623  1121     0     3     3      2     0    0
CHS      1     1  1623    55    26      4     3   39
CPP     50     0    17  1293    68    143    30  151
CT

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Gaussian Mixture Model Classification Report:
              precision    recall  f1-score   support

         BPL       0.12      1.00      0.22      1752
         BPS       0.00      0.00      0.00      1752
         CHS       0.00      0.00      0.00      1752
         CPP       0.00      0.00      0.00      1752
         CTF       0.00      0.00      0.00      1752
       CTPID       0.00      0.00      0.00      1752
         CTS       0.00      0.00      0.00      1752
           N       0.00      0.00      0.00      1752

    accuracy                           0.12     14016
   macro avg       0.02      0.12      0.03     14016
weighted avg       0.02      0.12      0.03     14016


Gaussian Mixture Model Confusion Matrix:
        BPL  BPS  CHS  CPP  CTF  CTPID  CTS  N
BPL    1752    0    0    0    0      0    0  0
BPS    1752    0    0    0    0      0    0  0
CHS    1752    0    0    0    0      0    0  0
CPP    1752    0    0    0    0      0    0  0
CTF    1752    0    0    0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Classification Report:
              precision    recall  f1-score   support

         BPL       0.56      0.32      0.40      1752
         BPS       0.57      0.70      0.63      1752
         CHS       0.50      0.74      0.60      1752
         CPP       0.43      0.52      0.47      1752
         CTF       0.50      0.59      0.54      1752
       CTPID       0.46      0.41      0.44      1752
         CTS       0.69      0.51      0.59      1752
           N       0.30      0.21      0.25      1752

    accuracy                           0.50     14016
   macro avg       0.50      0.50      0.49     14016
weighted avg       0.50      0.50      0.49     14016


Logistic Regression Confusion Matrix:
       BPL   BPS   CHS  CPP   CTF  CTPID  CTS    N
BPL    557   855    42  121    43     61    3   70
BPS    408  1220    41   14    13      5   51    0
CHS      0     0  1299  111   102     54   80  106
CPP      6    15   278  912   164    121   54  202
CTF      1  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


GNB Classification Report:
              precision    recall  f1-score   support

         BPL       0.00      0.00      0.00      1752
         BPS       0.18      1.00      0.31      1752
         CHS       0.25      0.18      0.21      1752
         CPP       0.36      0.16      0.22      1752
         CTF       0.20      0.03      0.05      1752
       CTPID       0.70      0.23      0.35      1752
         CTS       0.15      0.10      0.12      1752
           N       0.33      0.06      0.10      1752

    accuracy                           0.22     14016
   macro avg       0.27      0.22      0.17     14016
weighted avg       0.27      0.22      0.17     14016


GNB Confusion Matrix:
       BPL   BPS  CHS  CPP  CTF  CTPID  CTS    N
BPL      0  1725    0    0    0     27    0    0
BPS      0  1752    0    0    0      0    0    0
CHS      0  1021  324  138   28     24  208    9
CPP      0  1033  196  274   17     24  201    7
CTF      0  1011  254  103   50     16  232   86
CTPID

  y = column_or_1d(y, warn=True)


Cat Classification Report:
              precision    recall  f1-score   support

         BPL       0.77      0.32      0.45      1752
         BPS       0.61      0.91      0.73      1752
         CHS       0.99      0.99      0.99      1752
         CPP       0.97      0.96      0.97      1752
         CTF       0.83      0.76      0.80      1752
       CTPID       0.72      0.60      0.65      1752
         CTS       0.99      0.97      0.98      1752
           N       0.51      0.70      0.59      1752

    accuracy                           0.78     14016
   macro avg       0.80      0.78      0.77     14016
weighted avg       0.80      0.78      0.77     14016


Cat Confusion Matrix:
       BPL   BPS   CHS   CPP   CTF  CTPID   CTS     N
BPL    554  1024     0     9     2     73     0    90
BPS    159  1592     0     1     0      0     0     0
CHS      0     0  1732     4     6      0     0    10
CPP      0     0     0  1687     5     12     3    45
CTF      1     0     8     5 



MLP Classification Report:
              precision    recall  f1-score   support

         BPL       0.61      0.46      0.53      1752
         BPS       0.63      0.70      0.67      1752
         CHS       1.00      0.99      0.99      1752
         CPP       0.80      0.71      0.75      1752
         CTF       0.80      0.89      0.84      1752
       CTPID       0.58      0.84      0.69      1752
         CTS       1.00      0.99      0.99      1752
           N       0.52      0.36      0.43      1752

    accuracy                           0.74     14016
   macro avg       0.74      0.74      0.74     14016
weighted avg       0.74      0.74      0.74     14016


MLP Confusion Matrix:
       BPL   BPS   CHS   CPP   CTF  CTPID   CTS    N
BPL    807   723     0    30     6    176     0   10
BPS    514  1234     0     3     1      0     0    0
CHS      0     0  1741     6     0      0     0    5
CPP      0     0     2  1241   125    112     0  272
CTF      0     0     0    16  1552



AdaBoost Classification Report:
              precision    recall  f1-score   support

         BPL       0.41      0.72      0.52      1752
         BPS       0.60      0.18      0.28      1752
         CHS       0.32      0.76      0.45      1752
         CPP       0.33      0.01      0.01      1752
         CTF       0.34      0.13      0.19      1752
       CTPID       0.19      0.54      0.28      1752
         CTS       0.16      0.04      0.07      1752
           N       0.24      0.01      0.02      1752

    accuracy                           0.30     14016
   macro avg       0.32      0.30      0.23     14016
weighted avg       0.32      0.30      0.23     14016


AdaBoost Confusion Matrix:
        BPL  BPS   CHS  CPP  CTF  CTPID  CTS   N
BPL    1255  177     5    0   20    291    3   1
BPS    1410  318     0    0    0     21    2   1
CHS     398    0  1340    0    1      2    7   4
CPP       0    0   581    9  108    944   95  15
CTF       0    0   584    4  231    848   78

### Hyper-Parameter for Optimal Baseline Model

In [None]:
lgbm_model = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1)

# pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', lgbm_model)
])

param_grid = {
    'model__num_leaves': [31, 50, 100],  # Default: 31
    'model__learning_rate': [0.1, 0.2],  # Default: 0.1
    'model__n_estimators': [100, 200, 300],  # Default: 100
    'model__min_child_samples': [10, 20, 30],  # Default: 20
    'model__subsample': [0.6, 0.8, 1.0],  # Default: 1.0
    'model__colsample_bytree': [0.6, 0.8, 1.0],  # Default: 1.0
    'model__reg_alpha': [0.0, 0.3],  # Default: 0.0
    'model__reg_lambda': [0.0, 0.3],  # Default: 0.0
}

# StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV with pipeline
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1_macro', cv=skf, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

# GridSearch Results in Descending Order
cv_results = grid_search.cv_results_
cv_results_df = pd.DataFrame(cv_results)
cv_results_df_sorted = cv_results_df.sort_values(by='mean_test_score', ascending=False)

cv_results_df_sorted

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits
Best Parameters: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.1, 'model__min_child_samples': 20, 'model__n_estimators': 100, 'model__num_leaves': 31, 'model__reg_alpha': 0.0, 'model__reg_lambda': 0.0, 'model__subsample': 0.6}
Best Cross-Validation Score: 0.793907369770017


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__colsample_bytree,param_model__learning_rate,param_model__min_child_samples,param_model__n_estimators,param_model__num_leaves,param_model__reg_alpha,...,param_model__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1404,30.352405,1.452365,0.309054,0.008984,1.0,0.1,20,100,31,0.0,...,0.6,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785121,0.795698,0.792991,0.795729,0.799997,0.793907,0.004933,1
1406,30.352870,0.958223,0.309782,0.006166,1.0,0.1,20,100,31,0.0,...,1.0,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785121,0.795698,0.792991,0.795729,0.799997,0.793907,0.004933,1
1405,30.491282,1.078479,0.313603,0.002032,1.0,0.1,20,100,31,0.0,...,0.8,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785121,0.795698,0.792991,0.795729,0.799997,0.793907,0.004933,1
1306,31.192203,0.272369,0.306379,0.009647,1.0,0.1,10,100,31,0.3,...,0.8,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785718,0.795699,0.794687,0.795744,0.796635,0.793697,0.004037,4
1307,30.776328,0.670176,0.298626,0.007173,1.0,0.1,10,100,31,0.3,...,1.0,"{'model__colsample_bytree': 1.0, 'model__learn...",0.785718,0.795699,0.794687,0.795744,0.796635,0.793697,0.004037,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1287,124.280025,3.731716,2.832744,0.035685,0.8,0.2,30,300,100,0.0,...,0.6,"{'model__colsample_bytree': 0.8, 'model__learn...",0.770101,0.770286,0.773962,0.776778,0.774352,0.773096,0.002559,1939
1289,125.214385,2.718746,2.791427,0.016798,0.8,0.2,30,300,100,0.0,...,1.0,"{'model__colsample_bytree': 0.8, 'model__learn...",0.770101,0.770286,0.773962,0.776778,0.774352,0.773096,0.002559,1939
1285,115.348016,3.764505,3.531481,1.438094,0.8,0.2,30,300,100,0.0,...,0.8,"{'model__colsample_bytree': 0.8, 'model__learn...",0.771811,0.768889,0.774200,0.776798,0.773344,0.773008,0.002619,1942
1286,114.455811,2.763382,2.328521,0.010040,0.8,0.2,30,300,100,0.0,...,1.0,"{'model__colsample_bytree': 0.8, 'model__learn...",0.771811,0.768889,0.774200,0.776798,0.773344,0.773008,0.002619,1942


### Tuned Optimal Baseline Model

In [5]:
# model with hyper-parameter
paras = {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.6}
lgbm_model = LGBMClassifier(random_state=42, n_jobs=-1, verbosity=-1, **paras)

# pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # StandardScaler for feature scaling
    ('model', lgbm_model)           # LGBMClassifier model
])

# Train on training data
pipeline.fit(X_train, y_train)

# Evaluation on testing dataaet
y_test_pred = pipeline.predict(X_test)

# Classification Report
y_test_pred_decoded = label_encoder.inverse_transform(y_test_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)
report = classification_report(y_test_decoded, y_test_pred_decoded, target_names=label_encoder.classes_, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(f"Classification Report:\n{report_df}\n")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test_decoded, y_test_pred_decoded)
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
print(f"Confusion Matrix:\n{conf_matrix_df}\n")

Classification Report:
              precision    recall  f1-score       support
BPL            0.743652  0.351027  0.476929   1752.000000
BPS            0.605645  0.881849  0.718104   1752.000000
CHS            0.997714  0.996575  0.997144   1752.000000
CPP            0.987371  0.981735  0.984545   1752.000000
CTF            0.859514  0.827626  0.843268   1752.000000
CTPID          0.791170  0.644406  0.710286   1752.000000
CTS            0.987356  0.980594  0.983963   1752.000000
N              0.562827  0.736301  0.637982   1752.000000
accuracy       0.800014  0.800014  0.800014      0.800014
macro avg      0.816906  0.800014  0.794028  14016.000000
weighted avg   0.816906  0.800014  0.794028  14016.000000

Confusion Matrix:
       BPL   BPS   CHS   CPP   CTF  CTPID   CTS     N
BPL    615  1003     0     1     1     49     1    82
BPS    206  1545     0     0     0      0     1     0
CHS      0     0  1746     0     0      1     0     5
CPP      0     0     0  1720     2      1     

### Saving Reports

In [6]:
with pd.ExcelWriter("classification_report_Baseline.xlsx") as writer:
    report_df.to_excel(writer, sheet_name="Classification Report")
    conf_matrix_df.to_excel(writer, sheet_name="Confusion Matrix")

print("Seved!")

Seved!
