In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, confusion_matrix 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler 
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings("ignore")

In [10]:
df = pd.read_csv('../data/processed_diabetes_data.csv').drop(columns=["Unnamed: 0"], axis=1)
failure_counts = train['Machine failure'].value_counts()

# Print the counts
print("Occurrences of Machine Failure:")
print("Value 0:", failure_counts[0])
print("Value 1:", failure_counts[1])

X = train.drop(['Machine failure'], axis=1)
y = train['Machine failure'].values
X_test = test

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


(136429, 264)
(90954, 263)
Occurrences of Machine Failure:
Value 0: 134281
Value 1: 2148


In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_val)

pca = PCA(n_components=125)  # Choose the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [12]:
rfc= RandomForestClassifier(random_state=42)
rfc.fit(X_train_pca, y_train)
rfc_predictions = rfc.predict(X_test_pca) 
print(classification_report(y_val, rfc_predictions))
print(confusion_matrix(y_val, rfc_predictions))

report = classification_report(y_val, rfc_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, rfc_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'RFC',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

metrics_df = pd.DataFrame(data)
print(metrics_df)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     26848
           1       0.99      0.76      0.86       438

    accuracy                           1.00     27286
   macro avg       0.99      0.88      0.93     27286
weighted avg       1.00      1.00      1.00     27286

[[26843     5]
 [  104   334]]
  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0                RFC   0.995966  0.996005  0.995754  0.996005   

           Confusion Matrix  
0  [[26843, 5], [104, 334]]  


In [13]:
X = train.drop(['Machine failure'], axis=1)
y = train['Machine failure'].values
X_test = test

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_val)

pca = PCA(n_components=125)  # Choose the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [14]:
gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
gbt.fit(X_train_pca,y_train)
gbt_predictions = gbt.predict(X_test_pca) 
print(classification_report(y_val, gbt_predictions))
print(confusion_matrix(y_val, gbt_predictions))

report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'GBT',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)


              precision    recall  f1-score   support

           0       1.00      0.99      1.00     26848
           1       0.67      0.77      0.72       438

    accuracy                           0.99     27286
   macro avg       0.83      0.88      0.86     27286
weighted avg       0.99      0.99      0.99     27286

[[26685   163]
 [  102   336]]
  Sampling Technique  Precision    Recall  F1 Score  Accuracy  \
0                RFC   0.995966  0.996005  0.995754  0.996005   
1                GBT   0.991010  0.990288  0.990599  0.990288   

             Confusion Matrix  
0    [[26843, 5], [104, 334]]  
1  [[26685, 163], [102, 336]]  


In [15]:
X = train.drop(['Machine failure'], axis=1)
y = train['Machine failure'].values
X_test = test

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_val)

pca = PCA(n_components=125)  # Choose the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [16]:
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
params = {
    'objective': 'binary',
    'metric': 'mae',
    'min_child_weight': 60,  # Adjust this value and experiment
    'random_state': 42,
    'max_delta_step': 1,
    'verbose': -1,
    'max_depth': 10, 
}

train_data = lgb.Dataset(X_train_pca, label=y_train)
val_data = lgb.Dataset(X_test_pca, label=y_val)

num_round = 200
bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10)

y_pred_proba = bst.predict(X_test_pca)

y_pred_class = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

print(classification_report(y_val, y_pred_class, zero_division=0))
print(confusion_matrix(y_val, y_pred_class))

report = classification_report(y_val, gbt_predictions, output_dict=True)
conf_matrix = confusion_matrix(y_val, gbt_predictions)

# Extract metrics from classification report
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a DataFrame to store the metrics
data = {
    'Sampling Technique' : 'LGBM',
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score,
    'Accuracy': accuracy,
    'Confusion Matrix': [conf_matrix]
}

df = pd.DataFrame(data)
metrics_df = metrics_df.append(df, ignore_index=True)
print(metrics_df)

metrics_df.to_csv('../data/pcaResults.csv', header=1)

[1]	valid_0's l1: 0.0300821
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 0.0290351
[3]	valid_0's l1: 0.0281168
[4]	valid_0's l1: 0.0272662
[5]	valid_0's l1: 0.0264926
[6]	valid_0's l1: 0.0257964
[7]	valid_0's l1: 0.0251694
[8]	valid_0's l1: 0.0245655
[9]	valid_0's l1: 0.0240188
[10]	valid_0's l1: 0.0235193
[11]	valid_0's l1: 0.0230526
[12]	valid_0's l1: 0.0226225
[13]	valid_0's l1: 0.0222268
[14]	valid_0's l1: 0.0218596
[15]	valid_0's l1: 0.0215185
[16]	valid_0's l1: 0.0211974
[17]	valid_0's l1: 0.0208967
[18]	valid_0's l1: 0.0206122
[19]	valid_0's l1: 0.0203417
[20]	valid_0's l1: 0.020084
[21]	valid_0's l1: 0.0198187
[22]	valid_0's l1: 0.0195578
[23]	valid_0's l1: 0.0193001
[24]	valid_0's l1: 0.0190447
[25]	valid_0's l1: 0.0187892
[26]	valid_0's l1: 0.0185328
[27]	valid_0's l1: 0.0182726
[28]	valid_0's l1: 0.018008
[29]	valid_0's l1: 0.0177382
[30]	valid_0's l1: 0.0174679
[31]	valid_0's l1: 0.0171876
[32]	valid_0's l1: 0.0169029
[33]	valid_0's l1: 0.0