**Binary Classification**

In [50]:
import pandas as pd
from xgboost import XGBClassifier
train_data = pd.read_csv('modified_data_train.csv')
test_data  = pd.read_csv('modified_data_test.csv')
train_data['binary_fault'] = train_data['faultNumber'].apply(lambda x: 0 if x == 0 else 1)
test_data['binary_fault'] = test_data['faultNumber'].apply(lambda x: 0 if x == 0 else 1)
drop_cols = ['faultNumber', 'simulationRun', 'sample','binary_fault']
X_train = train_data.drop(columns=drop_cols, errors='ignore')
y_train = train_data['binary_fault']
X_test = test_data.drop(columns=drop_cols, errors='ignore')
y_test = test_data['binary_fault']
common_cols = sorted(list(set(X_train.columns).intersection(set(X_test.columns))))
X_train = X_train[common_cols]
X_test  = X_test[common_cols]
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [51]:
y_train.value_counts()

Unnamed: 0_level_0,count
binary_fault,Unnamed: 1_level_1
1,24480
0,1440


In [52]:
model=XGBClassifier(n_estimators=100)

In [53]:
model.fit(X_train,y_train)

In [54]:
y_pred=model.predict(X_test)

In [55]:
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9269700551615445
Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.08      0.10      2820
           1       0.95      0.98      0.96     47940

    accuracy                           0.93     50760
   macro avg       0.56      0.53      0.53     50760
weighted avg       0.90      0.93      0.91     50760



Classic example of overfitting as the data is quite imbalanced, let us try addressing the imbalance problem by oversampling the minority class.

In [56]:
from imblearn.over_sampling import RandomOverSampler
RO=RandomOverSampler(random_state=42)
X_train_ro, y_train_ro=  RO.fit_resample(X_train,y_train)
model=XGBClassifier(n_estimators=100)

In [57]:
model.fit(X_train_ro,y_train_ro)

In [58]:
y_pred=model.predict(X_test)

In [59]:
accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred))

Test Accuracy: 0.823187549251379
Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.65      0.29      2820
           1       0.98      0.83      0.90     47940

    accuracy                           0.82     50760
   macro avg       0.58      0.74      0.60     50760
weighted avg       0.93      0.82      0.87     50760



In [60]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
model=XGBClassifier(n_estimators=100)
model.fit(X_train_resampled, y_train_resampled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.19      0.65      0.29      2820
           1       0.98      0.83      0.90     47940

    accuracy                           0.82     50760
   macro avg       0.58      0.74      0.60     50760
weighted avg       0.93      0.82      0.87     50760



In [61]:
from imblearn.under_sampling import RandomUnderSampler
RU=RandomUnderSampler(random_state=42)
X_train_ru,y_train_ru=RU.fit_resample(X_train,y_train)
model=XGBClassifier(n_estimators=100)
model.fit(X_train_ru,y_train_ru)
y_pred=model.predict(X_test)

In [62]:
accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred))

Test Accuracy: 0.6900512214342002
Classification Report:
              precision    recall  f1-score   support

           0       0.15      0.97      0.26      2820
           1       1.00      0.67      0.80     47940

    accuracy                           0.69     50760
   macro avg       0.57      0.82      0.53     50760
weighted avg       0.95      0.69      0.77     50760



**Multiclass Classification**

In [63]:
X_train = train_data.drop(columns=drop_cols, errors='ignore')
y_train = train_data['faultNumber']

In [64]:
y_train.value_counts()

Unnamed: 0_level_0,count
faultNumber,Unnamed: 1_level_1
0.0,1440
1.0,1440
19.0,1440
18.0,1440
17.0,1440
16.0,1440
14.0,1440
13.0,1440
12.0,1440
11.0,1440


Now, this a balanced dataset where each class has almost same number of points

In [65]:
X_test =test_data.drop(columns=drop_cols, errors='ignore')
y_test = test_data['faultNumber']

In [66]:
y_test.value_counts()

Unnamed: 0_level_0,count
faultNumber,Unnamed: 1_level_1
0,2820
1,2820
19,2820
18,2820
17,2820
16,2820
14,2820
13,2820
12,2820
11,2820


In [67]:
common_cols = sorted(list(set(X_train.columns).intersection(set(X_test.columns))))

In [68]:
X_train = X_train[common_cols]
X_test  = X_test[common_cols]
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [69]:
import numpy as np

print("Unique classes in y_train:", np.unique(y_train))
print("Unique classes in y_test:", np.unique(y_test))

Unique classes in y_train: [ 0  1  2  4  5  6  7  8 10 11 12 13 14 16 17 18 19 20]
Unique classes in y_test: [ 0  1  2  4  5  6  7  8 10 11 12 13 14 16 17 18 19 20]


In [70]:
model=XGBClassifier(n_estinators=100)
model.fit(X_train,y_train)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17], got [ 0  1  2  4  5  6  7  8 10 11 12 13 14 16 17 18 19 20]

The error occurs as XGBoost is expecting values in y from 0 to 17 but here in our y_train and y_test have 3,9 and 15 missing. Thus, we need LabelEncoder

In [71]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)  # Use same encoding for test data

print("New unique classes in y_train:", np.unique(y_train_encoded))
print("New unique classes in y_test:", np.unique(y_test_encoded))


New unique classes in y_train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
New unique classes in y_test: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]


Now let us keep one thing in focus that mapping of fault number is different than y_train_encoded and y_test_encoded. Here, [ 0  1  2  4  5  6  7  8 10 11 12 13 14 16 17 18 19 20] is now [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]. Now, let's train the model

In [72]:
model = XGBClassifier(n_estimators=100)
model.fit(X_train, y_train_encoded)

In [73]:
y_pred=model.predict(X_test)
y_pred_original = encoder.inverse_transform(y_pred)

In [74]:
accuracy = accuracy_score(y_test, y_pred_original)
print('Test Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred_original))

Test Accuracy: 0.6746650906225374
Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.59      0.29      2820
           1       0.95      0.84      0.89      2820
           2       1.00      0.84      0.91      2820
           4       0.94      0.84      0.89      2820
           5       0.28      0.26      0.27      2820
           6       1.00      0.85      0.92      2820
           7       1.00      0.85      0.92      2820
           8       0.92      0.65      0.76      2820
          10       0.70      0.45      0.54      2820
          11       0.85      0.70      0.77      2820
          12       0.69      0.52      0.59      2820
          13       0.95      0.65      0.77      2820
          14       0.98      0.83      0.90      2820
          16       0.29      0.41      0.34      2820
          17       0.94      0.78      0.85      2820
          18       0.88      0.78      0.82      2820
          19       0.68 

Here we can see a decent test accuracy,precision and f1-score.

In [76]:
import numpy as np

# Ensure exactly 37 features (removed one value)
new_data = np.array([[12.5, 4.3, 18.6, 22.1, 7.9, 5.5, 16.2, 3.8, 11.9, 14.0,
                      20.3, 9.8, 13.1, 17.5, 15.6, 19.2, 10.4, 6.7, 8.9, 12.3,
                      11.5, 14.8, 16.9, 7.3, 9.5, 15.0, 18.2, 19.8, 13.9, 10.7,
                      16.1, 12.2, 7.8, 14.4, 11.0, 13.7, 17.3]])  # Now 37 values


# Predict
fault_prediction1 = model.predict(new_data)
fault_prediction = encoder.inverse_transform(fault_prediction1)
# Result
if fault_prediction[0] == 0:
    print("The data point is Non-Faulty ")
else:
    print(f"The data point is Faulty  with Fault Number: {fault_prediction[0]}")

The data point is Faulty  with Fault Number: 17
