In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from scipy import stats

# Prepare data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
test_ids = test_data['id']

# Preprocess data
scaler = StandardScaler()
le = LabelEncoder()
train_data.drop_duplicates(inplace=True)

# Detect and handle outliers using Z-score method
def remove_outliers_zscore(data, threshold=3):
    z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
    return data[(z_scores < threshold).all(axis=1)]

train_data = remove_outliers_zscore(train_data)

# Define X and y
X = train_data.drop(["Fault Type"], axis=1)
y = train_data["Fault Type"]
y = le.fit_transform(y)
y = to_categorical(y)

# Handle class imbalance with RandomOverSampler
ros = RandomOverSampler(random_state=42)
X, y = ros.fit_resample(X, np.argmax(y, axis=1))
y = to_categorical(y)

# Scale features
X = scaler.fit_transform(X)
test_data = scaler.transform(test_data)

def create_advanced_model():
    model = Sequential()
    model.add(Dense(256, input_dim=X.shape[1], activation='relu', kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(6, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define parameter grids
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 9]
}

# Create RandomizedSearchCV objects
rf_random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=rf_param_grid, cv=3, n_iter=10, scoring='accuracy', verbose=2, random_state=42, n_jobs=-1)
xgb_random_search = RandomizedSearchCV(XGBClassifier(random_state=42), param_distributions=xgb_param_grid, cv=3, n_iter=10, scoring='accuracy', verbose=2, random_state=42, n_jobs=-1)

# Number of folds
n_splits = 3
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Cross-validation loop
nn_accuracies = []
rf_accuracies = []
xgb_accuracies = []
stacked_accuracies = []

for train_index, val_index in kf.split(X, np.argmax(y, axis=1)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Neural Network
    model = create_advanced_model()
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)
    model_save = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_accuracy', mode='max')

    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), callbacks=[model_save, early_stopping, reduce_lr], verbose=0)

    # Evaluate NN model
    y_pred_proba = model.predict(X_val)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_val_labels = np.argmax(y_val, axis=1)
    nn_accuracy = accuracy_score(y_val_labels, y_pred)
    nn_accuracies.append(nn_accuracy)

    # Random Forest
    rf_random_search.fit(X_train, np.argmax(y_train, axis=1))
    best_rf_model = rf_random_search.best_estimator_
    y_pred_rf = best_rf_model.predict(X_val)
    rf_accuracy = accuracy_score(np.argmax(y_val, axis=1), y_pred_rf)
    rf_accuracies.append(rf_accuracy)

    # XGBoost
    xgb_random_search.fit(X_train, np.argmax(y_train, axis=1))
    best_xgb_model = xgb_random_search.best_estimator_
    y_pred_xgb = best_xgb_model.predict(X_val)
    xgb_accuracy = accuracy_score(np.argmax(y_val, axis=1), y_pred_xgb)
    xgb_accuracies.append(xgb_accuracy)

    # Stacked Model
    nn_predictions = model.predict(X_val)
    rf_predictions = best_rf_model.predict_proba(X_val)
    xgb_predictions = best_xgb_model.predict_proba(X_val)
    final_predictions = (nn_predictions + rf_predictions + xgb_predictions) / 3
    y_pred_stacked = np.argmax(final_predictions, axis=1)
    stacked_accuracy = accuracy_score(y_val_labels, y_pred_stacked)
    stacked_accuracies.append(stacked_accuracy)

print(f'Neural Network Accuracy: {np.mean(nn_accuracies):.4f} ± {np.std(nn_accuracies):.4f}')
print(f'Random Forest Accuracy: {np.mean(rf_accuracies):.4f} ± {np.std(rf_accuracies):.4f}')
print(f'XGBoost Accuracy: {np.mean(xgb_accuracies):.4f} ± {np.std(xgb_accuracies):.4f}')
print(f'Stacked Model Accuracy: {np.mean(stacked_accuracies):.4f} ± {np.std(stacked_accuracies):.4f}')

# Train final models on full training data
final_nn_model = create_advanced_model()
final_nn_model.fit(X, y, epochs=50, batch_size=32, verbose=2, callbacks=[early_stopping, reduce_lr])

rf_random_search.fit(X, np.argmax(y, axis=1))
final_rf_model = rf_random_search.best_estimator_

xgb_random_search.fit(X, np.argmax(y, axis=1))
final_xgb_model = xgb_random_search.best_estimator_

# Predict on test data
nn_predictions = final_nn_model.predict(test_data)
rf_predictions = final_rf_model.predict_proba(test_data)
xgb_predictions = final_xgb_model.predict_proba(test_data)

# Average the predictions
final_predictions = (nn_predictions + rf_predictions + xgb_predictions) / 3
y_pred_labels = np.argmax(final_predictions, axis=1)
y_pred_labels_inv = le.inverse_transform(y_pred_labels)

# Prepare submission DataFrame
submission = pd.DataFrame({
    'id': test_ids,
    'Fault Type': y_pred_labels_inv
})

# Save submission
submission.to_csv('Sample_sub.csv', index=False)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


  pid = os.fork()
  pid = os.fork()


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Neural Network Accuracy: 0.9495 ± 0.0047
Random Forest Accuracy: 0.9912 ± 0.0047
XGBoost Accuracy: 0.9949 ± 0.0071
Stacked Model Accuracy: 0.9975 ± 0.0036
Epoch 1/50




25/25 - 2s - loss: 2.2862 - accuracy: 0.3396 - lr: 0.0010 - 2s/epoch - 75ms/step
Epoch 2/50




25/25 - 0s - loss: 1.6131 - accuracy: 0.4697 - lr: 0.0010 - 81ms/epoch - 3ms/step
Epoch 3/50




25/25 - 0s - loss: 1.4080 - accuracy: 0.5556 - lr: 0.0010 - 86ms/epoch - 3ms/step
Epoch 4/50




25/25 - 0s - loss: 1.3222 - accuracy: 0.5758 - lr: 0.0010 - 89ms/epoch - 4ms/step
Epoch 5/50




25/25 - 0s - loss: 1.1767 - accuracy: 0.6263 - lr: 0.0010 - 82ms/epoch - 3ms/step
Epoch 6/50




25/25 - 0s - loss: 1.0821 - accuracy: 0.6730 - lr: 0.0010 - 82ms/epoch - 3ms/step
Epoch 7/50




25/25 - 0s - loss: 1.0481 - accuracy: 0.6755 - lr: 0.0010 - 98ms/epoch - 4ms/step
Epoch 8/50




25/25 - 0s - loss: 1.0017 - accuracy: 0.7020 - lr: 0.0010 - 94ms/epoch - 4ms/step
Epoch 9/50




25/25 - 0s - loss: 0.9450 - accuracy: 0.6944 - lr: 0.0010 - 91ms/epoch - 4ms/step
Epoch 10/50




25/25 - 0s - loss: 0.9362 - accuracy: 0.7273 - lr: 0.0010 - 95ms/epoch - 4ms/step
Epoch 11/50




25/25 - 0s - loss: 0.9382 - accuracy: 0.7298 - lr: 0.0010 - 83ms/epoch - 3ms/step
Epoch 12/50




25/25 - 0s - loss: 0.8265 - accuracy: 0.7841 - lr: 0.0010 - 90ms/epoch - 4ms/step
Epoch 13/50




25/25 - 0s - loss: 0.8383 - accuracy: 0.7740 - lr: 0.0010 - 87ms/epoch - 3ms/step
Epoch 14/50




25/25 - 0s - loss: 0.8114 - accuracy: 0.7904 - lr: 0.0010 - 94ms/epoch - 4ms/step
Epoch 15/50




25/25 - 0s - loss: 0.7595 - accuracy: 0.7967 - lr: 0.0010 - 89ms/epoch - 4ms/step
Epoch 16/50




25/25 - 0s - loss: 0.8166 - accuracy: 0.7841 - lr: 0.0010 - 85ms/epoch - 3ms/step
Epoch 17/50




25/25 - 0s - loss: 0.7933 - accuracy: 0.8018 - lr: 0.0010 - 99ms/epoch - 4ms/step
Epoch 18/50




25/25 - 0s - loss: 0.7674 - accuracy: 0.8005 - lr: 0.0010 - 124ms/epoch - 5ms/step
Epoch 19/50




25/25 - 0s - loss: 0.8131 - accuracy: 0.7790 - lr: 0.0010 - 295ms/epoch - 12ms/step
Epoch 20/50




25/25 - 0s - loss: 0.7021 - accuracy: 0.8043 - lr: 0.0010 - 202ms/epoch - 8ms/step
Epoch 21/50




25/25 - 0s - loss: 0.7448 - accuracy: 0.8093 - lr: 0.0010 - 124ms/epoch - 5ms/step
Epoch 22/50




25/25 - 0s - loss: 0.6704 - accuracy: 0.8409 - lr: 0.0010 - 82ms/epoch - 3ms/step
Epoch 23/50




25/25 - 0s - loss: 0.6783 - accuracy: 0.8384 - lr: 0.0010 - 99ms/epoch - 4ms/step
Epoch 24/50




25/25 - 0s - loss: 0.7329 - accuracy: 0.8030 - lr: 0.0010 - 107ms/epoch - 4ms/step
Epoch 25/50




25/25 - 0s - loss: 0.6903 - accuracy: 0.8258 - lr: 0.0010 - 94ms/epoch - 4ms/step
Epoch 26/50




25/25 - 0s - loss: 0.6383 - accuracy: 0.8434 - lr: 0.0010 - 142ms/epoch - 6ms/step
Epoch 27/50




25/25 - 0s - loss: 0.6973 - accuracy: 0.8308 - lr: 0.0010 - 158ms/epoch - 6ms/step
Epoch 28/50




25/25 - 0s - loss: 0.6618 - accuracy: 0.8447 - lr: 0.0010 - 139ms/epoch - 6ms/step
Epoch 29/50




25/25 - 0s - loss: 0.6521 - accuracy: 0.8460 - lr: 0.0010 - 145ms/epoch - 6ms/step
Epoch 30/50




25/25 - 0s - loss: 0.6283 - accuracy: 0.8422 - lr: 0.0010 - 124ms/epoch - 5ms/step
Epoch 31/50




25/25 - 0s - loss: 0.6002 - accuracy: 0.8662 - lr: 0.0010 - 196ms/epoch - 8ms/step
Epoch 32/50




25/25 - 0s - loss: 0.5901 - accuracy: 0.8725 - lr: 0.0010 - 86ms/epoch - 3ms/step
Epoch 33/50




25/25 - 0s - loss: 0.6184 - accuracy: 0.8472 - lr: 0.0010 - 142ms/epoch - 6ms/step
Epoch 34/50




25/25 - 0s - loss: 0.5978 - accuracy: 0.8561 - lr: 0.0010 - 158ms/epoch - 6ms/step
Epoch 35/50




25/25 - 0s - loss: 0.6249 - accuracy: 0.8460 - lr: 0.0010 - 147ms/epoch - 6ms/step
Epoch 36/50




25/25 - 0s - loss: 0.5603 - accuracy: 0.8902 - lr: 0.0010 - 133ms/epoch - 5ms/step
Epoch 37/50




25/25 - 0s - loss: 0.5689 - accuracy: 0.8801 - lr: 0.0010 - 147ms/epoch - 6ms/step
Epoch 38/50




25/25 - 0s - loss: 0.5678 - accuracy: 0.8699 - lr: 0.0010 - 187ms/epoch - 7ms/step
Epoch 39/50




25/25 - 0s - loss: 0.5531 - accuracy: 0.8750 - lr: 0.0010 - 143ms/epoch - 6ms/step
Epoch 40/50




25/25 - 0s - loss: 0.5629 - accuracy: 0.8725 - lr: 0.0010 - 152ms/epoch - 6ms/step
Epoch 41/50




25/25 - 0s - loss: 0.5330 - accuracy: 0.8851 - lr: 0.0010 - 133ms/epoch - 5ms/step
Epoch 42/50




25/25 - 0s - loss: 0.5330 - accuracy: 0.8801 - lr: 0.0010 - 126ms/epoch - 5ms/step
Epoch 43/50




25/25 - 0s - loss: 0.5437 - accuracy: 0.8801 - lr: 0.0010 - 154ms/epoch - 6ms/step
Epoch 44/50




25/25 - 0s - loss: 0.5067 - accuracy: 0.8851 - lr: 0.0010 - 168ms/epoch - 7ms/step
Epoch 45/50




25/25 - 0s - loss: 0.5617 - accuracy: 0.8699 - lr: 0.0010 - 160ms/epoch - 6ms/step
Epoch 46/50




25/25 - 0s - loss: 0.5024 - accuracy: 0.8990 - lr: 0.0010 - 153ms/epoch - 6ms/step
Epoch 47/50




25/25 - 0s - loss: 0.5498 - accuracy: 0.8851 - lr: 0.0010 - 169ms/epoch - 7ms/step
Epoch 48/50




25/25 - 0s - loss: 0.5025 - accuracy: 0.8813 - lr: 0.0010 - 156ms/epoch - 6ms/step
Epoch 49/50




25/25 - 0s - loss: 0.4837 - accuracy: 0.8990 - lr: 0.0010 - 145ms/epoch - 6ms/step
Epoch 50/50




25/25 - 0s - loss: 0.5221 - accuracy: 0.8838 - lr: 0.0010 - 175ms/epoch - 7ms/step
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
