In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Dropout
from keras.optimizers import Adam

# Load dataset
df = pd.read_csv('warehouse_temperature_humidity_2023.csv')

# Convert timestamp to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Convert HVAC_Status to numeric
df['HVAC_Status'] = df['HVAC_Status'].apply(lambda x: 1 if x == 'on' else 0)

# Outlier removal using the IQR method
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers from Temperature and Humidity columns
df = remove_outliers(df, 'Temperature')
df = remove_outliers(df, 'Humidity')

# Define features and target
X = df[['Temperature', 'Humidity', 'HVAC_Status']]
y = df['Failure']

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance with SMOTE
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Function to evaluate the model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    return accuracy, roc_auc, classification_report(y_test, y_pred)

# Model selection
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

results = {}
for model_name, model in models.items():
    accuracy, roc_auc, report = evaluate_model(model, X_train_scaled, y_train_resampled, X_test_scaled, y_test)
    
    results[model_name] = {
        "Accuracy": accuracy,
        "ROC AUC": roc_auc,
        "Classification Report": report
    }

# Display the results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"ROC AUC: {metrics['ROC AUC']:.4f}")
    print(metrics['Classification Report'])
    print("-" * 60)

# Example of hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}

grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search_rf.fit(X_train_scaled, y_train_resampled)

best_rf = grid_search_rf.best_estimator_
print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Evaluate the best Random Forest model
accuracy, roc_auc, report = evaluate_model(best_rf, X_train_scaled, y_train_resampled, X_test_scaled, y_test)

print(f"Best Random Forest Model Accuracy: {accuracy:.4f}")
print(f"Best Random Forest Model ROC AUC: {roc_auc:.4f}")
print(report)

# Function to build a CNN model
def build_cnn(input_shape):
    model = Sequential()
    model.add(Conv1D(32, kernel_size=2, activation='relu', input_shape=input_shape))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # For binary classification
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Reshape data for CNN
X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)  # (samples, features, 1)
X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Build and train the CNN model
cnn_model = build_cnn((X_train_cnn.shape[1], 1))
cnn_model.fit(X_train_cnn, y_train_resampled, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the CNN model
cnn_accuracy, cnn_roc_auc, cnn_report = evaluate_model(cnn_model, X_train_cnn, y_train_resampled, X_test_cnn, y_test)

print(f"CNN Model Accuracy: {cnn_accuracy:.4f}")
print(f"CNN Model ROC AUC: {cnn_roc_auc:.4f}")
print(cnn_report)

# Visualization for comparison
model_names = list(results.keys()) + ["CNN"]
accuracies = [results[model]['Accuracy'] for model in model_names if model in results] + [cnn_accuracy]
roc_aucs = [results[model]['ROC AUC'] for model in model_names if model in results] + [cnn_roc_auc]

plt.figure(figsize=(12, 6))

# Create subplots for Accuracy and ROC AUC
plt.subplot(1, 2, 1)
plt.bar(model_names + ["CNN"], accuracies, color=['blue', 'orange', 'green', 'red', 'purple'])
plt.ylim(0, 1)
plt.title('Model Accuracy')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.grid(axis='y')

plt.subplot(1, 2, 2)
plt.bar(model_names + ["CNN"], roc_aucs, color=['blue', 'orange', 'green', 'red', 'purple'])
plt.ylim(0, 1)
plt.title('Model ROC AUC')
plt.xlabel('Models')
plt.ylabel('ROC AUC')
plt.grid(axis='y')

# Show the plots
plt.tight_layout()
plt.show()


Model: Random Forest
Accuracy: 0.5731
ROC AUC: 0.4925
              precision    recall  f1-score   support

           0       0.72      0.67      0.69      1264
           1       0.28      0.33      0.30       488

    accuracy                           0.57      1752
   macro avg       0.50      0.50      0.50      1752
weighted avg       0.60      0.57      0.58      1752

------------------------------------------------------------
Model: Logistic Regression
Accuracy: 0.5051
ROC AUC: 0.5107
              precision    recall  f1-score   support

           0       0.73      0.50      0.60      1264
           1       0.28      0.51      0.36       488

    accuracy                           0.51      1752
   macro avg       0.50      0.51      0.48      1752
weighted avg       0.60      0.51      0.53      1752

------------------------------------------------------------
Model: SVM
Accuracy: 0.5051
ROC AUC: 0.5169
              precision    recall  f1-score   support

           

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
print(df['Failure'].value_counts())