In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Read the training table from a CSV path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='processed_data/train_data.csv')

# Let pandas show up to 500 rows before it starts truncating displayed tables.
pd.set_option('display.max_rows', 500)

In [None]:
# df.head(10)  # disabled: uncomment to preview the first 10 rows of df

In [None]:
def plot_feature_importance(model, model_name, feature_names):
    """Draw a bar chart of a model's feature importances, largest first.

    Parameters
    ----------
    model : object
        Estimator to inspect. It is plotted only if it exposes a
        ``feature_importances_`` attribute; otherwise a notice is printed
        and nothing is drawn.
    model_name : str
        Label used in the plot title and in the fallback notice.
    feature_names : iterable
        One label per feature, in the same order as
        ``model.feature_importances_``. Any iterable is accepted
        (list, tuple, NumPy array, pandas Index, ...).

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of
        importances.
    """
    if not hasattr(model, 'feature_importances_'):
        print(f"Model {model_name} does not have feature_importances_ attribute")
        return

    importance = np.asarray(model.feature_importances_)

    # Materialise the labels as a list so they can be reordered element by
    # element; indexing a plain list/tuple with an index array would raise.
    names = list(feature_names)
    if len(names) != len(importance):
        raise ValueError(
            f"Got {len(names)} feature names but {len(importance)} "
            f"feature importances for model {model_name}"
        )

    # Positions of the features sorted by descending importance.
    indices = np.argsort(importance)[::-1]
    ordered_names = [names[i] for i in indices]
    positions = range(len(names))

    plt.figure(figsize=(12, 8))
    plt.title(f'Feature Importance: {model_name}')
    plt.bar(positions, importance[indices], align='center')
    plt.xticks(positions, ordered_names, rotation=90)
    plt.xlim([-1, len(names)])
    plt.show()

In [None]:
# Target: the class label of every row.
y = df['Label']

# Keep only rows whose class occurs at least twice; the split below is
# stratified on y.
y = y.loc[y.map(y.value_counts()).ge(2)]

# Features: everything except the label and the RunID column, restricted to
# the rows that survived the class filter.
X = df.drop(columns=['Label', 'RunID']).loc[y.index]

# Hold out 20% of the rows for testing, preserving the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features using statistics learned from the training split
# only, then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate classifiers and the hyperparameter grid searched for each one.
# Maps a display name to an (unfitted estimator, param_grid) pair.
#
# The tree-based estimators are randomised internally, so they are seeded
# (same value as the train/test split) to make the tuning results and the
# reported accuracies reproducible across notebook runs.
classifiers = {
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100, 150, 200, 250],
        'max_depth': [None, 5, 10, 15, 20, 25],
        'min_samples_split': [2, 5, 10, 15, 20]
    }),
    'GradientBoosting': (GradientBoostingClassifier(random_state=42), {
        'n_estimators': [50, 100, 150, 200, 250],
        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
        'max_depth': [3, 5, 7, 9, 11]
    }),
    # NOTE: gamma only affects the rbf/poly/sigmoid kernels, so the linear
    # kernel is fitted once per gamma value with identical results.
    'SVC': (SVC(), {
        'C': [0.1, 0.5, 1, 5, 10],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
    }),
    'KNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance']
    }),
    'DecisionTree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10, 15, 20]
    }),
    'NaiveBayes': (GaussianNB(), {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    })
}


# best_models: classifier name -> best estimator found by the grid search.
# results: one summary record (dict) per classifier, in evaluation order.
best_models, results = {}, []

# For every candidate: grid-search its hyperparameters with 5-fold CV on the
# training split, then score the winning estimator on the held-out test split.
for clf_name, (clf, param_grid) in classifiers.items():
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Models without feature_importances_ just print a notice here.
    plot_feature_importance(best_model, clf_name, X.columns)

    # Held-out evaluation of the tuned model.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)

    best_models[clf_name] = best_model
    results.append({
        'Classifier': clf_name,
        'Best Params': grid_search.best_params_,
        'Accuracy': accuracy,
        'Classification Report': report
    })

    # Per-classifier log, followed by a separator rule.
    print(
        f"Classifier: {clf_name}",
        f"Best Params: {grid_search.best_params_}",
        f"Accuracy: {accuracy}",
        "Classification Report:",
        report,
        "\n" + "="*80 + "\n",
        sep="\n",
    )

# Tabulate the collected records and print the table as plain text.
results_df = pd.DataFrame(data=results)
print(results_df)