### Imports

In [None]:
# ML imports
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder 
from xgboost import XGBClassifier 

# Data analysis and stats imports
import numpy as np
import pandas as pd
from scipy.stats import expon, reciprocal

# Data visualization imports
import seaborn as sns
import matplotlib.pyplot as plt 


### Get data

In [None]:
# Directory holding the three preprocessed splits.
# NOTE(review): absolute, machine-specific path -- adjust it to your local checkout.
# Every backslash is escaped here; the original first path contained the invalid
# escape sequence '\M', which only worked by accident and triggers a warning.
DATA_DIR = 'C:\\Users\\Kamen\\ML\\Projet-STT-3795\\src'

train_df = pd.read_csv(f'{DATA_DIR}\\train_preprocessed_data')
test_df = pd.read_csv(f'{DATA_DIR}\\test_preprocessed_data')
validation_df = pd.read_csv(f'{DATA_DIR}\\validation_preprocessed_data')

# Stack the three splits into one frame with a fresh 0..n-1 index, drop the first
# CSV column (presumably a saved row index -- TODO confirm) and rename column '0'
# to 'label'. Built as a new frame instead of renaming in place, so re-running
# this cell always gives the same result.
df = (
    pd.concat([train_df, test_df, validation_df], ignore_index=True)
    .iloc[:, 1:]
    .rename(columns={'0': 'label'})
)

# Every column except the last one (assumed to be the label column -- verify).
df_without_label = df.iloc[:, 0:-1]

df_without_label

### Principal components

In [None]:
def get_PCs(dataframe, percentage_variance):
    """Standardize ``dataframe`` and project it onto its leading principal components.

    The data is scaled with ``StandardScaler`` and then reduced with a PCA that
    keeps enough components to explain ``percentage_variance`` percent of the
    variance. A short summary (mean/std of the scaled data, total variance,
    achieved ratio, number of components kept and dropped) is printed.

    Parameters
    ----------
    dataframe : array-like of numeric features
        Feature table, one row per sample.
    percentage_variance : float
        Target share of explained variance, in percent, in the range (0, 100].
        A value of exactly 100 keeps every component.

    Returns
    -------
    pandas.DataFrame
        The principal-component scores, with columns named by
        ``PCA.get_feature_names_out()`` and the same row index as ``dataframe``
        when it has one.

    Raises
    ------
    ValueError
        If ``percentage_variance`` is outside (0, 100].
    """
    if not 0 < percentage_variance <= 100:
        raise ValueError(
            f'percentage_variance must be in (0, 100], got {percentage_variance}'
        )

    print()
    scaler = StandardScaler()
    scaled_df = scaler.fit_transform(dataframe)
    print(f'Scaled_df Mean = {np.mean(scaled_df)},\nScaled_df Std = {np.std(scaled_df)}')


    # Full decomposition, needed only to report the total variance: fit() is
    # enough, there is no reason to also compute (and discard) the projection.
    pca_T = PCA()
    pca_T.fit(scaled_df)
    ev = pca_T.explained_variance_
    print()
    print(f'Total variance = {sum(ev)}')

    # PCA accepts a float n_components only strictly between 0 and 1, so a request
    # for 100% is expressed as None (keep all components).
    n_components = None if percentage_variance == 100 else percentage_variance / 100
    pca = PCA(n_components)
    principal_components = pca.fit_transform(scaled_df)
    explained_variance = pca.explained_variance_
    percentage = sum(pca.explained_variance_ratio_)
    print(f'Real percentage = {percentage}')
    print(f'Variance for {round(percentage*100, 2)}% = {sum(explained_variance)}')
    print(f'Number of PCs for {round(percentage*100, 2)}% = {len(explained_variance)}')
    print(f'Attribute lost = {scaled_df.shape[1] - len(explained_variance)}')
    names = pca.get_feature_names_out()
    # Carry the caller's row index over so that index-aligned operations on the
    # result (e.g. attaching a label column) match the right rows.
    return pd.DataFrame(
        data=principal_components,
        columns=names,
        index=getattr(dataframe, 'index', None),
    )


In [None]:
# Keep the components explaining 99% of the variance and attach the label column
# (aligned on the row index) in a single expression.
df_pca = get_PCs(df_without_label, 99).assign(label=df['label'])
df_pca

### Models

Train/test split (RFC and SVM baselines, currently disabled)

In [None]:
# Stratified 70/30 hold-out split: features are every df_pca column except the
# last one, the target is the label column of df.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_pca.iloc[:, 0:-1],
    df['label'],
    stratify=df['label'],
    test_size=0.3,
    random_state=42,
)

# Disabled baseline models, kept for reference only. They are written as comments
# rather than a bare string literal, which the notebook would echo as this cell's
# output. The predictions now come from the matching model (the earlier draft
# took all of them from `svm`).
#
# # Model initialization
# svm = SVC(verbose=3)
# random_forest = RandomForestClassifier(verbose=3)
# nn = MLPClassifier(verbose=3)
#
# # Models fitting
# print("Training SVM")
# svm.fit(X_train, Y_train)
# print("Training RFC")
# random_forest.fit(X_train, Y_train)
# #nn.fit(X_train, Y_train)
#
# # Models prediction
# svm_predictions = svm.predict(X_test)
# random_forest_predictions = random_forest.predict(X_test)
# #nn_predictions = nn.predict(X_test)


XGBClassifier

In [None]:
xgbc = XGBClassifier()

# Languages -> int
# The encoder is fitted on the training labels only; the test labels reuse that
# same mapping. Refitting on the test set could assign different integers to the
# same class whenever the two label sets differ.
# NOTE: this cell overwrites Y_train / Y_test with their encoded versions, so
# re-run the split cell before re-running this one.
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(Y_train)
Y_test = label_encoder.transform(Y_test)

xgbc.fit(X_train, Y_train)
xgbc_predictions = xgbc.predict(X_test)

# Decode once and reuse for every metric so the reports show the original labels.
xgbc_true_labels = label_encoder.inverse_transform(Y_test)
xgbc_pred_labels = label_encoder.inverse_transform(xgbc_predictions)

xgbc_cm = confusion_matrix(xgbc_true_labels, xgbc_pred_labels)
xgbc_report = classification_report(xgbc_true_labels, xgbc_pred_labels)
xgbc_accuracy = accuracy_score(xgbc_true_labels, xgbc_pred_labels)
print(xgbc_cm)
print(xgbc_report)

RandomizedSearch XGBoost

In [None]:
# Candidate values sampled by the randomized search.
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [80, 100, 200],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'alpha': [1e-5, 1e-2, 0.1, 1, 100],
}

xgbc = XGBClassifier(random_state=42)

# F1 = 2 * precision * recall / (precision + recall).
# The plain 'f1' scorer uses binary averaging and fails on targets with more than
# two classes, so macro averaging is used in that case; a two-class target keeps
# the original binary F1.
n_classes = len(set(Y_train))
f1_average = 'binary' if n_classes == 2 else 'macro'
f1_scoring = 'f1' if n_classes == 2 else 'f1_macro'

# RandomizedSearchCV takes its search space as `param_distributions`
# (`param_grid` is the GridSearchCV keyword and raises a TypeError here).
grid_search = RandomizedSearchCV(
    estimator=xgbc,
    param_distributions=param_grid,
    n_iter=100,
    scoring=f1_scoring,
    cv=5,
    verbose=1,
    random_state=42,
)
grid_search.fit(X_train, Y_train)
# Print the best parameters and the best F1 score
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best F1 score: {grid_search.best_score_}")
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test)
# Same averaging as the cross-validation scorer, so the two numbers are comparable.
test_f1_score = f1_score(Y_test, y_pred, average=f1_average)

print(f"Test F1 Score: {test_f1_score}")

RandomizedSearch SVM

In [None]:
# Search space: C and gamma are drawn from continuous distributions, the kernel
# from a fixed list.
param_grid = {
    'C': reciprocal(0.001, 1000),
    'gamma': expon(scale=1.0),
    'kernel': ['linear', 'rbf', 'poly']
}

# The plain 'f1' scorer uses binary averaging and fails on targets with more than
# two classes; macro-averaged F1 is used in that case, binary F1 otherwise.
svm_scoring = 'f1' if len(set(Y_train)) == 2 else 'f1_macro'

svm_clf = SVC(random_state=42)
random_search_svm = RandomizedSearchCV(
    svm_clf,
    param_grid,
    n_iter=100,
    verbose=2,
    cv=5,
    random_state=42,
    n_jobs=-1,
    scoring=svm_scoring,
)
random_search_svm.fit(X_train, Y_train)
print("Best parameters for SVM:", random_search_svm.best_params_)
print("Best score:", random_search_svm.best_score_)

RandomizedSearch Random Forest Classifier

In [None]:
# Candidate values sampled by the randomized search.
param_grid_rf = {
    'n_estimators': [80, 100, 200],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is intentionally absent: for classifiers it was an alias of 'sqrt'
    # and recent scikit-learn releases reject it, which made those candidates fail.
    'max_features': ['sqrt', 'log2', None]
}

# The plain 'f1' scorer uses binary averaging and fails on targets with more than
# two classes; macro-averaged F1 is used in that case, binary F1 otherwise.
rf_scoring = 'f1' if len(set(Y_train)) == 2 else 'f1_macro'

rfc = RandomForestClassifier(random_state=42)

random_search_rf = RandomizedSearchCV(
    rfc,
    param_grid_rf,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring=rf_scoring,
)
random_search_rf.fit(X_train, Y_train)
print("Best parameters:", random_search_rf.best_params_)
print("Best score:", random_search_rf.best_score_)

XGBClassifier evaluation

In [None]:
# Decode the integer targets/predictions back to the original labels once and
# reuse them for every metric below.
xgbc_true_labels = label_encoder.inverse_transform(Y_test)
xgbc_pred_labels = label_encoder.inverse_transform(xgbc_predictions)

# Sorted set of labels present in either array. Passing it explicitly fixes the
# row/column order of the matrix so the heatmap ticks can show the class names.
xgbc_class_names = sorted(set(xgbc_true_labels) | set(xgbc_pred_labels))

xgbc_cm = confusion_matrix(xgbc_true_labels, xgbc_pred_labels, labels=xgbc_class_names)
xgbc_report = classification_report(xgbc_true_labels, xgbc_pred_labels)
xgbc_accuracy = accuracy_score(xgbc_true_labels, xgbc_pred_labels)
print(xgbc_accuracy)
print(xgbc_cm)
# Plotting the confusion matrix (rows = true label, columns = predicted label)
plt.figure(figsize=(10,7))
sns.heatmap(
    xgbc_cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=xgbc_class_names,
    yticklabels=xgbc_class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
print(xgbc_report)