In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import os
import re


from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warnings raised
# by the modelling cells below; consider narrowing it to specific categories
# once the notebook is stable.
warnings.filterwarnings('ignore')

In [3]:
# Load the pre-processed feature table.
df = pd.read_csv("./data/processed_csv.csv")

target = 'sg_encoded'
drop_cols = ['file']

# Features = every numeric column except the target and the columns listed in
# drop_cols. errors='ignore' lets the drop succeed when a listed column is absent.
# NOTE(review): the saved classification report further down shows near-perfect
# scores; worth confirming that none of the remaining numeric columns is
# derived from the target.
X = (
    df
    .drop(columns=[target] + drop_cols, errors='ignore')
    .select_dtypes(include=[np.number])
)
y = df[target]

# Keep only the classes that occur at least twice; the same row mask is
# applied to both the features and the labels so they stay aligned.
value_counts = y.value_counts()
sufficient_classes = value_counts[value_counts >= 2].index
keep_rows = y.isin(sufficient_classes)
X = X[keep_rows]
y = y[keep_rows]

In [4]:
# Hold out 20% of the rows for testing. The split is stratified on the labels
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# Repeats the numeric-only filter that was already applied when X was built.
# The train/test frames produced by the earlier split are separate objects and
# are not changed by this reassignment.
X = X.select_dtypes(include=np.number)

In [None]:
# Seed shared by every estimator that draws random numbers, so that the
# cross-validation ranking and the reported metrics are reproducible from run
# to run (the train/test split is seeded with the same value).
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in tables and plots.
# KNN takes no seed; LogisticRegression is left at its defaults apart from
# max_iter.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    # Currently disabled:
#    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(),
    # probability=True enables an internal, randomised calibration step, so
    # the seed matters here as well.
    "SVM": SVC(probability=True, random_state=RANDOM_STATE)
}

In [7]:
# Per-model fold accuracies from 5-fold cross-validation on the training split.
# Each model is wrapped in a pipeline so that scaling is fitted inside each fold.
cv_scores = {}

for name, model in tqdm(models.items()):
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('clf', model)])
    scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        scoring='accuracy',
        cv=5,
    )
    cv_scores[name] = scores

# (model name, mean CV accuracy) pairs, in the same order as `models`.
results = [(name, fold_scores.mean()) for name, fold_scores in cv_scores.items()]

100%|██████████| 6/6 [07:26<00:00, 74.43s/it] 


In [8]:
# Rank the models by mean cross-validated accuracy, best first. Sorting a copy
# leaves `results` in its original order.
sorted_results = list(results)
sorted_results.sort(key=lambda entry: entry[1], reverse=True)

# The top entry names the winner; look up its estimator instance.
best_model_name, best_cv_accuracy = sorted_results[0]
best_model = models[best_model_name]

In [9]:
# Fit the winning model (with scaling) on the full training split and predict
# the held-out test rows.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', best_model),
]).fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

In [10]:
# Per-class precision/recall/F1 of the best model on the held-out test split.
# zero_division=0 matches the setting used for the summary metrics elsewhere in
# this notebook: classes with no true or no predicted samples are reported as 0
# explicitly instead of relying on the default warn-and-zero behaviour.
print(f"Classification Report for {best_model_name}")
print(classification_report(y_test, y_pred, zero_division=0))

Classification Report for DecisionTree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           3       1.00      1.00      1.00        14
           4       1.00      1.00      1.00       167
           5       1.00      1.00      1.00        14
           6       1.00      1.00      1.00        22
           7       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         6
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         3
          15       1.00      1.00      1.00         3
          16       1.00      1.00      1.00         1
          17       1.00      1.00      1.00         1
          19       0.00      0.00      0.00         0
          22       1.00      1.00      1.00         2
          23       0.67      1.00      0.8

In [None]:
# Bar chart of mean cross-validated accuracy per model, best model first.
labels, values = [], []
for model_label, mean_accuracy in sorted_results:
    labels.append(model_label)
    values.append(mean_accuracy)

fig, ax = plt.subplots(figsize=(10, 6))
barplot = sns.barplot(x=labels, y=values, palette="Set2", edgecolor='black', ax=ax)

# Write each score just above its bar.
for position, accuracy in enumerate(values):
    ax.text(
        position,
        accuracy + 0.005,
        f"{accuracy:.2f}",
        ha='center',
        va='bottom',
        fontsize=9,
    )

plt.xticks(rotation=45)
ax.set_ylabel("Cross-Validated Accuracy")
ax.set_title("Model Comparison")

plt.tight_layout()
plt.show()

In [12]:
# Fit every candidate on the training split and score it on the held-out test
# split. Collects one row of summary metrics per model plus the raw predictions.
metrics_result = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': []
}

predictions_dict = {}

# Metrics that are computed as a support-weighted average over classes.
weighted_scorers = {
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

for name, model in tqdm(models.items()):
    # Loop-local names are used on purpose: the best-model cell defines
    # `pipeline` and `y_pred`, and rebinding them here would make a later
    # re-run of the classification report show the last model's predictions
    # under the best model's name.
    model_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', model)
    ])
    model_pipeline.fit(X_train, y_train)
    model_pred = model_pipeline.predict(X_test)

    predictions_dict[name] = model_pred

    metrics_result['Model'].append(name)
    metrics_result['Accuracy'].append(accuracy_score(y_test, model_pred))
    for metric_name, scorer in weighted_scorers.items():
        metrics_result[metric_name].append(
            scorer(y_test, model_pred, average='weighted', zero_division=0)
        )

# Best accuracy first. A new frame is returned rather than sorting in place.
metrics_df = pd.DataFrame(metrics_result).sort_values("Accuracy", ascending=False)

100%|██████████| 6/6 [01:47<00:00, 17.96s/it]


In [None]:
# One horizontal bar chart per metric, stacked vertically with a shared x-axis.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
fig, axes = plt.subplots(len(metric_names), 1, figsize=(10, 20), sharex=True)

for ax, metric in zip(axes, metric_names):
    sns.barplot(x=metric, y='Model', data=metrics_df, ax=ax, palette='crest')
    ax.set_title(metric)
    # Annotate each bar with its value, slightly to the right of the bar end.
    for row_pos, score in enumerate(metrics_df[metric]):
        ax.text(score + 0.01, row_pos, f"{score:.2f}", va='center')

plt.tight_layout()
plt.show()

In [14]:
# Scatter of true vs. predicted label for a random sample of test rows, one
# colour per model; circles mark correct predictions and crosses mark wrong ones.

# Draw at most 30 rows; capping at len(y_test) keeps the cell from raising when
# the test split is smaller than the requested sample.
n_samples = min(30, len(y_test))
sample_indexes = y_test.sample(n_samples, random_state=42).index
sample_true = y_test.loc[sample_indexes]

plt.figure(figsize=(16, 10))

palette = sns.color_palette("hls", len(predictions_dict))
model_colors = dict(zip(predictions_dict.keys(), palette))

for model_name, preds in predictions_dict.items():
    # Predictions are positional arrays; re-attach the test index so the
    # sampled rows can be selected by label.
    sample_pred = pd.Series(preds, index=y_test.index).loc[sample_indexes]
    correct = (sample_pred == sample_true)

    df_plot = pd.DataFrame({
        'True Label': sample_true.values,
        'Predicted Label': sample_pred.values,
        'Correct': correct.map({True: "Correct", False: "Wrong"}),
        'Model': model_name
    })

    point_columns = zip(df_plot['True Label'], df_plot['Predicted Label'], df_plot['Correct'])
    for true_label, pred_label, outcome in point_columns:
        # No per-point legend label: the legend below is built from explicit handles.
        plt.scatter(
            true_label,
            pred_label,
            color=model_colors[model_name],
            alpha=0.8,
            marker='o' if outcome == "Correct" else 'X',
            s=100,
        )
        plt.text(true_label, pred_label, f"{pred_label}", fontsize=8, ha='left', va='bottom')

# One legend entry per model, using its colour.
handles = [plt.Line2D([0], [0], marker='o', color='w', label=name, markerfacecolor=color, markersize=10) for name, color in model_colors.items()]
plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left', title="Models")

plt.title("Model-wise Predictions with Labels")
plt.xlabel("True Labels")
plt.ylabel("Predicted Labels")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
