### Imports and Setup

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

#importing models
from models.dt_model import dt_model
from models.rf_model import rf_model
from models.extra_trees_model import extra_trees_model
from models.gb_model import gb_model
from models.knn_model import knn_model
from models.lr_model import lr_model
from models.lr_l1_model import lr_l1_model
from models.lr_l2_model import lr_l2_model
from models.lr_enet_model import lr_enet_model
from models.gnb_model import gnb_model
from models.lda_model import lda_model
from models.svm_linear_model import svm_linear_model
from models.svm_non_linear_model import svm_non_linear_model
from models.mlp_model import mlp_model
from models.adaboost_model import adaboost_model
from models.xgb_model import xgb_model
from models.lgbm_model import lgbm_model

# Experiment configuration.
runs = 10       # number of repeated train/test splits (100 for final experiments)
seed = 123456   # base random seed; the training loop offsets it per run

### Loading and Exploring the Data

In [2]:
# Read the wine dataset (dummy data) and split it into the target column
# ("Wine") and the remaining feature columns.
df = pd.read_csv("./data/wine.csv")
y = df["Wine"]
X = df.drop("Wine", axis=1)
# Last expression of the cell: preview of the first rows.
df.head()

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### Preparing Data and Initializing Models

In [3]:
# Encode the class labels as integers (LabelEncoder maps them to 0..n_classes-1).
le = LabelEncoder()
y = le.fit_transform(y)

# One instance of every candidate classifier wrapper. The keys are the display
# names: they index the results dictionary below and appear verbatim in the
# printed report and in the saved best-model file.
models = {
    "DT": dt_model(),
    "RF": rf_model(),
    "ExtraTrees": extra_trees_model(),
    "GB": gb_model(),
    "KNN": knn_model(),
    "LR": lr_model(),
    "LR-L1": lr_l1_model(),
    "LR-L2": lr_l2_model(),
    "LR-ENet": lr_enet_model(),
    "GNB": gnb_model(),
    "LDA": lda_model(),
    "SVM-Linear": svm_linear_model(),
    "SVM-NonLinear": svm_non_linear_model(),
    "MLP": mlp_model(),
    "AdaBoost": adaboost_model(),
    "XGB": xgb_model(),
    "LGBM": lgbm_model(),
}

# Per-model evaluation results: one row per run, five metric columns.
# The reporting code reads the columns as
# [accuracy, precision, recall, F1, AUC].
model_results = {name: np.zeros((runs, 5)) for name in models}

### Training Loop

In [4]:
# Repeat the whole split / scale / train / evaluate cycle `runs` times.
for r in range(runs):
    # Random hold-out split; the random_state changes with every run.
    # NOTE(review): test_size=0.8 keeps only 20% of the rows for training --
    # confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.8,
        random_state=seed + r,
    )

    # Fit the min-max scaler on the training rows only, then apply it to both
    # partitions; wrap the arrays back into DataFrames with the feature names.
    scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

    # Train and evaluate every model on this split.
    # NOTE(review): the same model objects are re-trained on every run;
    # presumably train() refits from scratch -- confirm in the model wrappers.
    for name in models:
        model = models[name]
        model.train(X_train_scaled, y_train)
        # predict() is expected to return the five metrics stored in row r.
        model_results[name][r, :] = model.predict(X_test_scaled, y_test)

### Computing and Printing Average Metrics

In [5]:
# Average every metric column over all runs, separately for each model.
average_results = {
    name: results.mean(axis=0) for name, results in model_results.items()
}

# Report the averaged metrics; columns are stored as
# [accuracy, precision, recall, F1, AUC], AUC is printed first.
for name, metrics in average_results.items():
    accuracy, precision, recall, f1, auc = metrics
    print(f"\n{name} Model:")
    print(f"  AUC: {auc:.5f}, Accuracy: {accuracy:.5f}, "
          f"Precision: {precision:.5f}, Recall: {recall:.5f}, F1: {f1:.5f}")


DT Model:
  AUC: 0.88950, Accuracy: 0.85315, Precision: 0.86224, Recall: 0.85315, F1: 0.85163

RF Model:
  AUC: 0.98757, Accuracy: 0.91818, Precision: 0.92311, Recall: 0.91818, F1: 0.91793

ExtraTrees Model:
  AUC: 0.99766, Accuracy: 0.96154, Precision: 0.96435, Recall: 0.96154, F1: 0.96146

GB Model:
  AUC: 0.97221, Accuracy: 0.86573, Precision: 0.87985, Recall: 0.86573, F1: 0.86519

KNN Model:
  AUC: 0.99446, Accuracy: 0.94825, Precision: 0.95179, Recall: 0.94825, F1: 0.94791

LR Model:
  AUC: 0.99689, Accuracy: 0.95734, Precision: 0.96028, Recall: 0.95734, F1: 0.95741

LR-L1 Model:
  AUC: 0.98972, Accuracy: 0.92238, Precision: 0.92742, Recall: 0.92238, F1: 0.92209

LR-L2 Model:
  AUC: 0.99689, Accuracy: 0.95734, Precision: 0.96028, Recall: 0.95734, F1: 0.95741

LR-ENet Model:
  AUC: 0.99654, Accuracy: 0.95874, Precision: 0.96087, Recall: 0.95874, F1: 0.95875

GNB Model:
  AUC: 0.98537, Accuracy: 0.93007, Precision: 0.93454, Recall: 0.93007, F1: 0.92997

LDA Model:
  AUC: 0.99250, A

### Best Model Selection and Saving

In [6]:
# Pick the best model: the averaged metric vector is reversed so the comparison
# key starts with AUC (last column) and falls back to F1, recall, precision and
# accuracy, in that order, to break ties.
best_model = max(
    average_results,
    key=lambda name: tuple(average_results[name][::-1]),
)
best_metrics = average_results[best_model]
acc, prec, rec, f1, auc = best_metrics

print(f"\nBest Model: {best_model}")
print(f"  AUC: {auc:.5f}, Accuracy: {acc:.5f}, "
      f"Precision: {prec:.5f}, Recall: {rec:.5f}, F1: {f1:.5f}")

# Persist the winner and its metrics (one "name: value" line each) so the
# feature-engineering step can read them.
with open("./data/best_model.txt", "w") as f:
    f.write(f"Best Model: {best_model}\n")
    f.write(f"AUC: {auc:.5f}\nAccuracy: {acc:.5f}\n"
            f"Precision: {prec:.5f}\nRecall: {rec:.5f}\nF1: {f1:.5f}\n")


Best Model: ExtraTrees
  AUC: 0.99766, Accuracy: 0.96154, Precision: 0.96435, Recall: 0.96154, F1: 0.96146
