### Imports and Setup

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# importing models
from utils.models import model_mapping

seed = 123456
#runs = 10 # for ease when coding
runs = 100 # for final experiments

### Loading and Exploring the Data

In [2]:
df = pd.read_csv("./data/wine.csv")  # dummy data
X = df.drop(columns=["Wine"])
y = df["Wine"]
df.head()

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### Preparing Data and Initializing Models

In [3]:
# encoding labels
le = LabelEncoder()
y = le.fit_transform(y)

# initializing models from mapping
models = {name: model_class() for name, model_class in model_mapping.items()}

# dictionary to store model evaluation results
model_results = {name: np.zeros((runs, 5)) for name in models}

### Training Loop

In [4]:
for r in range(runs):
    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=seed + r)

    # scaling
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)
    
    # training & evaluating each model
    for name, current_model in models.items():
        current_model.train(X_train_scaled, y_train)
        model_results[name][r, :] = current_model.predict(X_test_scaled, y_test)

### Computing & Printing Average Metrics Functions

In [5]:
# computing average metrics
average_results = {
    name: np.mean(results, axis=0) for name, results in model_results.items()
}

# printing results
for name, metrics in average_results.items():
    print(f"\n{name} Model:")
    print(f"  AUC: {metrics[0]:.5f}, Accuracy: {metrics[1]:.5f}, "
          f"Precision: {metrics[2]:.5f}, Recall: {metrics[3]:.5f}, F1: {metrics[4]:.5f}")


Decision Tree Model:
  AUC: 0.88876, Accuracy: 0.85343, Precision: 0.86424, Recall: 0.85343, F1: 0.85193

Random Forest Model:
  AUC: 0.98658, Accuracy: 0.91476, Precision: 0.92148, Recall: 0.91476, F1: 0.91415

Gradient Boosting Model:
  AUC: 0.97555, Accuracy: 0.87524, Precision: 0.89112, Recall: 0.87524, F1: 0.87435

Extreme Gradient Boosting Model:
  AUC: 0.98560, Accuracy: 0.90510, Precision: 0.91306, Recall: 0.90510, F1: 0.90437

Light Gradient Boosting Model Model:
  AUC: 0.99014, Accuracy: 0.92727, Precision: 0.93110, Recall: 0.92727, F1: 0.92687

Extremely Randomized Trees Model:
  AUC: 0.99783, Accuracy: 0.96035, Precision: 0.96332, Recall: 0.96035, F1: 0.96017

Adaptive Boosting Model:
  AUC: 0.96114, Accuracy: 0.88175, Precision: 0.89161, Recall: 0.88175, F1: 0.88102

Logistic Regression Model:
  AUC: 0.99518, Accuracy: 0.95063, Precision: 0.95542, Recall: 0.95063, F1: 0.95031

Logistic Regression - Lasso (L1) Model:
  AUC: 0.98871, Accuracy: 0.91594, Precision: 0.92483, R

### Best Model Selection and Saving

In [6]:
# best model selection with priority on AUC while using other metrics as tiebreakers
best_model = max(average_results, key=lambda name: tuple(average_results[name][::-1]))  # Reverse to prioritize AUC
best_metrics = average_results[best_model]


print(f"\nBest Model: {best_model}")
print(f"  AUC: {best_metrics[4]:.5f}, Accuracy: {best_metrics[0]:.5f}, "
      f"Precision: {best_metrics[1]:.5f}, Recall: {best_metrics[2]:.5f}, F1: {best_metrics[3]:.5f}")

# saving the results for feature engineering
with open("./data/best_model.txt", "w") as f:
    f.write(f"Best Model: {best_model}\n")
    f.write(f"AUC: {best_metrics[0]:.5f}\nAccuracy: {best_metrics[1]:.5f}\n"
            f"Precision: {best_metrics[2]:.5f}\nRecall: {best_metrics[3]:.5f}\nF1: {best_metrics[4]:.5f}\n")


Best Model: Non-linear SVM
  AUC: 0.96360, Accuracy: 0.99796, Precision: 0.96364, Recall: 0.96602, F1: 0.96364
