### Imports and Setup

In [1]:
#handles getting dataset
#handles training loop 
#split and preprocesses dataset
#sends training requests to model classes
#averages model evaluation metrics returned by model classes


#import sklearn.datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from sklearn.metrics import auc, accuracy_score, precision_score, recall_score, f1_score
#from xgboost import XGBClassifier
import pandas as pd
import numpy as np

#importing models
from models.dt_model import dt_model
from models.rf_model import rf_model
from models.gb_model import gb_model
from models.knn_model import knn_model
from models.lr_model import lr_model

#choose seed for run
seed = 123456
#set number of training runs desired
runs = 10

### Load and Explore the Data

In [2]:
#Loading wine dataset as proof of concept
df = pd.read_csv("./data/wine.csv")
target_column = "Wine"
X = df.drop(columns=[target_column])
y = df[target_column]
df.head()

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### Initializing Models and Defining Result Arrays

In [3]:
#define helper models
dt = dt_model()
rf = rf_model()
gb = gb_model()
knn = knn_model()
lr = lr_model()

#define results arrays
dtEvalResults = np.zeros((runs, 4))
rfEvalResults = np.zeros((runs,4))
gbEvalResults = np.zeros((runs, 4))
knnEvalResults = np.zeros((runs, 4))
lrEvalResults = np.zeros((runs, 4))

### Training Loop

In [4]:
for r in range(runs):
    #first, split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

    #next, preprocess data
    #TODO: HANDLE DIFFERENT SCALING
    #TODO: Variance Threshold

    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    scaler = MinMaxScaler()
    X_train_scaled_standard = scaler.fit_transform(X_train)
    X_test_scaled_standard = scaler.transform(X_test)

    # Training the models and storing the evaluation results
    
    # decision tree
    dt.train(X_train_scaled_standard, y_train)
    dtEvalResults[r] = dt.predict(X_test_scaled_standard, y_test)
    
    # random forest
    rf.train(X_train_scaled_standard, y_train)
    rfEvalResults[r] = rf.predict(X_test_scaled_standard, y_test)
    
    # gradient boosting
    gb.train(X_train_scaled_standard, y_train)
    gbEvalResults[r] = gb.predict(X_test_scaled_standard, y_test)

    # k-nearest neighbors
    knn.train(X_train_scaled_standard, y_train)
    knnEvalResults[r] = knn.predict(X_test_scaled_standard, y_test)
    
    # logistic regression
    lr.train(X_train_scaled_standard, y_train)
    lrEvalResults[r] = lr.predict(X_test_scaled_standard, y_test)


### Computing & Printing Average Metrics Functions

In [5]:
def average_metrics(eval_results):
    avg_accuracy = np.mean(eval_results[:, 0])
    avg_precision = np.mean(eval_results[:, 1])
    avg_recall = np.mean(eval_results[:, 2])
    avg_f1 = np.mean(eval_results[:, 3])
    
    return avg_accuracy, avg_precision, avg_recall, avg_f1


def print_model_results(model_name, avg_accuracy, avg_precision, avg_recall, avg_f1):
    print(f"{model_name} Model:")
    print(f"Average Accuracy: {avg_accuracy}")
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1: {avg_f1}")
    print("\n")

### Displaying Average Metrics

In [6]:

# Computing average metrics
dt_avg_accuracy, dt_avg_precision, dt_avg_recall, dt_avg_f1 = average_metrics(dtEvalResults)
rf_avg_accuracy, rf_avg_precision, rf_avg_recall, rf_avg_f1 = average_metrics(rfEvalResults)
gb_avg_accuracy, gb_avg_precision, gb_avg_recall, gb_avg_f1 = average_metrics(gbEvalResults)
knn_avg_accuracy, knn_avg_precision, knn_avg_recall, knn_avg_f1 = average_metrics(gbEvalResults)
lr_avg_accuracy, lr_avg_precision, lr_avg_recall, lr_avg_f1 = average_metrics(lrEvalResults)

# Printing
print_model_results("Decision Tree", dt_avg_accuracy, dt_avg_precision, dt_avg_recall, dt_avg_f1)
print_model_results("Random Forest", rf_avg_accuracy, rf_avg_precision, rf_avg_recall, rf_avg_f1)
print_model_results("Gradient Boosting", gb_avg_accuracy, gb_avg_precision, gb_avg_recall, gb_avg_f1)
print_model_results("K-Nearest Neighbors", knn_avg_accuracy, knn_avg_precision, knn_avg_recall, knn_avg_f1)
print_model_results("Logistic Regression", lr_avg_accuracy, lr_avg_precision, lr_avg_recall, lr_avg_f1)


Decision Tree Model:
Average Accuracy: 0.8769230769230768
Average Precision: 0.884137827014382
Average Recall: 0.8769230769230768
Average F1: 0.876248298503658


Random Forest Model:
Average Accuracy: 0.9216783216783216
Average Precision: 0.9274112712141742
Average Recall: 0.9216783216783216
Average F1: 0.9210920741474947


Gradient Boosting Model:
Average Accuracy: 0.9013986013986013
Average Precision: 0.9132060612876325
Average Recall: 0.9013986013986013
Average F1: 0.9018580777160418


K-Nearest Neighbors Model:
Average Accuracy: 0.9013986013986013
Average Precision: 0.9132060612876325
Average Recall: 0.9013986013986013
Average F1: 0.9018580777160418


Logistic Regression Model:
Average Accuracy: 0.9573426573426573
Average Precision: 0.9611469923506757
Average Recall: 0.9573426573426573
Average F1: 0.9569922103692304


