Imports and Setup

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np

#importing models
from models.dt_model import dt_model
from models.rf_model import rf_model
from models.extra_trees_model import extra_trees_model
from models.gb_model import gb_model
from models.knn_model import knn_model
from models.lr_model import lr_model
from models.lr_l1_model import lr_l1_model
from models.lr_l2_model import lr_l2_model
from models.lr_enet_model import lr_enet_model
from models.gnb_model import gnb_model
from models.lda_model import lda_model
from models.svm_linear_model import svm_linear_model
from models.svm_non_linear_model import svm_non_linear_model
from models.mlp_model import mlp_model
from models.adaboost_model import adaboost_model
from models.xgb_model import xgb_model
from models.lgbm_model import lgbm_model

# Experiment configuration.
# `seed` is the base random seed; the tuning loop adds the run index to it so
# every run gets a different but reproducible train/test split.
seed = 123_456
# Number of repeated split/tune runs.
runs = 1  # 100 for final experiments

Loading Dataset

In [2]:
# Read the wine table from disk; the "Wine" column is the class label and all
# remaining columns are used as features.
# NOTE(review): the rendered preview shows a leading "Unnamed: 0" column. If the
# CSV really stores a row-index column, it ends up in X as a feature — confirm
# against the file (and consider index_col=0 if so).
df = pd.read_csv("./data/wine.csv")  # dummy data
y = df["Wine"]
X = df.drop("Wine", axis="columns")
df.head()

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


Prepping and Initializing Models

In [3]:
# encoding labels
# Encode the class labels as integers; `le` is kept so the original labels can
# be recovered with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(y)

# Registry of model wrappers to tune, keyed by display name.
# NOTE(review): the key reads "LBGM" (probably meant "LGBM"); left unchanged
# because other cells may look results up by this exact key.
models = {"LBGM": lgbm_model()}

# One (runs x 5) zero-filled array per model to hold evaluation results.
# NOTE(review): the meaning of the 5 columns is not visible here — presumably
# five metrics per run; confirm against the code that fills this in.
model_results = {key: np.zeros(shape=(runs, 5)) for key in models.keys()}

Tuning Loop

In [4]:
# Repeat the split -> scale -> tune pipeline once per run. The split seed is
# offset by the run index so each run uses a different, reproducible partition.
# NOTE(review): model_results is never written in the visible code — confirm
# whether the metrics from CVPredict should be stored in model_results[name][r].
for r in range(runs):
    # NOTE(review): test_size=0.8 holds out 80% of the rows and trains on only
    # 20%; confirm this is intended and not a swapped train/test fraction.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.8,
        random_state=seed + r,
    )

    # Learn the min-max ranges from the training rows only, then apply the same
    # transform to both splits so no test-set statistics reach the scaler.
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    # Re-wrap the scaled arrays as DataFrames so the feature names are kept.
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

    # Run each registered model wrapper through its tuning workflow; these
    # methods come from the project's `models` package.
    for name, model in models.items():
        params = model.getParams()
        model.initCV(params)
        model.CVTune(X_train_scaled, y_train, X_test_scaled, y_test)
        model.CVResults()
        model.CVPredict(X_test_scaled, y_test)

Best parameters found:  {'learning rate': 0.001, 'max_depth': 1, 'min_data_in_leaf': 5, 'num_iterations': 50, 'num_leaves': 30}
Best score found:  0.9714285714285715
Test set accuracy of the best model:  0.9370629370629371
