In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [35]:
models = {
    "LogisticRegression":LogisticRegression(),
    "KNeighborsClassifier":KNeighborsClassifier(),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "SVC":SVC(),
    "GaussianNB":GaussianNB(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "MLPClassifier":MLPClassifier(),
    "XGBClassifier":XGBClassifier(),
    "CatBoostClassifier":CatBoostClassifier(),
    "LGBMClassifier":LGBMClassifier()  
}

In [36]:
import pandas as pd
train_df = pd.read_csv("../artifacts/label encoded data/label encoded.csv")
train_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [None]:
from sklearn.model_selection import KFold,cross_val_score

k_folds = KFold(n_splits=5)
cross_val_scores = {"model":[],"f1 score":[]}
x,y = train_df.iloc[:,:-1],train_df.iloc[:,-1]

for name,instance in models.items():
    print(f"== {name} training started ==")
    
    f1 = cross_val_score(instance,x,y,cv=k_folds,scoring='f1').mean()
    cross_val_scores["model"].append(name)
    cross_val_scores["f1 score"].append(f1)
    
    print(f"{name} training completed...")

cross_val_score_df = pd.DataFrame(data = cross_val_scores)

In [41]:
cross_val_score_df.sort_values(by='f1 score',ascending=False)

Unnamed: 0,model,f1 score
6,GradientBoostingClassifier,0.480646
9,CatBoostClassifier,0.445422
10,LGBMClassifier,0.445162
8,XGBClassifier,0.40996
3,RandomForestClassifier,0.378548
5,GaussianNB,0.334446
0,LogisticRegression,0.32515
1,KNeighborsClassifier,0.306586
2,DecisionTreeClassifier,0.23668
7,MLPClassifier,0.179666


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from pickle import dump

f1_scores = {"model":[],"f1 score":[]}
x,y = train_df.iloc[:,:-1],train_df.iloc[:,-1]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y)

for name,instance in models.items():
    print(f"== {name} training started ==")
    
    instance.fit(x_train,y_train)
    predictions = instance.predict(x_test)
    f1 = f1_score(predictions,y_test)
    f1_scores["model"].append(name)
    f1_scores["f1 score"].append(f1)
    dump(instance,open(f"../models/base models/{name}.pkl","wb"))
    
    print(f"{name} training completed...")

f1_scores_df = pd.DataFrame(data=f1_scores)

In [46]:
f1_scores_df.sort_values(by='f1 score',ascending=False)

Unnamed: 0,model,f1 score
8,XGBClassifier,0.559072
9,CatBoostClassifier,0.544565
10,LGBMClassifier,0.542763
3,RandomForestClassifier,0.495072
6,GradientBoostingClassifier,0.478469
2,DecisionTreeClassifier,0.460674
5,GaussianNB,0.396107
1,KNeighborsClassifier,0.331485
7,MLPClassifier,0.317744
0,LogisticRegression,0.259098


In [None]:
top_3_models = f1_scores_df.sort_values(by='f1 score',ascending=False).head(3)["model"].values

In [47]:
import numpy as np

model_parameters = {

    'LogisticRegression':{
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'penalty' : ['l2'],
        'C' : [100, 10, 1.0, 0.1, 0.01]
    },

    'KNeighborsClassifier':{
        'n_neighbors' : [5,7,9,11,13,15],
        'weights' : ['uniform','distance'],
        'metric' : ['minkowski','euclidean','manhattan']
    },

    'SVC':{
        'C': [0.1, 1, 10, 100, 1000],  
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
        'kernel': ['rbf']
    },

    'GaussianNB':{
        'var_smoothing': np.logspace(0,-9, num=100)
    },

    'DecisionTreeClassifier':{
        'max_depth': [2, 3, 5, 10, 20],
        'min_samples_leaf': [5, 10, 20, 50, 100],
        'criterion': ["gini", "entropy"]
    },

    'RandomForestClassifier':{
        'n_estimators':[10,50,100,130],
        'criterion':['gini','entropy'],
        'max_depth':range(2,4,1),
        'max_features':['auto','log2']
    },

    'MLPClassifier':{
        'hidden_layer_sizes': [(10,30,10),(20,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant','adaptive']
    },

    'GradientBoostingClassifier':{
        'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        'min_samples_split':range(400,1200,100), 
        'min_samples_leaf':range(30,71,10),
        'max_features':range(7,20,2)
    },

    'XGBClassifier':{
        'learning_rate':[0.5,0.1,0.1,0.01],
        'max_depth':[3,5,10,20],
        'n_estimators':[10,50,100,200]

    },

    'CatBoostClassifier':{
        'learning_rate': np.linspace(0,0.2,5),
        'max_depth': [3,4,5],
        'n_estimators':[100, 200, 300]
    },

    'LGBMClassifier' : {
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05       
    } 
}

In [None]:
from sklearn.model_selection import GridSearchCV
from pickle import dump

fined_tuned_f1_scores = {"model":[],"f1 score":[]}
x,y = train_df.iloc[:,:-1],train_df.iloc[:,-1]

for name,instance in models.items():
    if name not in top_3_models:
        continue
    
    print(f"== {name} training started ==")
    params = model_parameters[name]
    cv_instance = GridSearchCV(estimator=instance,param_grid=params,cv=5,return_train_score=False,scoring='f1')
    cv_instance.fit(x,y)
    dump(cv_instance,open(f"../models/fine-tuned models/{name}.pkl","wb"))
    fined_tuned_f1_scores["model"].append(name)
    fined_tuned_f1_scores["f1 score"].append(cv_instance.best_score_)
    print(f"{name} training completed...")
    
fined_tuned_f1_scores_df = pd.DataFrame(data=fined_tuned_f1_scores)    

In [None]:
from pickle import load

best_model_name = fined_tuned_f1_scores_df.sort_values(by="f1 score").head(1)
best_model = load(open(f"../models/fine-tuned models/{best_model_name}.pkl","rb"))
dump(best_model,open(f"../models/best model/{best_model_name}.pkl","wb"))