In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [33]:
models = {
    "LogisticRegression":LogisticRegression(),
    "KNeighborsClassifier":KNeighborsClassifier(),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "SVC":SVC(),
    "GaussianNB":GaussianNB(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "MLPClassifier":MLPClassifier(),
    "XGBClassifier":XGBClassifier(),
    "CatBoostClassifier":CatBoostClassifier(),
    "LGBMClassifier":LGBMClassifier()  
}

In [34]:
import pandas as pd
train_df = pd.read_csv("../artifacts/transformed data/transformed.csv")
train_df.head()

Unnamed: 0,contact_0,contact_1,marital_0,marital_1,age,job,education,default,balance,housing,loan,day,month,duration,campaign,previous,poutcome,y
0,0.0,1.0,1.0,0.0,0.769231,0.105979,3,0,0.760385,1,0,0.688967,0.866025,0.411672,0.693147,0.0,-1,0
1,0.0,1.0,0.0,1.0,0.5,0.080274,2,0,0.361216,1,0,0.688967,0.866025,0.23817,0.693147,0.0,-1,0
2,0.0,1.0,1.0,0.0,0.288462,0.047316,2,0,0.356118,1,1,0.688967,0.866025,0.119874,0.693147,0.0,-1,0
3,0.0,1.0,1.0,0.0,0.557692,0.040846,-1,0,0.640106,1,0,0.688967,0.866025,0.14511,0.693147,0.0,-1,0
4,0.0,1.0,0.0,1.0,0.288462,0.100917,-1,0,0.355929,0,0,0.688967,0.866025,0.312303,0.693147,0.0,-1,0


In [35]:
from imblearn.over_sampling import SMOTE

x_train,y_train = train_df.iloc[:,:-1],train_df.iloc[:,-1]
smote = SMOTE(sampling_strategy='auto')
x,y = smote.fit_resample(x_train, y_train)

In [17]:
from sklearn.model_selection import KFold,cross_val_score

k_folds = KFold(n_splits=5)
cross_val_scores = {"model":[],"f1 score":[]}

for name,instance in models.items():
    print(f"== {name} training started ==")
    
    f1 = cross_val_score(instance,x,y,cv=k_folds,scoring='f1').mean()
    cross_val_scores["model"].append(name)
    cross_val_scores["f1 score"].append(f1)
    
    print(f"{name} training completed...")
    
cross_val_score_df = pd.DataFrame(data = cross_val_scores)

== LogisticRegression training started ==


LogisticRegression training completed...
== KNeighborsClassifier training started ==
KNeighborsClassifier training completed...
== DecisionTreeClassifier training started ==
DecisionTreeClassifier training completed...
== RandomForestClassifier training started ==
RandomForestClassifier training completed...
== SVC training started ==
SVC training completed...
== GaussianNB training started ==
GaussianNB training completed...
== GradientBoostingClassifier training started ==
GradientBoostingClassifier training completed...
== MLPClassifier training started ==




MLPClassifier training completed...
== XGBClassifier training started ==
XGBClassifier training completed...
== CatBoostClassifier training started ==
Learning rate set to 0.049155
0:	learn: 0.6372731	total: 16.9ms	remaining: 16.9s
1:	learn: 0.5932533	total: 36.2ms	remaining: 18.1s
2:	learn: 0.5552238	total: 51.9ms	remaining: 17.2s
3:	learn: 0.5192622	total: 68.1ms	remaining: 16.9s
4:	learn: 0.4897280	total: 83.4ms	remaining: 16.6s
5:	learn: 0.4537161	total: 99.2ms	remaining: 16.4s
6:	learn: 0.4328358	total: 115ms	remaining: 16.4s
7:	learn: 0.4155654	total: 133ms	remaining: 16.4s
8:	learn: 0.4013354	total: 149ms	remaining: 16.4s
9:	learn: 0.3891406	total: 166ms	remaining: 16.4s
10:	learn: 0.3718006	total: 181ms	remaining: 16.3s
11:	learn: 0.3614674	total: 197ms	remaining: 16.2s
12:	learn: 0.3507410	total: 213ms	remaining: 16.2s
13:	learn: 0.3413093	total: 237ms	remaining: 16.7s
14:	learn: 0.3331608	total: 259ms	remaining: 17s
15:	learn: 0.3257660	total: 276ms	remaining: 17s
16:	learn: 

In [18]:
cross_val_score_df.sort_values(by='f1 score',ascending=False)

Unnamed: 0,model,f1 score
3,RandomForestClassifier,0.600635
7,MLPClassifier,0.598799
4,SVC,0.584408
5,GaussianNB,0.5834
0,LogisticRegression,0.581702
6,GradientBoostingClassifier,0.578664
1,KNeighborsClassifier,0.573533
2,DecisionTreeClassifier,0.508322
8,XGBClassifier,0.504295
10,LGBMClassifier,0.484957


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,classification_report
from pickle import dump

f1_scores = {"model":[],"f1 score":[]}
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y)

for name,instance in models.items():
    print(f"== {name} training started ==")
    
    instance.fit(x_train,y_train)
    predictions = instance.predict(x_test)
    f1 = f1_score(y_test,predictions)
    print(classification_report(y_test,predictions))
    f1_scores["model"].append(name)
    f1_scores["f1 score"].append(f1)
    dump(instance,open(f"../models/base models/{name}.pkl","wb"))
    
    print(f"{name} training completed...")

f1_scores_df = pd.DataFrame(data=f1_scores)

== LogisticRegression training started ==
              precision    recall  f1-score   support

           0       0.83      0.80      0.81      6336
           1       0.80      0.83      0.82      6336

    accuracy                           0.81     12672
   macro avg       0.81      0.81      0.81     12672
weighted avg       0.81      0.81      0.81     12672

LogisticRegression training completed...
== KNeighborsClassifier training started ==
              precision    recall  f1-score   support

           0       0.99      0.86      0.92      6336
           1       0.87      0.99      0.93      6336

    accuracy                           0.92     12672
   macro avg       0.93      0.92      0.92     12672
weighted avg       0.93      0.92      0.92     12672

KNeighborsClassifier training completed...
== DecisionTreeClassifier training started ==
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      6336
           1       0.



              precision    recall  f1-score   support

           0       0.92      0.86      0.89      6336
           1       0.87      0.93      0.90      6336

    accuracy                           0.89     12672
   macro avg       0.89      0.89      0.89     12672
weighted avg       0.89      0.89      0.89     12672

MLPClassifier training completed...
== XGBClassifier training started ==
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      6336
           1       0.97      0.95      0.96      6336

    accuracy                           0.96     12672
   macro avg       0.96      0.96      0.96     12672
weighted avg       0.96      0.96      0.96     12672

XGBClassifier training completed...
== CatBoostClassifier training started ==
Learning rate set to 0.05507
0:	learn: 0.6541523	total: 21.3ms	remaining: 21.2s
1:	learn: 0.6220485	total: 38.1ms	remaining: 19s
2:	learn: 0.5858785	total: 56.3ms	remaining: 18.7s
3:	learn: 0.562

In [37]:
f1_scores_df.sort_values(by='f1 score',ascending=False)

Unnamed: 0,model,f1 score
9,CatBoostClassifier,0.961742
3,RandomForestClassifier,0.960945
8,XGBClassifier,0.960217
10,LGBMClassifier,0.957043
1,KNeighborsClassifier,0.92765
6,GradientBoostingClassifier,0.924676
2,DecisionTreeClassifier,0.920044
7,MLPClassifier,0.896683
4,SVC,0.866956
0,LogisticRegression,0.817553


In [27]:
train_df.head()

Unnamed: 0,age,job,education,balance,housing,loan,day,month,duration,campaign,previous,poutcome,y,marital_0,marital_1,contact_0,contact_1
0,0.769231,0.105979,3,0.760385,1,0,0.688967,0.866025,0.411672,0.693147,0.0,-1,0,1.0,0.0,0.0,1.0
1,0.5,0.080274,2,0.361216,1,0,0.688967,0.866025,0.23817,0.693147,0.0,-1,0,0.0,1.0,0.0,1.0
2,0.288462,0.047316,2,0.356118,1,1,0.688967,0.866025,0.119874,0.693147,0.0,-1,0,1.0,0.0,0.0,1.0
3,0.557692,0.040846,-1,0.640106,1,0,0.688967,0.866025,0.14511,0.693147,0.0,-1,0,1.0,0.0,0.0,1.0
4,0.288462,0.100917,-1,0.355929,0,0,0.688967,0.866025,0.312303,0.693147,0.0,-1,0,0.0,1.0,0.0,1.0


In [10]:
top_3_models = f1_scores_df.sort_values(by='f1 score',ascending=False).head(3)["model"].values

In [11]:
import numpy as np

model_parameters = {

    'LogisticRegression':{
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'penalty' : ['l2'],
        'C' : [100, 10, 1.0, 0.1, 0.01]
    },

    'KNeighborsClassifier':{
        'n_neighbors' : [5,7,9,11,13,15],
        'weights' : ['uniform','distance'],
        'metric' : ['minkowski','euclidean','manhattan']
    },

    'SVC':{
        'C': [0.1, 1, 10, 100, 1000],  
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
        'kernel': ['rbf']
    },

    'GaussianNB':{
        'var_smoothing': np.logspace(0,-9, num=100)
    },

    'DecisionTreeClassifier':{
        'max_depth': [2, 3, 5, 10, 20],
        'min_samples_leaf': [5, 10, 20, 50, 100],
        'criterion': ["gini", "entropy"]
    },

    'RandomForestClassifier':{
        'n_estimators':[10,50,100,130],
        'criterion':['gini','entropy'],
        'max_depth':range(2,4,1),
        'max_features':['auto','log2']
    },

    'MLPClassifier':{
        'hidden_layer_sizes': [(10,30,10),(20,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant','adaptive']
    },

    'GradientBoostingClassifier':{
        'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        'min_samples_split':range(400,1200,100), 
        'min_samples_leaf':range(30,71,10),
        'max_features':range(7,20,2)
    },

    'XGBClassifier':{
        'learning_rate':[0.5,0.1,0.1,0.01],
        'max_depth':[3,5,10,20],
        'n_estimators':[10,50,100,200]

    },

    'CatBoostClassifier':{
        'learning_rate': np.linspace(0,0.2,5),
        'max_depth': [3,4,5],
        'n_estimators':[100, 200, 300]
    },

    'LGBMClassifier' : {
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05       
    } 
}

In [None]:
from sklearn.model_selection import GridSearchCV
from pickle import dump

fined_tuned_f1_scores = {"model":[],"f1 score":[]}
x,y = train_df.iloc[:,:-1],train_df.iloc[:,-1]

for name,instance in models.items():
    if name not in top_3_models:
        continue
    
    print(f"== {name} training started ==")
    params = model_parameters[name]
    cv_instance = GridSearchCV(estimator=instance,param_grid=params,cv=5,return_train_score=False,scoring='f1')
    cv_instance.fit(x,y)
    dump(cv_instance,open(f"../models/fine-tuned models/{name}.pkl","wb"))
    fined_tuned_f1_scores["model"].append(name)
    fined_tuned_f1_scores["f1 score"].append(cv_instance.best_score_)
    print(f"{name} training completed...")
    
fined_tuned_f1_scores_df = pd.DataFrame(data=fined_tuned_f1_scores)    

== GradientBoostingClassifier training started ==


In [None]:
from pickle import load

best_model_name = fined_tuned_f1_scores_df.sort_values(by="f1 score").head(1)
best_model = load(open(f"../models/fine-tuned models/{best_model_name}.pkl","rb"))
dump(best_model,open(f"../models/best model/{best_model_name}.pkl","wb"))