### EDA

In [7]:
import numpy as np 
import pandas as pd
import mlflow
from mlflow.models import infer_signature
import dagshub
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
import pickle
from imblearn.over_sampling import SMOTE,RandomOverSampler
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import warnings

warnings.warn('ignore')





In [8]:
algorithms = {
    'RandomForestClassifier':RandomForestClassifier(),
    'ExtraTreesClassifier':ExtraTreesClassifier(),
    'AdaBoostClassifier':AdaBoostClassifier(),
    'SVC':SVC(),
    'LogisticRegression':LogisticRegression(),
    'GaussianNB':GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'MLPClassifier':MLPClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'lightgbm' : lgb.LGBMClassifier(),
    'XGBClassifier':XGBClassifier()
}

In [9]:
df = pd.read_csv('data\Churn_Modelling.csv')

  df = pd.read_csv('data\Churn_Modelling.csv')


In [10]:
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [11]:
df = df.drop(columns=['RowNumber','CustomerId','Surname'])

In [12]:
num_features = df.select_dtypes(include='int').columns.to_list()

In [13]:
num_features = num_features[:-1]

In [14]:
cat_features = df.select_dtypes(include='object').columns.to_list()

In [15]:
cat_features

['Geography', 'Gender']

In [16]:
cat_pipeline = Pipeline([
    ('Imputer',SimpleImputer(strategy='most_frequent')),
    ('OneHotEncoder',OneHotEncoder()),
    ('Standarize',StandardScaler(with_mean=False))
])

In [17]:
num_pipeline = Pipeline([
    ('Imputer',SimpleImputer(strategy='mean')),
    ('Standarize',StandardScaler(with_mean=False))
])

In [18]:
preprocessed = ColumnTransformer([
    ('cat_preprocessed',cat_pipeline,cat_features),
    ('num_preprocessed',num_pipeline,num_features)
])

In [1]:
with open('artifacts/preprocess.pkl','wb') as f:
    pickle.dump(preprocessed,f)

FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/preprocess.pkl'

In [19]:
X = df.drop(columns=['Exited'])
y = df['Exited']

In [20]:
X_transformed = preprocessed.fit_transform(X)

In [21]:
ros = SMOTE() # SMOTE gives me worse performance for all alg than RandomOverSampling
X_transformed,y = ros.fit_resample(X_transformed,y) 

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X_transformed,y,test_size=0.2)

In [23]:
X_train.shape,y_train.shape,X_test.shape

((12740, 11), (12740,), (3186, 11))

In [24]:
# metrics
def metrics(y_test,y_pred):
    acc = accuracy_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    return acc,recall, precision,f1
    


In [25]:
dagshub.init(repo_owner='Jakub_Jedrych', repo_name='ANN_Project', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/Jakub_Jedrych/ANN_Project.mlflow')
mlflow.set_experiment("Sklearn models tracking")

MlflowException: API request to https://dagshub.com/Jakub_Jedrych/ANN_Project.mlflow/api/2.0/mlflow/experiments/get failed with exception HTTPSConnectionPool(host='dagshub.com', port=443): Max retries exceeded with url: /Jakub_Jedrych/ANN_Project.mlflow/api/2.0/mlflow/experiments/get?experiment_id=None (Caused by ResponseError('too many 500 error responses'))

In [None]:

model_list = []
f1_list =[]
acc_list = []
prec_list = []
recall_list = []

for i in range(len(list(algorithms))):
    with mlflow.start_run():

        model =list(algorithms.values())[i]
        model.fit(X_train,y_train)

        

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        met_train_acc,met_train_recall,met_train_precision,met_train_f1 = metrics(y_train,y_pred_train)
        met_test_acc,met_test_recall,met_test_precision,met_test_f1 = metrics(y_test,y_pred_test)


        print(list(algorithms.keys())[i])
        model_list.append(list(algorithms.keys())[i])
        
        print('Model performance for Training set')
        print("- F1 - score: {:.4f}".format(met_train_f1))
        print("- Precision: {:.4f}".format(met_train_precision))
        print("- Recall: {:.4f}".format(met_train_recall))
        print("- Accuracy score: {:.4f}".format(met_train_acc))

        print('----------------------------------')
        
        print('Model performance for Test set')
        print("- F1 - score: {:.4f}".format(met_test_f1))
        print("- Precision:  {:.4f}".format(met_test_precision))
        print("- Recall: {:.4f}".format(met_test_recall))
        print("- Accuracy score: {:.4f}".format(met_test_acc))
        f1_list.append(met_test_f1)
        recall_list.append(met_test_recall)
        prec_list.append(met_test_precision)
        acc_list.append(met_test_acc)

        
        print('='*35)
        print('\n')

NameError: name 'algorithms' is not defined

In [222]:
# cross_validate(algorithms['MLPClassifier'],X_transformed,y,cv=10,scoring=('f1',"accuracy"))



{'fit_time': array([6.30046129, 5.79396224, 4.02333045, 4.65869594, 4.84226608,
        3.94572234, 4.343647  , 2.91571093, 4.54523039, 3.56466866]),
 'score_time': array([0.00562358, 0.00625753, 0.00365496, 0.00203991, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]),
 'test_f1': array([0.76776119, 0.77877238, 0.78867676, 0.79098874, 0.77197621,
        0.80771624, 0.79263804, 0.8014661 , 0.78277886, 0.77615726]),
 'test_accuracy': array([0.75580665, 0.78279975, 0.79849341, 0.79033271, 0.7834275 ,
        0.80602637, 0.78768844, 0.79585427, 0.79082915, 0.77826633])}

In [223]:
df_algorithms = pd.DataFrame(
    data={
        'Algorithm':model_list,
        "F1-score":f1_list,
        "Recal":recall_list,
        "Precision":prec_list,
        "Accuracy":acc_list
    }).sort_values(by='F1-score',ascending=False)

In [224]:
df_algorithms

Unnamed: 0,Algorithm,F1-score,Recal,Precision,Accuracy
9,lightgbm,0.89597,0.874924,0.918054,0.89548
10,XGBClassifier,0.888889,0.863941,0.91532,0.888889
1,ExtraTreesClassifier,0.887004,0.893228,0.880866,0.882925
0,RandomForestClassifier,0.880952,0.880415,0.881491,0.877589
6,KNeighborsClassifier,0.83794,0.878585,0.80089,0.825173
8,DecisionTreeClassifier,0.829865,0.827334,0.832413,0.825487
7,MLPClassifier,0.800123,0.794997,0.805315,0.795669
3,SVC,0.785625,0.766931,0.805253,0.784683
2,AdaBoostClassifier,0.782395,0.775473,0.789441,0.778092
5,GaussianNB,0.72247,0.692495,0.755156,0.726303


In [228]:
df.Exited.value_counts(),y.value_counts()

(Exited
 0    7963
 1    2037
 Name: count, dtype: int64,
 Exited
 1    7963
 0    7963
 Name: count, dtype: int64)