In [9]:
# import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score, roc_auc_score, roc_curve, auc,precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.models import load_model
from scikeras.wrappers import KerasClassifier
import datetime
import pickle
import os
import warnings
warnings.filterwarnings("ignore")

In [10]:
# read the raw churn dataset from the working directory
dataset_path = 'churn_Modelling.csv'
df = pd.read_csv(dataset_path)

In [11]:
# preview the first five raw rows
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [12]:
def drop_columns(columns: list, data=None):
    """
    Function to drop columns from a dataset
    Args:
    columns: the names of the columns to drop
    data: optional DataFrame to drop the columns from; when omitted the
          notebook-level `df` is used
    Returns:
    the dataset without the given columns
    """
    if data is None:
        # legacy call path: `drop_columns(columns)` keeps mutating the
        # notebook-level `df` in place and returning it, exactly as before
        df.drop(columns=columns, inplace=True)
        return df
    # explicit frame: leave the caller's frame untouched and return a new one
    return data.drop(columns=columns)

In [13]:
def preprocess_data(df, label_encoder=None, one_hot_encoder=None):
    """
    Function to preprocess the data
    Args:
    df: the dataset to be preprocessed; it is not modified
    label_encoder: optional fitted encoder for the Gender column; when omitted
                   it is loaded from encoder/label_encoder.pkl
    one_hot_encoder: optional fitted encoder for the Geography column; when
                     omitted it is loaded from encoder/one_hot_encoder.pkl
    Returns:
    X: the features (one hot encoded Geography columns first)
    y: the target variable (the Exited column)
    """

    # NOTE(review): pickle.load executes arbitrary code from the file, so the
    # encoder files must come from a trusted source
    if label_encoder is None:
        # load the label encoder
        with open('encoder/label_encoder.pkl','rb') as f:
            label_encoder = pickle.load(f)
    if one_hot_encoder is None:
        # load the one hot encoder
        with open('encoder/one_hot_encoder.pkl','rb') as f:
            one_hot_encoder = pickle.load(f)

    # work on a copy so the caller's frame keeps its raw Gender / Geography
    # columns and the calling cell can be re-run safely
    data = df.copy()

    # label encode the Gender column
    data['Gender'] = label_encoder.transform(data['Gender'])
    # one hot encode the Geography column
    geography_encoded = one_hot_encoder.transform(data[['Geography']])
    # the encoder may return a sparse matrix or a dense array
    if hasattr(geography_encoded, 'toarray'):
        geography_encoded = geography_encoded.toarray()
    # convert the one hot encoded data to dataframe; reuse the input index so
    # the concat below aligns row by row even for a non-default index
    geography_encoded_df = pd.DataFrame(
        geography_encoded,
        columns=one_hot_encoder.get_feature_names_out(['Geography']),
        index=data.index,
    )
    # replace the Geography column with its one hot encoded columns
    data = pd.concat([geography_encoded_df, data.drop(columns=['Geography'])], axis=1)
    # separate the features and target variable
    X = data.drop(columns=['Exited'])
    y = data['Exited']
    return X,y

In [14]:
def split_dataset(X, y, test_size=0.2, random_state=42):
    """
    Function to split the dataset to train and test data
    Args:
    X: the features
    y: the target variable
    test_size: the share of the data held out for testing (default 0.2)
    random_state: the seed passed to train_test_split (default 42)
    Returns:
    X_train: the training data
    X_test: the testing data
    y_train: the training target
    y_test: the testing target
    """

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [15]:
# remove the three identifier columns before preprocessing
columns = [
    'RowNumber',
    'CustomerId',
    'Surname',
]
df = drop_columns(columns=columns)

In [16]:
# preview the frame after the identifier columns were dropped
df.head(n=5)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [17]:
# encode the categorical columns and separate features from the target
X, y = preprocess_data(df=df)

In [18]:
# preview the encoded feature matrix
X.head(n=5)

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.0,0.0,0.0,619,0,42,2,0.0,1,1,1,101348.88
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58
2,1.0,0.0,0.0,502,0,42,8,159660.8,3,1,0,113931.57
3,1.0,0.0,0.0,699,0,39,1,0.0,2,0,0,93826.63
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.1


In [19]:
# hold out a test set from the features and target
X_train, X_test, y_train, y_test = split_dataset(X=X, y=y)

In [20]:
# shapes of the training and testing feature sets
(X_train.shape, X_test.shape)

((8000, 12), (2000, 12))

In [21]:
# restore the persisted scaler (pickle: only load files from a trusted source)
with open('encoder/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# apply the same transform to both splits
# NOTE: X_train / X_test are rebound here, so re-running this cell without
# re-running the split cell would scale the data a second time
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)



In [22]:
# Hyperparameter search space for each model, keyed by the names tuner() expects
param_grid = {

    'CatBoost Classifier': {
        'iterations': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1],
        'depth': [2, 4, 6],
        'l2_leaf_reg': [3, 5, 7],

    },
    'LGBM Classifier': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [2, 4, 6],
        'num_leaves': [3, 5, 7],

    },
    # Every key must be a parameter the wrapped estimator accepts.
    # create_ann only takes hiddenLayerOne / hiddenLayerTwo and hard-codes the
    # optimizer and activations, so the former 'optimizer', 'activation',
    # 'neurons' and 'layers' entries could not tune the network.
    # The 'model__' prefix is how scikeras routes a value to the
    # model-building function.
    'ANN': {
        'batch_size': [32, 64, 128],
        'epochs': [50, 100, 200],
        'model__hiddenLayerOne': [32, 64, 128],
        'model__hiddenLayerTwo': [16, 32, 64],
    }
}

In [23]:
def create_ann(hiddenLayerOne=32, hiddenLayerTwo=16):
    """
    Function to build and compile the ANN model
    Args:
    hiddenLayerOne: the number of neurons in the second Dense layer
    hiddenLayerTwo: the number of neurons in the third Dense layer
    Returns:
    model: the compiled Sequential model
    """
    # the input width comes from the notebook-level X_train
    n_features = X_train.shape[1]
    network = Sequential([
        Dense(64, activation='relu', input_shape=(n_features,)),
        Dense(hiddenLayerOne, activation='relu'),
        Dense(hiddenLayerTwo, activation='relu'),
        # single sigmoid unit for the binary target
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [24]:
# create a function to perform hyperparameter tuning
def tuner(model_name, param_grid):
    """
    Function to perform hyperparameter tuning
    Args:
    model_name: the name of the model; one of 'CatBoost Classifier',
                'LGBM Classifier' or 'ANN'
    param_grid: the hyperparameters to tune for that model
    Returns:
    the fitted GridSearchCV object, or None when tuning failed (the error is
    printed instead of raised)
    """
    # lazy factories: only the requested estimator is ever constructed
    estimator_factories = {
        'CatBoost Classifier': lambda: CatBoostClassifier(),
        'LGBM Classifier': lambda: LGBMClassifier(),
        # scikeras deprecates `build_fn` in favour of `model`
        'ANN': lambda: KerasClassifier(model=create_ann, epochs=50, batch_size=32, verbose=0),
    }
    try:
        if model_name not in estimator_factories:
            # previously this left `model` unbound and surfaced as an
            # unrelated error message
            raise ValueError(f'Unsupported model name: {model_name!r}')
        model = estimator_factories[model_name]()
        grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
        grid_results = grid.fit(X_train, y_train)
        return grid_results
    except Exception as e:
        # best-effort by design: callers check for None so one failing model
        # does not abort tuning of the others
        print(f'An error occured: {e}')
        return None
    

In [25]:
# tune every configured model and report the best settings found
for model_name, params in param_grid.items():
    grid_results = tuner(model_name, params)
    if grid_results is None:
        # tuner() reports failure by returning None
        print(f"Tuning failed for {model_name}")
        continue
    print(f"Best Parameters for {model_name}: {grid_results.best_params_}")
    print(f"Best Score for {model_name}: {grid_results.best_score_}")
    print("\n")

Fitting 3 folds for each of 81 candidates, totalling 243 fits
0:	learn: 0.6339589	total: 143ms	remaining: 28.4s
1:	learn: 0.5878003	total: 147ms	remaining: 14.6s
2:	learn: 0.5524540	total: 159ms	remaining: 10.4s
3:	learn: 0.5237152	total: 163ms	remaining: 8s
4:	learn: 0.5005923	total: 182ms	remaining: 7.1s
5:	learn: 0.4793178	total: 187ms	remaining: 6.06s
6:	learn: 0.4647263	total: 200ms	remaining: 5.51s
7:	learn: 0.4505277	total: 210ms	remaining: 5.04s
8:	learn: 0.4394210	total: 225ms	remaining: 4.78s
9:	learn: 0.4312815	total: 230ms	remaining: 4.37s
10:	learn: 0.4229297	total: 235ms	remaining: 4.04s
11:	learn: 0.4161392	total: 240ms	remaining: 3.75s
12:	learn: 0.4102758	total: 244ms	remaining: 3.51s
13:	learn: 0.4061370	total: 249ms	remaining: 3.31s
14:	learn: 0.4022349	total: 253ms	remaining: 3.11s
15:	learn: 0.3986165	total: 256ms	remaining: 2.95s
16:	learn: 0.3949859	total: 260ms	remaining: 2.8s
17:	learn: 0.3911111	total: 265ms	remaining: 2.68s
18:	learn: 0.3878669	total: 269ms	r

In [28]:
# Train the models with the best hyperparameters found during tuning
catboost_params = dict(depth=2, iterations=200, l2_leaf_reg=5, learning_rate=0.1)
lgbm_params = dict(learning_rate=0.1, max_depth=2, n_estimators=200, num_leaves=5)

In [29]:
# build both estimators from their tuned settings
catboost, lgbm = CatBoostClassifier(**catboost_params), LGBMClassifier(**lgbm_params)

In [30]:
# fit both tuned models on the scaled training split
catboost.fit(X=X_train, y=y_train)
lgbm.fit(X=X_train, y=y_train)

0:	learn: 0.6339589	total: 6.43ms	remaining: 1.28s
1:	learn: 0.5878003	total: 13.3ms	remaining: 1.32s
2:	learn: 0.5524540	total: 17.4ms	remaining: 1.14s
3:	learn: 0.5237152	total: 22.1ms	remaining: 1.08s
4:	learn: 0.5005923	total: 29.3ms	remaining: 1.14s
5:	learn: 0.4793178	total: 32.9ms	remaining: 1.06s
6:	learn: 0.4647263	total: 37.5ms	remaining: 1.03s
7:	learn: 0.4505277	total: 42.6ms	remaining: 1.02s
8:	learn: 0.4394210	total: 46.1ms	remaining: 979ms
9:	learn: 0.4312815	total: 50.1ms	remaining: 952ms
10:	learn: 0.4229297	total: 53.7ms	remaining: 923ms
11:	learn: 0.4161392	total: 57.9ms	remaining: 907ms
12:	learn: 0.4102758	total: 61.4ms	remaining: 883ms
13:	learn: 0.4061370	total: 65.3ms	remaining: 868ms
14:	learn: 0.4022349	total: 68.3ms	remaining: 843ms
15:	learn: 0.3986165	total: 71.6ms	remaining: 824ms
16:	learn: 0.3949859	total: 75.8ms	remaining: 816ms
17:	learn: 0.3911111	total: 79ms	remaining: 799ms
18:	learn: 0.3878669	total: 82.5ms	remaining: 786ms
19:	learn: 0.3856956	tot

In [31]:
# class predictions of both models on the held-out split
catboost_pred, lgbm_pred = catboost.predict(X_test), lgbm.predict(X_test)

In [32]:
# share of correctly classified test rows for each model
catboost_accuracy = accuracy_score(y_true=y_test, y_pred=catboost_pred)
lgbm_accuracy = accuracy_score(y_true=y_test, y_pred=lgbm_pred)

In [2]:
import mlflow

In [3]:
from mlflow.models import infer_signature

In [6]:
# point the MLflow client at the tracking server on localhost port 5000
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')

In [8]:
# select the experiment by name (the log output shows it is created when missing)
mlflow.set_experiment(experiment_name='Churn Modelling')

2025/01/15 16:26:05 INFO mlflow.tracking.fluent: Experiment with name 'Churn Modelling' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/320454728814694453', creation_time=1736938565799, experiment_id='320454728814694453', last_update_time=1736938565799, lifecycle_stage='active', name='Churn Modelling', tags={}>

In [35]:
with mlflow.start_run():
    # log every hyperparameter under its own key instead of one stringified
    # dict, so runs can be compared and filtered per parameter
    mlflow.log_params(catboost_params)
    mlflow.log_metric('CatBoost Classifier Accuracy', catboost_accuracy)
    mlflow.set_tag('CatBoost Classifier', 'Best Model')
    # catboost_pred was produced from X_test, so pair it with X_test rather
    # than with the training features
    signature = infer_signature(X_test, catboost_pred)

    # log the model
    model_info = mlflow.sklearn.log_model(
        sk_model = catboost,
        artifact_path = 'catboost-model',
        registered_model_name = 'catboost-model',
        signature = signature,
        # a few rows are enough as an input example; the whole training
        # matrix was being stored with the model before
        input_example = X_train[:5],
    )

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'catboost-model'.
2025/01/15 16:53:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: catboost-model, version 1


🏃 View run useful-wolf-758 at: http://127.0.0.1:5000/#/experiments/320454728814694453/runs/5dca196307d14d199a1aa93ee2289c2e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/320454728814694453


Created version '1' of model 'catboost-model'.


In [36]:
# reload the logged model through the generic pyfunc interface
loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [38]:
# sanity check: predict the last five test rows with the reloaded model
loaded_model.predict(data=X_test[-5:])

array([1, 0, 1, 0, 0], dtype=int64)