In [2]:
# import the necessary libraries
import pandas as pd
import dagshub
import mlflow
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score, roc_auc_score, roc_curve, auc,precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.models import load_model
from scikeras.wrappers import KerasClassifier
import datetime
import pickle
import os
import warnings
warnings.filterwarnings("ignore")

In [3]:
# load the dataset
df = pd.read_csv('churn_Modelling.csv')

In [4]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
def drop_columns(columns: list):
    """
    This function drops the columns from the dataset
    """
    df.drop(columns=columns, axis=1, inplace=True)
    return df

In [6]:
def preprocess_data(df):
    """
    Function to preprocess the data
    Args:
    df: the dataset to be preprocessed
    Returns:
    df: the preprocessed data
    """
    
    # load the label encoder
    with open('encoder/label_encoder.pkl','rb') as f:
        label_encoder = pickle.load(f)
    # load the one hot encoder
    with open('encoder/one_hot_encoder.pkl','rb') as f:
        one_hot_encoder = pickle.load(f)
    
  
    # label encode the Gender column
    df['Gender'] = label_encoder.transform(df['Gender'])
    # one hot encode the Geography column
    geography_encoded = one_hot_encoder.transform(df[['Geography']])
    # convert the one hot encoded data to dataframe
    geography_encoded_df = pd.DataFrame(geography_encoded.toarray(), columns=one_hot_encoder.get_feature_names_out(['Geography']))
    # drop the Geography column
    df.drop(['Geography'], axis=1, inplace=True)
    # concatenate the dataframes
    df = pd.concat([geography_encoded_df,df], axis=1)
    # separate the features and target variable
    X = df.drop(['Exited'], axis=1)
    y = df['Exited']
    return X,y

In [7]:
def split_dataset(X, y):
    """
    Function to split the dataset to train and test data
    Args:
    X: the features
    y: the target variable
    Returns:
    X_train: the training data
    X_test: the testing data
    y_train: the training target
    y_test: the testing target
    """
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [8]:
# drop the columns
columns = ['RowNumber', 'CustomerId', 'Surname']
df = drop_columns(columns)

In [9]:
df.head()


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [10]:
# preprocess the data
X,y = preprocess_data(df)

In [11]:
X.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.0,0.0,0.0,619,0,42,2,0.0,1,1,1,101348.88
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58
2,1.0,0.0,0.0,502,0,42,8,159660.8,3,1,0,113931.57
3,1.0,0.0,0.0,699,0,39,1,0.0,2,0,0,93826.63
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.1


In [12]:
# split the data
X_train, X_test, y_train, y_test = split_dataset(X, y)

In [13]:
X_train.shape , X_test.shape

((8000, 12), (2000, 12))

In [14]:
# load the scaler
with open('encoder/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# scale the data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [15]:
# Hyperparameter tuning for ML models
param_grid = {
    
    'CatBoost Classifier': {
        'iterations': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1],
        'depth': [2, 4, 6],
        'l2_leaf_reg': [3, 5, 7],

    },
    'LGBM Classifier': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1],
        'max_depth': [2, 4, 6],
        'num_leaves': [3, 5, 7],

    },
    'ANN': {
        'batch_size': [32, 64, 128],
        'epochs': [50, 100, 200],
        'optimizer': ['adam', 'rmsprop'],
        'activation': ['relu', 'tanh'],
        'neurons': [32, 64, 128],
        'layers': [1, 2, 3]
    }
}

In [16]:
def create_ann(hiddenLayerOne=32, hiddenLayerTwo=16):
    """
    Function to create an ANN model
    Args:
    units: the number of neurons in the hidden layers
    layers: the number of hidden layers
    Returns:
    model: the ANN model
    """
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dense(hiddenLayerOne, activation='relu'))
    model.add(Dense(hiddenLayerTwo, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [17]:
# create a function to perform hyperparameter tuning
def tuner(model_name, param_grid):
    """
    Function to perform hyperparameter tuning
    Args:
    model_name: the name of the model
    param_grid: the hyperparameters to tune
    Returns:
    model: the tuned model
    """
    try:
        if model_name == 'CatBoost Classifier':
            model = CatBoostClassifier()
        elif model_name == 'LGBM Classifier':
            model = LGBMClassifier()
        elif model_name == 'ANN':
            model = KerasClassifier(build_fn=create_ann, epochs=50,batch_size=32, verbose=0)
        grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
        grid_results = grid.fit(X_train, y_train)
        return grid_results
    except Exception as e:
        print(f'An error occured: {e}') 
    

In [18]:
for model_name, params in param_grid.items():
    grid_results = tuner(model_name, params)
    if grid_results is not None:
        print(f"Best Parameters for {model_name}: {grid_results.best_params_}")
        print(f"Best Score for {model_name}: {grid_results.best_score_}")
        print("\n")
    else:
        print(f"Tuning failed for {model_name}")

Fitting 3 folds for each of 81 candidates, totalling 243 fits
0:	learn: 0.6339589	total: 140ms	remaining: 27.8s
1:	learn: 0.5878003	total: 144ms	remaining: 14.2s
2:	learn: 0.5524540	total: 148ms	remaining: 9.72s
3:	learn: 0.5237152	total: 153ms	remaining: 7.47s
4:	learn: 0.5005923	total: 157ms	remaining: 6.11s
5:	learn: 0.4793178	total: 161ms	remaining: 5.2s
6:	learn: 0.4647263	total: 165ms	remaining: 4.54s
7:	learn: 0.4505277	total: 169ms	remaining: 4.05s
8:	learn: 0.4394210	total: 173ms	remaining: 3.67s
9:	learn: 0.4312815	total: 178ms	remaining: 3.38s
10:	learn: 0.4229297	total: 182ms	remaining: 3.13s
11:	learn: 0.4161392	total: 187ms	remaining: 2.92s
12:	learn: 0.4102758	total: 191ms	remaining: 2.75s
13:	learn: 0.4061370	total: 198ms	remaining: 2.62s
14:	learn: 0.4022349	total: 201ms	remaining: 2.48s
15:	learn: 0.3986165	total: 206ms	remaining: 2.36s
16:	learn: 0.3949859	total: 213ms	remaining: 2.29s
17:	learn: 0.3911111	total: 218ms	remaining: 2.2s
18:	learn: 0.3878669	total: 222m

In [19]:
# Train the model based the best hyperparameter tuning
catboost_params = {'depth': 2, 'iterations': 200, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
lgbm_params = {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200, 'num_leaves': 5}

In [20]:
catboost = CatBoostClassifier(**catboost_params)
lgbm = LGBMClassifier(**lgbm_params)

In [21]:
catboost.fit(X_train, y_train)
lgbm.fit(X_train, y_train)

0:	learn: 0.6339589	total: 4.14ms	remaining: 823ms
1:	learn: 0.5878003	total: 8.06ms	remaining: 798ms
2:	learn: 0.5524540	total: 11.9ms	remaining: 778ms
3:	learn: 0.5237152	total: 16.3ms	remaining: 801ms
4:	learn: 0.5005923	total: 20.2ms	remaining: 787ms
5:	learn: 0.4793178	total: 24ms	remaining: 777ms
6:	learn: 0.4647263	total: 28.4ms	remaining: 783ms
7:	learn: 0.4505277	total: 32.7ms	remaining: 785ms
8:	learn: 0.4394210	total: 36.7ms	remaining: 778ms
9:	learn: 0.4312815	total: 40.9ms	remaining: 777ms
10:	learn: 0.4229297	total: 45.2ms	remaining: 777ms
11:	learn: 0.4161392	total: 49.2ms	remaining: 771ms
12:	learn: 0.4102758	total: 53.1ms	remaining: 764ms
13:	learn: 0.4061370	total: 57.5ms	remaining: 764ms
14:	learn: 0.4022349	total: 61.7ms	remaining: 761ms
15:	learn: 0.3986165	total: 65.7ms	remaining: 755ms
16:	learn: 0.3949859	total: 69.6ms	remaining: 749ms
17:	learn: 0.3911111	total: 73.5ms	remaining: 743ms
18:	learn: 0.3878669	total: 78.2ms	remaining: 745ms
19:	learn: 0.3856956	tot

47:	learn: 0.3532635	total: 204ms	remaining: 646ms
48:	learn: 0.3527103	total: 208ms	remaining: 640ms
49:	learn: 0.3518823	total: 212ms	remaining: 636ms
50:	learn: 0.3513213	total: 216ms	remaining: 631ms
51:	learn: 0.3506561	total: 220ms	remaining: 627ms
52:	learn: 0.3498204	total: 224ms	remaining: 621ms
53:	learn: 0.3494849	total: 228ms	remaining: 617ms
54:	learn: 0.3490053	total: 232ms	remaining: 613ms
55:	learn: 0.3480284	total: 237ms	remaining: 610ms
56:	learn: 0.3475516	total: 241ms	remaining: 605ms
57:	learn: 0.3472124	total: 245ms	remaining: 600ms
58:	learn: 0.3468586	total: 249ms	remaining: 596ms
59:	learn: 0.3465075	total: 254ms	remaining: 592ms
60:	learn: 0.3463076	total: 258ms	remaining: 588ms
61:	learn: 0.3461084	total: 262ms	remaining: 582ms
62:	learn: 0.3458889	total: 266ms	remaining: 579ms
63:	learn: 0.3455766	total: 270ms	remaining: 574ms
64:	learn: 0.3453718	total: 274ms	remaining: 569ms
65:	learn: 0.3451605	total: 278ms	remaining: 565ms
66:	learn: 0.3448083	total: 283

In [22]:
catboost_pred = catboost.predict(X_test)
lgbm_pred = lgbm.predict(X_test)

In [23]:
catboost_accuracy = accuracy_score(y_test, catboost_pred)
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)

In [24]:

dagshub.init(repo_owner='labdul749', repo_name='churn_modelling-end-to_end-project-', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=8ac1509b-d78f-4999-b1f2-b5e4be14c07a&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=9e28ad27c8c87d3748240057a2ff9f64a64641315e14c448df95a096b79936cb




Output()

In [25]:
from mlflow.models import infer_signature

In [29]:
mlflow.set_tracking_uri('https://dagshub.com/labdul749/churn_modelling-end-to_end-project-.mlflow')

In [30]:
mlflow.set_experiment('Churn Modelling')

2025/01/16 17:25:59 INFO mlflow.tracking.fluent: Experiment with name 'Churn Modelling' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/36dc290718dc451fba061a4ab72709f8', creation_time=1737028559045, experiment_id='0', last_update_time=1737028559045, lifecycle_stage='active', name='Churn Modelling', tags={}>

In [31]:
with mlflow.start_run():
    mlflow.log_param('CatBoost Classifier', catboost_params)
    mlflow.log_metric('CatBoost Classifier Accuracy', catboost_accuracy)
    mlflow.set_tag('CatBoost Classifier', 'Best Model')
    signature = infer_signature(X_train, catboost_pred)

    # log the model
    model_info = mlflow.sklearn.log_model(
        sk_model = catboost,
        artifact_path = 'catboost-model',
        registered_model_name = 'catboost-model',
        signature = signature,
        input_example = X_train,
    )

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'catboost-model'.
2025/01/16 17:27:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: catboost-model, version 1
Created version '1' of model 'catboost-model'.


🏃 View run clean-smelt-31 at: https://dagshub.com/labdul749/churn_modelling-end-to_end-project-.mlflow/#/experiments/0/runs/c8f9cb370665409fb899feb2584a71f6
🧪 View experiment at: https://dagshub.com/labdul749/churn_modelling-end-to_end-project-.mlflow/#/experiments/0


In [36]:
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [38]:
loaded_model.predict(X_test[-5:])

array([1, 0, 1, 0, 0], dtype=int64)