# Training Models

## Load libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Binarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
import joblib
import warnings
from datetime import datetime
from dotenv import load_dotenv, find_dotenv
warnings.filterwarnings("ignore")

_ = load_dotenv(find_dotenv())

## Load dataset

In [2]:
# Read the raw Telco churn export and show its (rows, columns) shape
raw_csv_path = "../data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
base_df = pd.read_csv(raw_csv_path)
base_df.shape

(7043, 21)

## Preprocessing dataset

In [3]:
# Column groups: each group is routed through its own transformer further down.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

bin_cols = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']

cat_cols = [
    'gender',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

# Kept as a one-element list so it can be concatenated with the other groups
target_col = ['Churn']

In [4]:
# Build the modelling dataset from the raw frame:
#   1. drop exact duplicate rows,
#   2. keep only the feature and target columns,
#   3. coerce TotalCharges to a numeric dtype (blank / non-numeric entries become NaN)
#      and drop those rows,
#   4. encode the Yes/No columns (and the target) as 1/0.
# Coercing instead of dropping by index labels taken from `base_df` avoids a KeyError
# when a blank row has already been removed as a duplicate, and leaves TotalCharges
# as a real float column rather than strings.
yes_no_to_int = {'Yes': 1, 'No': 0}

dataset = (base_df
 .drop_duplicates(keep='first')
 [cat_cols + num_cols + bin_cols + target_col]
 .assign(TotalCharges = lambda x: pd.to_numeric(x.TotalCharges, errors='coerce'))
 .dropna(subset=['TotalCharges'])
 .assign(Churn = lambda x: x.Churn.map(yes_no_to_int),
        Partner = lambda x: x.Partner.map(yes_no_to_int),
        Dependents = lambda x: x.Dependents.map(yes_no_to_int),
        PhoneService = lambda x: x.PhoneService.map(yes_no_to_int),
        PaperlessBilling = lambda x: x.PaperlessBilling.map(yes_no_to_int)
        )
)

In [5]:
# Persist the cleaned dataset (without the index column) as the reference CSV
reference_csv_path = "../data/telco_customer_churn_reference.csv"
dataset.to_csv(reference_csv_path, index=False)

In [6]:
# Preprocessing: one transformer per column group, combined in a ColumnTransformer
# that returns a pandas DataFrame.

# Categorical columns -> dense one-hot columns; unseen categories are ignored at transform time
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Numerical columns -> standardized values
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Binary columns -> passed through Binarizer with its default settings
binary_transformer = Pipeline(steps=[
    ('binarizer', Binarizer())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_cols),
        ('num', numerical_transformer, num_cols),
        ('bin', binary_transformer, bin_cols)
    ]).set_output(transform="pandas")

# NOTE(review): the preprocessor is fitted on the full dataset here, before the
# train/test split performed in a later cell, so the fitted statistics also
# include rows that end up in the test set.
processed_data = preprocessor.fit_transform(dataset)

# Make sure the artifacts directory exists; joblib.dump does not create it
preprocessor_file = "./artifacts/preprocessing_pipeline.pkl"
os.makedirs(os.path.dirname(preprocessor_file), exist_ok=True)
joblib.dump(preprocessor, preprocessor_file)

['./artifacts/preprocessing_pipeline.pkl']

In [7]:
# Features are the preprocessed frame; the target is selected from the cleaned dataset
X, y = processed_data, dataset[target_col]

In [8]:
# Display the preprocessed feature matrix (one-hot, scaled and binary columns)
X

Unnamed: 0,cat__gender_Female,cat__gender_Male,cat__MultipleLines_No,cat__MultipleLines_No phone service,cat__MultipleLines_Yes,cat__InternetService_DSL,cat__InternetService_Fiber optic,cat__InternetService_No,cat__OnlineSecurity_No,cat__OnlineSecurity_No internet service,...,cat__PaymentMethod_Electronic check,cat__PaymentMethod_Mailed check,num__tenure,num__MonthlyCharges,num__TotalCharges,bin__SeniorCitizen,bin__Partner,bin__Dependents,bin__PhoneService,bin__PaperlessBilling
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,-1.280248,-1.161694,-0.994194,0,1,0,0,1
1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.064303,-0.260878,-0.173740,0,0,0,1,0
2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,-1.239504,-0.363923,-0.959649,0,0,0,1,1
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.512486,-0.747850,-0.195248,0,0,0,0,0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,-1.239504,0.196178,-0.940457,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,-0.343137,0.664868,-0.129180,0,1,1,1,1
7039,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.612573,1.276493,2.241056,0,1,1,1,1
7040,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-0.872808,-1.170004,-0.854514,0,1,1,0,1
7041,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,-1.158016,0.319168,-0.872095,1,1,0,1,1


## Training Models

In [9]:
# Point MLflow at the tracking server used by this notebook
tracking_uri = "http://localhost:5000"
mlflow.set_tracking_uri(tracking_uri)

In [10]:
# Select the experiment that all runs in this notebook are logged under
mlflow.set_experiment(experiment_name="telco_customer_churn")

2024/09/28 02:27:12 INFO mlflow.tracking.fluent: Experiment with name 'telco_customer_churn' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1727504832264, experiment_id='1', last_update_time=1727504832264, lifecycle_stage='active', name='telco_customer_churn', tags={}>

In [11]:
# Enable MLflow autologging for scikit-learn estimators fitted from here on.
# NOTE(review): later cells read `metrics.training_score` and the `estimator_name`
# tag from the runs; presumably these are produced by autologging — confirm.
mlflow.sklearn.autolog()



In [12]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Save the training and testing splits (features + target) as CSV files.
#
# X_* and y_* come from the same split and therefore share the original row
# index, so they are joined on that index. Resetting only the target's index
# before assigning it (as was done previously) makes pandas align a 0..n-1
# index against the shuffled feature index, which attaches wrong labels to
# some rows and NaN to others.
os.makedirs('./artifacts', exist_ok=True)

train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv('./artifacts/train_data.csv', index=False)

test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('./artifacts/test_data.csv', index=False)

In [14]:
# Candidate classifiers, keyed by the display name used for the MLflow run
models = dict([
    ("Logistic Regression", LogisticRegression(random_state=11)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=10, random_state=11)),
    ("Random Forest", RandomForestClassifier(max_depth=10, random_state=11)),
    ("SVC", SVC(random_state=11)),
    ("KNN", KNeighborsClassifier(n_neighbors=11)),
    ("Gaussian Naive Bayes", GaussianNB()),
])

In [15]:
# Train every candidate model in its own MLflow run.
# For each run this logs: the fitted preprocessing pipeline, the train/test CSVs,
# a text classification report on the test split and, when the estimator exposes
# `feature_importances_` or `coef_`, a CSV of per-feature importances.
os.makedirs('./artifacts', exist_ok=True)

for name, model in models.items():
    run_name = f"{name} [{(datetime.now().strftime('%Y%m%d-%H%M%S'))}]"
    # The context manager ends the run on exit, so no explicit end_run() is needed
    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_artifact(preprocessor_file, artifact_path='preprocessing')
        mlflow.log_artifact('./artifacts/train_data.csv', artifact_path='train_data')
        mlflow.log_artifact('./artifacts/test_data.csv', artifact_path='test_data')

        # Flatten the single-column target to 1-D, the shape scikit-learn expects
        model.fit(X_train, np.ravel(y_train))
        y_pred = model.predict(X_test)

        report = classification_report(y_test, y_pred)
        mlflow.log_text(str(report), "classification_report.txt")

        # Tree-based models expose feature_importances_; linear models expose coef_
        # (absolute value of the first coefficient row is used as the importance).
        if hasattr(model, 'feature_importances_'):
            feature_importances = model.feature_importances_
        elif hasattr(model, 'coef_'):
            feature_importances = np.abs(model.coef_[0])
        else:
            feature_importances = None

        if feature_importances is not None:
            importance_path = f'./artifacts/feature_importances_{name}.csv'
            importance_df = pd.DataFrame({
                'Feature': X.columns,
                'Importance': feature_importances
            }).sort_values(by='Importance', ascending=False)
            importance_df.to_csv(importance_path, index=False)
            mlflow.log_artifact(importance_path)

        print("Active run_id: {}".format(run.info.run_id))

print("Training and evaluation finished.")

Active run_id: 812d4d29aae44c91b67c5bfe10384908
Active run_id: 55ea6d8acb0a4089a66f30458e876445
Active run_id: b3cddea95097435b9be7eea2edd94b52
Active run_id: 8bf66d8f732b4979a6e7b80b291282e7
Active run_id: b649c1b6f92e43d29ff5c80fce5c8500
Active run_id: 7e78e253a49d41ee8dd5f83a31930c1d
Training and evaluation finished.


## Manage Models

In [16]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [17]:
# Create a tracking/registry client bound to the same tracking URI that was
# configured for the fluent API, so the address is defined in one place only.
client = MlflowClient(tracking_uri=mlflow.get_tracking_uri())

# List the experiments known to the server
client.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1727504832264, experiment_id='1', last_update_time=1727504832264, lifecycle_stage='active', name='telco_customer_churn', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1727504613810, experiment_id='0', last_update_time=1727504613810, lifecycle_stage='active', name='Default', tags={}>]

In [18]:
# Resolve the experiment id by name instead of hard-coding it: the numeric id
# is assigned by the tracking server and differs between servers.
experiment = client.get_experiment_by_name("telco_customer_churn")

# Runs with a training score above 0.7, best first.
# NOTE(review): `training_score` is measured on the training split, so this
# ranking favours models that fit the training data most closely.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="metrics.training_score >0.7",
    run_view_type=ViewType.ACTIVE_ONLY,
    order_by=["metrics.training_score DESC"]
)

In [19]:
# Flatten the run objects into one row per run and wrap them in a DataFrame
run_columns = ['run_id', 'run_name', 'estimator_name', 'start_time', 'training_score']

lst_runs = [
    [
        found.info.run_id,
        found.info.run_name,
        found.data.tags['estimator_name'],
        found.info.start_time,
        found.data.metrics['training_score'],
    ]
    for found in runs
]

df_runs = pd.DataFrame(lst_runs, columns=run_columns)

In [20]:
# Display the run summary table (ordered as returned by the search above)
df_runs

Unnamed: 0,run_id,run_name,estimator_name,start_time,training_score
0,b3cddea95097435b9be7eea2edd94b52,Random Forest [20240928-022728],RandomForestClassifier,1727504848635,0.882974
1,55ea6d8acb0a4089a66f30458e876445,Decision Tree [20240928-022723],DecisionTreeClassifier,1727504843867,0.882162
2,b649c1b6f92e43d29ff5c80fce5c8500,KNN [20240928-022743],KNeighborsClassifier,1727504863610,0.823446
3,8bf66d8f732b4979a6e7b80b291282e7,SVC [20240928-022734],SVC,1727504854293,0.819789
4,812d4d29aae44c91b67c5bfe10384908,Logistic Regression [20240928-022713],LogisticRegression,1727504833820,0.806583
5,7e78e253a49d41ee8dd5f83a31930c1d,Gaussian Naive Bayes [20240928-022749],GaussianNB,1727504869889,0.700731


## Building Final Model

In [21]:
# Final model: the preprocessing ColumnTransformer followed by a random forest,
# bundled so that raw rows can be passed straight to fit/predict
final_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(max_depth=10, random_state=11)),
]
final_model_pipeline = Pipeline(steps=final_steps)

In [22]:
# Rebind X / y to the cleaned (not yet preprocessed) data; the pipeline does the preprocessing
y = dataset[target_col]
X = dataset.drop(columns=target_col)

In [23]:
# Same 70/30 split and seed as before, now applied to the raw feature frame
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [24]:
# Fit the final pipeline inside its own MLflow run and log a text
# classification report for the held-out split.
final_run_name = f"FinalModelTraining [{(datetime.now().strftime('%Y%m%d-%H%M%S'))}]"

# The context manager ends the run on exit, so no explicit end_run() is needed
with mlflow.start_run(run_name=final_run_name) as run:
    # Flatten the single-column target to 1-D, the shape scikit-learn expects
    final_model_pipeline.fit(X_train, np.ravel(y_train))
    y_pred = final_model_pipeline.predict(X_test)
    report = classification_report(y_test, y_pred)
    mlflow.log_text(str(report), "classification_report.txt")
    print("Active run_id: {}".format(run.info.run_id))

Active run_id: e29036cc1cb64393a6d8cd302243b3f9


## Register Model

In [25]:
# Find the most recent "FinalModelTraining" run of the experiment.
# The experiment id is resolved by name because the numeric id is assigned by
# the tracking server and differs between servers.
experiment = client.get_experiment_by_name("telco_customer_churn")

runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="`run name` LIKE 'FinalModelTraining%'",
    run_view_type=ViewType.ACTIVE_ONLY,
    order_by=["attribute.start_time DESC"],
    max_results=1
)

# Fail loudly instead of letting the registration cell pick up a stale `run`
# variable left over from an earlier cell.
if not runs:
    raise RuntimeError("No 'FinalModelTraining' run found in experiment 'telco_customer_churn'")

run = runs[0]
print(run.info.run_id, run.info.run_name, run.data.tags['estimator_name'], run.info.start_time, run.data.metrics['training_score'])

e29036cc1cb64393a6d8cd302243b3f9 FinalModelTraining [20240928-022754] Pipeline 1727504874805 0.8835839089800894


In [26]:
# Register the model logged under the selected run as a new version of
# the "telco_customer_churn" registered model
run_id = run.info.run_id
model_uri = "runs:/{}/model".format(run_id)
mlflow.register_model(model_uri=model_uri, name="telco_customer_churn")

Successfully registered model 'telco_customer_churn'.
2024/09/28 02:32:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: telco_customer_churn, version 1
Created version '1' of model 'telco_customer_churn'.


<ModelVersion: aliases=[], creation_timestamp=1727505146493, current_stage='None', description='', last_updated_timestamp=1727505146493, name='telco_customer_churn', run_id='e29036cc1cb64393a6d8cd302243b3f9', run_link='', source='mlflow-artifacts:/1/e29036cc1cb64393a6d8cd302243b3f9/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [27]:
# Print version number and stage for each entry returned by get_latest_versions
model_name = "telco_customer_churn"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print("version: {}, stage: {}".format(version.version, version.current_stage))

version: 1, stage: None


In [28]:
# Promote the newest registered version to Production, archiving whatever
# was in that stage before.
# The version number is looked up instead of being hard-coded to "1", so that
# re-running the notebook promotes the version that was just registered.
registered_versions = client.search_model_versions(filter_string=f"name='{model_name}'")
model_version = str(max(registered_versions, key=lambda mv: int(mv.version)).version)

new_stage = "Production"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1727505146493, current_stage='Production', description='', last_updated_timestamp=1727505146626, name='telco_customer_churn', run_id='e29036cc1cb64393a6d8cd302243b3f9', run_link='', source='mlflow-artifacts:/1/e29036cc1cb64393a6d8cd302243b3f9/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [29]:
# Remove the "champion" alias from the registered model before re-assigning it
client.delete_registered_model_alias(name=model_name, alias="champion")

In [30]:
# Point the "champion" alias at the selected model version
client.set_registered_model_alias(name=model_name, alias="champion", version=model_version)

In [31]:
# Tag the selected model version with the estimator type it wraps
client.set_model_version_tag(
    name=model_name,
    version=model_version,
    key="model_type",
    value="RandomForestClassifier",
)

In [32]:
# Print version, aliases, tags and stage for each entry returned by get_latest_versions.
# NOTE(review): the recorded output shows `aliases: []` even after the "champion"
# alias was set — confirm whether this call is expected to populate aliases.
model_name = "telco_customer_churn"
latest_versions = client.get_latest_versions(name=model_name)

summary_template = "version: {}, aliases: {}, tags: {}, current_stage: {}"
for version in latest_versions:
    print(summary_template.format(version.version, version.aliases, version.tags, version.current_stage))

version: 1, aliases: [], tags: {'model_type': 'RandomForestClassifier'}, current_stage: Production
