In [2]:
import mlflow
import pandas as pd
from matplotlib import pyplot as plt

from hyperopt.pyll import scope
from hyperopt import hp, STATUS_OK, fmin, Trials, tpe

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from sklearn.pipeline import make_pipeline

from sklearn.metrics import (precision_score, recall_score,
                             f1_score, accuracy_score)

import warnings
warnings.filterwarnings('ignore')

In [3]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('Customer_Churn_Predictions')

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Customer-retention/Churn_Service/notebooks/mlruns/1', creation_time=1703338247948, experiment_id='1', last_update_time=1703338247948, lifecycle_stage='active', name='Customer_Churn_Predictions', tags={}>

In [16]:
path = '../data/processed_data/churn.csv'
data = pd.read_csv(path)

y = data['churn']
X = data.drop(['churn'], axis=1)
X = X.to_dict(orient="record")


(train_x, test_x, 
    train_y, test_y) = train_test_split(X, y, test_size= 0.3, random_state=1993)


In [17]:
def evaluate_model(y_true, y_pred):

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1score = f1_score(y_true, y_pred)

    out = {"accuracy_score" : accuracy, 
            "precision_score" :precision, 
            "recall_score" : recall, 
            "f1_score" : f1score}
    return out

In [22]:

# Linear Model
c_values = range(1, 100, 10)
for val in c_values:

    with mlflow.start_run():
        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('model', 'Logistic Regression')
        mlflow.set_tag('C', val)

        lr_pipeline = make_pipeline(DictVectorizer(sparse= False),
                                    LogisticRegression(C =val))
        lr_pipeline.fit(train_x, train_y)

        test_pred = lr_pipeline.predict(test_x)
        test_output_eval = evaluate_model(test_y, test_pred)
        mlflow.log_metrics(test_output_eval)
        mlflow.sklearn.log_model(lr_pipeline, artifact_path="models_mlflow")

In [23]:
def objective(params):

    with mlflow.start_run():
        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('model', 'DecisionTree')
        mlflow.log_params(params)

        pipeline = make_pipeline(DictVectorizer(sparse=False),
                                DecisionTreeClassifier(**params))
        pipeline.fit(train_x, train_y)

        prediction = pipeline.predict(test_x)
        prediction_eval = evaluate_model(test_y, prediction)   
        
        mlflow.log_metrics(prediction_eval)
        mlflow.sklearn.log_model(pipeline, artifact_path="models_mlflow")
        
    return {"loss": -prediction_eval['f1_score'], 'status': STATUS_OK}

space = {"max_depth": hp.randint("max_depth", 1, 15),
        'min_samples_split': hp.randint("min_samples_split", 2, 15),
        'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
        "criterion": hp.choice("criterion", ["gini", "entropy"]),
        }

best_result = fmin(fn= objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=50,
                    trials=Trials()
                    )

100%|██████████| 50/50 [02:23<00:00,  2.86s/trial, best loss: -0.6195165622202327]


In [24]:
def random_forest_objective(params):

    with mlflow.start_run():
        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('model', 'RandonForest')
        mlflow.log_params(params)

        pipeline = make_pipeline(DictVectorizer(sparse=False),
                                RandomForestClassifier(**params))
        pipeline.fit(train_x, train_y)

        prediction = pipeline.predict(test_x)
        prediction_eval = evaluate_model(test_y, prediction) 

        mlflow.log_metrics(prediction_eval)
        mlflow.sklearn.log_model(pipeline, artifact_path="models_mlflow")
        
    return {"loss": -prediction_eval['f1_score'], 'status': STATUS_OK}


space = {"n_estimators": hp.choice("n_estimators", [2,5,10, 20, 30, 50, 100,]),
            'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 5)),
            #'min_samples_split': hp.randint("min_samples_split", 2, 15),
           # 'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            }

best_result = fmin(fn=random_forest_objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=50,
                    trials=Trials())

100%|██████████| 50/50 [02:28<00:00,  2.96s/trial, best loss: -0.5812441968430826]


In [27]:
class XGBoostTrainer():
    def __init__(self, params, num_boost_round=1000, early_stopping_rounds=50):
        self.params = params
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        self.booster = None
        self.dict_vectorizer = DictVectorizer(sparse=False)

    def fit(self, X, y):
        # Assuming X, y are your training data and labels
        # Convert the input features to a sparse matrix using DictVectorizer
        
        X_sparse = self.dict_vectorizer.fit_transform(X)

        # Create xgb.DMatrix
        dtrain = xgb.DMatrix(X_sparse, label=y)

        # Train the XGBoost model
        self.booster = xgb.train(self.params,
                                 dtrain=dtrain,
                                 num_boost_round=self.num_boost_round,
                                 early_stopping_rounds=self.early_stopping_rounds,
                                 evals=[(dtrain, 'train')],
                                 verbose_eval=50)
        mlflow.xgboost.log_model(self.booster, artifact_path='models_mlflow')

        return self

    def predict(self, X):
        # Convert the input features to a sparse matrix using DictVectorizer
        X_sparse = self.dict_vectorizer.transform(X)

        # Create xgb.DMatrix
        dmatrix = xgb.DMatrix(X_sparse)

        # Use the trained model for predictions
        predictions = self.booster.predict(dmatrix)
        return predictions



In [32]:
def objective(params):

    with mlflow.start_run():

        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag("model", "Xgboost")
        
        mlflow.log_params(params)
        
        booster =  make_pipeline(XGBoostTrainer(params = params))
        
        booster.fit(train_x, train_y)
        prediction = booster.predict(test_x)
        prediction = (prediction >= 0.5).astype('int')
        
        prediction_eval = evaluate_model(test_y, prediction)  
        mlflow.log_metrics(prediction_eval)
        

    return {'loss': -prediction_eval['f1_score'], 'status': STATUS_OK}


In [33]:
search_space = {
            'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 3)),
            'learning_rate': hp.loguniform('learning_rate', -3, 0),
            'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
            'objective': 'binary:logistic',  
            'eval_metric': 'logloss',                                             
            'seed': 42
                }

best_result = fmin(
                fn= objective,
                space=search_space,
                algo=tpe.suggest,
                max_evals=50,
                trials=Trials()
                )

[0]	train-logloss:0.43521                             
[50]	train-logloss:0.19963                            
[100]	train-logloss:0.14533                           
[150]	train-logloss:0.11971                           
[200]	train-logloss:0.10268                           
[250]	train-logloss:0.09109                           
[300]	train-logloss:0.08331                           
[350]	train-logloss:0.07702                           
[400]	train-logloss:0.07219                           
[450]	train-logloss:0.06786                           
[500]	train-logloss:0.06426                           
[550]	train-logloss:0.06146                           
[600]	train-logloss:0.05897                           
[650]	train-logloss:0.05698                           
[700]	train-logloss:0.05501                           
[750]	train-logloss:0.05331                           
[800]	train-logloss:0.05182                           
[850]	train-logloss:0.05046                           
[900]	trai

job exception: 'Pipeline' object has no attribute 'save_model'



  0%|          | 0/50 [00:04<?, ?trial/s, best loss=?]


AttributeError: 'Pipeline' object has no attribute 'save_model'

In [12]:
vectorizer = DictVectorizer(sparse = False)
xgb_train_x = vectorizer.fit_transform(train_x)
xgb_test_x = vectorizer.transform(test_x)

xgb_train = xgb.DMatrix(xgb_train_x, label=train_y)
xgb_valid = xgb.DMatrix(xgb_test_x, label=test_y)


def objective(params):

    with mlflow.start_run():

        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag("model", "Xgboost")
        
        mlflow.log_params(params)
        
        booster = xgb.train(params=params,
                            dtrain=xgb_train,
                            num_boost_round=1000,
                            evals=[(xgb_valid, 'validation')],
                            early_stopping_rounds=50
                            )
        
        prediction = booster.predict(xgb_valid)
        prediction = (prediction >= 0.5).astype('int')
        
        prediction_eval = evaluate_model(test_y, prediction)  
        mlflow.log_metrics(prediction_eval)

    return {'loss': -prediction_eval['f1_score'], 'status': STATUS_OK}

search_space = {
            'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 3)),
            'learning_rate': hp.loguniform('learning_rate', -3, 0),
            'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
            'objective': 'binary:logistic',  
            'eval_metric': 'logloss',                                             
            'seed': 42
                }

best_result = fmin(
                fn= objective,
                space=search_space,
                algo=tpe.suggest,
                max_evals=50,
                trials=Trials()
                )

[0]	validation-logloss:0.47209                        
[1]	validation-logloss:0.47274                        
[2]	validation-logloss:0.49071                        
[3]	validation-logloss:0.50437                        
[4]	validation-logloss:0.51929                        
[5]	validation-logloss:0.52745                        
[6]	validation-logloss:0.53896                        
[7]	validation-logloss:0.55542                        
[8]	validation-logloss:0.56864                        
[9]	validation-logloss:0.57801                        
[10]	validation-logloss:0.58713                       
[11]	validation-logloss:0.59582                       
[12]	validation-logloss:0.60239                       
[13]	validation-logloss:0.61130                       
[14]	validation-logloss:0.61976                       
[15]	validation-logloss:0.62714                       
[16]	validation-logloss:0.63310                       
[17]	validation-logloss:0.63963                       
[18]	valid

In [18]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [19]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.f1_score >0.595",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.f1_score ASC"]
)

In [21]:
for run in runs:
    print(f"run id: {run.info.run_id}, F1 Score: {run.data.metrics['f1_score']:.4f}")

run id: e5744ff8fb8f4e428bb139004150dc9f, F1 Score: 0.5951
run id: 16d9a7136002485fa7d23ecce829e1f8, F1 Score: 0.5958


In [22]:
for run in runs:
    run_id = run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri=model_uri, name="Custormer-churn-models")

Successfully registered model 'Custormer-churn-models'.
Created version '1' of model 'Custormer-churn-models'.
Registered model 'Custormer-churn-models' already exists. Creating a new version of this model...
Created version '2' of model 'Custormer-churn-models'.


In [65]:
model_name = "Custormer-churn-models"
latest_versions = client.get_latest_versions(name=model_name)
for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 5, stage: None


In [70]:
model_version = 5
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1692104278613, current_stage='Staging', description=None, last_updated_timestamp=1692104474582, name='Custormer-churn-models', run_id='eb579b6644f94306b371e1dbfb884a44', run_link=None, source='/home/godwin/Documents/Workflow/Churn-Prediction-in-a-Telecom-Company/mlruns/1/eb579b6644f94306b371e1dbfb884a44/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

In [73]:
from datetime import datetime


new_stage = 'Staging'
version = 5
date = datetime.today().date() 

client.update_model_version(
    name=model_name,
    version=version,
    description=f"The model version {version}  was transitioned to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1692104278613, current_stage='Staging', description='The model version 5  was transitioned to Staging on 2023-08-15', last_updated_timestamp=1692104613905, name='Custormer-churn-models', run_id='eb579b6644f94306b371e1dbfb884a44', run_link=None, source='/home/godwin/Documents/Workflow/Churn-Prediction-in-a-Telecom-Company/mlruns/1/eb579b6644f94306b371e1dbfb884a44/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>