In [8]:
import mlflow
import pandas as pd
from matplotlib import pyplot as plt

from hyperopt.pyll import scope
from hyperopt import hp, STATUS_OK, fmin, Trials, tpe

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from sklearn.pipeline import make_pipeline

from sklearn.metrics import (precision_score, recall_score,
                             f1_score, accuracy_score)

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Point MLflow at the SQLite tracking store (the URI is relative to the notebook's
# working directory) and select the experiment that all runs below are logged under.
mlflow.set_tracking_uri(uri='sqlite:///../../databases/mlflow.db')
mlflow.set_experiment(experiment_name='Customer_Churn_Predictions')

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Customer-retention/ChurnGuard/notebooks/mlruns/1', creation_time=1719532937208, experiment_id='1', last_update_time=1719532937208, lifecycle_stage='active', name='Customer_Churn_Predictions', tags={}>

In [11]:
# Load the processed churn dataset (path is relative to the notebook's working directory).
path = '../../data/churn-data/data/processed_data/churn.csv'
data = pd.read_csv(path)

# Separate the target column from the feature columns.
y = data['churn']
X = data.drop(['churn'], axis=1)
# One dict per row, which is what the DictVectorizer steps used below consume.
# The orientation must be spelled out as "records": the abbreviated "record"
# is no longer accepted by pandas 2.x.
X = X.to_dict(orient="records")

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
(train_x, test_x,
    train_y, test_y) = train_test_split(X, y, test_size=0.3, random_state=1993)

In [12]:
def evaluate_model(y_true, y_pred):
    """Score predictions against the true labels with four classification metrics.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict: Metric values keyed by ``"accuracy_score"``, ``"precision_score"``,
        ``"recall_score"`` and ``"f1_score"``.
    """
    # Insertion order of this mapping fixes both the evaluation order and the key order.
    scorers = {
        "accuracy_score": accuracy_score,
        "precision_score": precision_score,
        "recall_score": recall_score,
        "f1_score": f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

In [13]:
# Linear model: one MLflow run per value of the LogisticRegression C parameter
# (1, 6, 11, ..., 46).
c_values = range(1, 50, 5)
for val in c_values:

    with mlflow.start_run():
        mlflow.set_tags({'Developer': 'Godwin', 'model': 'Logistic Regression'})
        mlflow.log_param('C', val)

        # Vectorise the feature dicts into a dense array, then fit the classifier.
        lr_pipeline = make_pipeline(
            DictVectorizer(sparse=False),
            LogisticRegression(C=val),
        )
        lr_pipeline.fit(train_x, train_y)

        # Score on the held-out split and record every metric on the run.
        test_pred = lr_pipeline.predict(test_x)
        test_output_eval = evaluate_model(test_y, test_pred)
        mlflow.log_metrics(test_output_eval)
        # mlflow.sklearn.log_model(lr_pipeline, artifact_path="models_mlflow")

In [14]:
def objective(params):
    """Hyperopt objective: train a decision-tree pipeline and log it as one MLflow run.

    Args:
        params: Keyword arguments forwarded unchanged to ``DecisionTreeClassifier``
            and logged as the run's parameters.

    Returns:
        dict: ``{"loss": <negative held-out F1>, "status": STATUS_OK}``. The F1 is
        negated because hyperopt minimises the loss.
    """
    with mlflow.start_run():
        mlflow.set_tags({'Developer': 'Godwin', 'model': 'DecisionTree'})
        mlflow.log_params(params)

        tree_pipeline = make_pipeline(
            DictVectorizer(sparse=False),
            DecisionTreeClassifier(**params),
        )
        tree_pipeline.fit(train_x, train_y)

        # Evaluate on the held-out split and attach the metrics to the run.
        scores = evaluate_model(test_y, tree_pipeline.predict(test_x))
        mlflow.log_metrics(scores)
        # mlflow.sklearn.log_model(pipeline, artifact_path="models_mlflow")

    return {"loss": -scores['f1_score'], 'status': STATUS_OK}

# Decision-tree search space: three integer-valued dimensions plus the split criterion.
space = {
    "max_depth": hp.randint("max_depth", 1, 15),
    'min_samples_split': hp.randint("min_samples_split", 2, 15),
    'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

# 50 TPE-guided evaluations of `objective`; each evaluation is logged as an MLflow run.
best_result = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

100%|██████████| 50/50 [00:11<00:00,  4.28trial/s, best loss: -0.6195165622202327]


In [15]:
def random_forest_objective(params):
    """Hyperopt objective: train a random-forest pipeline and log it as one MLflow run.

    Args:
        params: Keyword arguments forwarded unchanged to ``RandomForestClassifier``
            and logged as the run's parameters.

    Returns:
        dict: ``{"loss": <negative held-out F1>, "status": STATUS_OK}``. The F1 is
        negated because hyperopt minimises the loss.
    """
    with mlflow.start_run():
        mlflow.set_tag('Developer', 'Godwin')
        # Tag value was misspelled 'RandonForest', which made these runs
        # unreachable when filtering on the real model name.
        mlflow.set_tag('model', 'RandomForest')
        mlflow.log_params(params)

        pipeline = make_pipeline(DictVectorizer(sparse=False),
                                RandomForestClassifier(**params))
        pipeline.fit(train_x, train_y)

        # Evaluate on the held-out split and attach the metrics to the run.
        prediction = pipeline.predict(test_x)
        prediction_eval = evaluate_model(test_y, prediction)

        mlflow.log_metrics(prediction_eval)
        # mlflow.sklearn.log_model(pipeline, artifact_path="models_mlflow")

    return {"loss": -prediction_eval['f1_score'], 'status': STATUS_OK}


# Random-forest search space. Note that this rebinds `space`, replacing the
# decision-tree space defined in the previous search cell.
space = {
    "n_estimators": hp.choice("n_estimators", [2, 5, 10, 20, 30, 50, 100]),
    # quniform yields floats; scope.int casts them to the integers sklearn expects.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 5)),
    # 'min_samples_split': hp.randint("min_samples_split", 2, 15),
    # 'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

# 50 TPE-guided evaluations; each one is logged as an MLflow run by the objective.
best_result = fmin(
    fn=random_forest_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

100%|██████████| 50/50 [00:24<00:00,  2.05trial/s, best loss: -0.591715976331361] 


In [16]:
class XGBoostTrainer():
    """Fit/predict wrapper around the native ``xgb.train`` API for dict-style records.

    Feature dicts are vectorised with a ``DictVectorizer`` and wrapped in
    ``xgb.DMatrix`` objects before being handed to XGBoost.
    """

    def __init__(self, params, num_boost_round=1000, early_stopping_rounds=50):
        """Store the training configuration.

        Args:
            params: Parameter dict passed straight through to ``xgb.train``.
            num_boost_round: Upper bound on the number of boosting rounds.
            early_stopping_rounds: Early-stopping patience forwarded to ``xgb.train``.
        """
        self.params = params
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        # Set by fit(); stays None until the model has been trained.
        self.booster = None
        # sparse=False: the vectorizer returns a dense array, not a scipy sparse matrix.
        self.dict_vectorizer = DictVectorizer(sparse=False)

    def fit(self, X, y):
        """Learn the feature mapping from ``X`` and train the booster.

        Args:
            X: Iterable of feature dicts.
            y: Labels aligned with ``X``.

        Returns:
            XGBoostTrainer: ``self``, so calls can be chained.
        """
        features = self.dict_vectorizer.fit_transform(X)
        train_matrix = xgb.DMatrix(features, label=y)

        # NOTE(review): the only watched set is the training data itself, so early
        # stopping is driven by the training loss rather than a validation split.
        self.booster = xgb.train(
            params=self.params,
            dtrain=train_matrix,
            num_boost_round=self.num_boost_round,
            early_stopping_rounds=self.early_stopping_rounds,
            evals=[(train_matrix, 'train')],
            verbose_eval=50,  # report the eval metric every 50 rounds
        )
        # mlflow.xgboost.log_model(self.booster, artifact_path='models_mlflow')

        return self

    def predict(self, X):
        """Return the booster's raw predictions for ``X``.

        Args:
            X: Iterable of feature dicts; encoded with the mapping learned in ``fit``.

        Returns:
            Whatever ``Booster.predict`` returns for the encoded rows.
        """
        features = self.dict_vectorizer.transform(X)
        return self.booster.predict(xgb.DMatrix(features))

In [18]:
def objective(params):
    """Hyperopt objective: train an ``XGBoostTrainer`` and log it as one MLflow run.

    NOTE: this reuses the name of the decision-tree objective defined earlier in
    the notebook and replaces it once this cell has run.

    Args:
        params: XGBoost parameter dict; logged on the run and passed to
            ``XGBoostTrainer``.

    Returns:
        dict: ``{'loss': <negative held-out F1>, 'status': STATUS_OK}``. The F1 is
        negated because hyperopt minimises the loss.
    """
    with mlflow.start_run():
        mlflow.set_tags({'Developer': 'Godwin', "model": "Xgboost"})
        mlflow.log_params(params)

        model = make_pipeline(XGBoostTrainer(params=params))
        model.fit(train_x, train_y)

        # Threshold the predicted scores at 0.5 to obtain 0/1 class labels.
        raw_scores = model.predict(test_x)
        labels = (raw_scores >= 0.5).astype('int')

        metrics = evaluate_model(test_y, labels)
        mlflow.log_metrics(metrics)

    return {'loss': -metrics['f1_score'], 'status': STATUS_OK}

# XGBoost search space: three tuned dimensions followed by fixed training settings.
search_space = {
    # quniform yields floats; scope.int casts them to integers.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 3)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # Constants: identical for every trial.
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 42,
}

# 50 TPE-guided evaluations of the XGBoost `objective` defined in this cell.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

[0]	train-logloss:0.55770                             
[50]	train-logloss:0.26355                            
[100]	train-logloss:0.20673                           
[150]	train-logloss:0.18073                           
[200]	train-logloss:0.15936                           
[250]	train-logloss:0.14232                           
[300]	train-logloss:0.12862                           
[350]	train-logloss:0.12033                           
[400]	train-logloss:0.11297                           
[450]	train-logloss:0.10431                           
[500]	train-logloss:0.09842                           
[550]	train-logloss:0.09299                           
[600]	train-logloss:0.08888                           
[650]	train-logloss:0.08411                           
[700]	train-logloss:0.08092                           
[750]	train-logloss:0.07710                           
[800]	train-logloss:0.07359                           
[850]	train-logloss:0.07133                           
[900]	trai

In [1]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Print the shell's working directory; the relative paths used in this notebook
# are resolved against it.
! echo $PWD

/home/godwin/Documents/Workflow/Customer-retention/ChurnGuard/notebooks


In [12]:
!pip install prefect==2.14.9 -q

In [3]:
# Client bound to the same SQLite tracking store that the training cells log to.
MLFLOW_TRACKING_URI = "sqlite:///../../databases/mlflow.db"
client = MlflowClient(MLFLOW_TRACKING_URI)

2024/07/04 08:07:45 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/07/04 08:07:45 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


CommandError: Can't locate revision identified by '5b0e9adcef9c'

In [9]:
# Select registration candidates: active runs of experiment '1' whose logged F1
# exceeds 0.595, best first.
runs = client.search_runs(
    experiment_ids=['1'],  # documented as a list of experiment IDs
    filter_string="metrics.f1_score >0.595",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    # DESC so the max_results cap keeps the five highest-F1 runs; ascending order
    # would keep the five lowest scores that still pass the filter.
    order_by=["metrics.f1_score DESC"]
)

In [10]:
# One summary line per selected run: its id and its F1 to four decimals.
for run in runs:
    f1_value = run.data.metrics['f1_score']
    print(f"run id: {run.info.run_id}, F1 Score: {f1_value:.4f}")

In [11]:
# Register the `model` artifact of every selected run under one registry name.
# NOTE(review): the training cells in this notebook have their log_model calls
# commented out (and those use artifact_path "models_mlflow"), so presumably
# no `model` artifact exists for runs created here -- confirm before running.
for run in runs:
    run_id = run.info.run_id
    model_uri = "runs:/" + run_id + "/model"
    mlflow.register_model(model_uri=model_uri, name="Custormer-churn-models")

In [65]:
# List the latest registered version(s) of the model together with their stage.
model_name = "Custormer-churn-models"
latest_versions = client.get_latest_versions(model_name)
for version in latest_versions:
    number, stage = version.version, version.current_stage
    print(f"version: {number}, stage: {stage}")

version: 5, stage: None


In [70]:
# Move version 5 of the registered model to "Staging", leaving any versions
# already in that stage untouched (archive_existing_versions=False).
model_version = 5
new_stage = "Staging"
client.transition_model_version_stage(
    model_name,
    model_version,
    new_stage,
    archive_existing_versions=False,
)

<ModelVersion: aliases=[], creation_timestamp=1692104278613, current_stage='Staging', description=None, last_updated_timestamp=1692104474582, name='Custormer-churn-models', run_id='eb579b6644f94306b371e1dbfb884a44', run_link=None, source='/home/godwin/Documents/Workflow/Churn-Prediction-in-a-Telecom-Company/mlruns/1/eb579b6644f94306b371e1dbfb884a44/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

In [73]:
from datetime import datetime


# Record on the model version when it was moved to its current stage.
new_stage = 'Staging'
version = 5
date = datetime.now().date()  # today's local calendar date

description = f"The model version {version}  was transitioned to {new_stage} on {date}"
client.update_model_version(
    name=model_name,
    version=version,
    description=description,
)

<ModelVersion: aliases=[], creation_timestamp=1692104278613, current_stage='Staging', description='The model version 5  was transitioned to Staging on 2023-08-15', last_updated_timestamp=1692104613905, name='Custormer-churn-models', run_id='eb579b6644f94306b371e1dbfb884a44', run_link=None, source='/home/godwin/Documents/Workflow/Churn-Prediction-in-a-Telecom-Company/mlruns/1/eb579b6644f94306b371e1dbfb884a44/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>