In [67]:
import pandas as pd
import numpy as np

import mlflow
from hyperopt import hp, STATUS_OK, fmin, Trials, tpe
from hyperopt.pyll import scope

from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, KFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('Telcom Churn')
mlflow.sklearn.autolog(True)

2023/07/01 10:07:52 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/07/01 10:07:52 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


In [21]:
def load_data(path):
    data = pd.read_csv(path)
    data.columns = data.columns.str.replace(' ', '_').str.lower()

    categorical_col = data.dtypes[data.dtypes == 'object'].index.tolist()
    for col in categorical_col:
        data[col] = data[col].str.replace(' ', '_').str.lower()

    data = data[data['totalcharges'] != '_']
    data['totalcharges'] = data['totalcharges'].astype('float32')
    return data

In [22]:
def prepare_data(data):
    data['churn'] = (data.churn=='yes').astype(int)
    categorical_col = data.dtypes[data.dtypes == 'object'].index.tolist()
    numerical_col = ['tenure', 'totalcharges', 'monthlycharges']

    categorical_col.remove('customerid')
    train_data, test_data = train_test_split(data, test_size=0.25,
                                            random_state=0)

    train_x = train_data.drop(['churn'], axis = 1)
    test_x = test_data.drop(['churn'], axis = 1)

    train_y = train_data.pop('churn')
    test_y = test_data.pop('churn')

    dv = DictVectorizer(sparse = False)
    dv.fit(train_x[categorical_col + numerical_col].to_dict(orient = 'records'))

    train_x = dv.transform(train_x[categorical_col + numerical_col].to_dict(orient = 'records'))
    test_x = dv.transform(test_x[categorical_col + numerical_col].to_dict(orient = 'records'))
    out = (train_x, train_y, test_x, test_y, dv)
    return out

In [29]:
def log_evaluation(y_true, y_pred):

    accuracy_ = accuracy_score(y_true, y_pred)
    precision_ = precision_score(y_true, y_pred)
    recall_ = recall_score(y_true, y_pred)
    f1score_ = f1_score(y_true, y_pred)

    out = {"test_accuracy_score" : accuracy_, 
    "test_precision_score" :precision_, 
    "test_recall_score" : recall_, 
    "test_f1_score" : f1score_}
    return out

def mlflow_logging(model, model_name, params, eval, 
                   model_tag = None, developer = 'Godwin'):

    train_output_eval, test_output_eval = eval
    with mlflow.start_run():
            
        mlflow.set_tag('Developer', developer)
        mlflow.set_tag('Model Type', model_tag)
        mlflow.set_tag('Model Name', model_name)
        mlflow.log_params(params)
        
        mlflow.log_metrics(train_output_eval)
        mlflow.log_metrics(test_output_eval)
        mlflow.sklearn.log_model(model)
        
        mlflow.log_artifact('new_data.csv', 'data.csv')

In [35]:
path = './data/Telco-Customer-Churn.csv'
data = load_data(path)
train_x, train_y, test_x, \
        test_y, dv = prepare_data(data)

# Linear Model
c_values = range(1, 100, 10)
for c_value in c_values:

    with mlflow.start_run():
    
        mlflow.set_tag('Developer', 'Godwin')

        model = LogisticRegression(C = c_value)
        model.fit(train_x, train_y)

        test_pred = model.predict(test_x)
        test_output_eval = log_evaluation(test_y, test_pred)
        mlflow.log_metrics(test_output_eval)



In [39]:
def single_tree_objective(params):
    with mlflow.start_run():

        mlflow.set_tag('Developer', 'Godwin')

        model = DecisionTreeClassifier(**params)
        model.fit(train_x, train_y)

        test_pred = model.predict(test_x)
        test_output_eval = log_evaluation(test_y, test_pred)   
        
        mlflow.log_metrics(test_output_eval)
        
    return {"loss": -test_output_eval['test_accuracy_score'], 'status': STATUS_OK}

def random_forest_objective(params):
    with mlflow.start_run():

        mlflow.set_tag('Developer', 'Godwin')

        model = RandomForestClassifier(**params)
        model.fit(train_x, train_y)

        test_pred = model.predict(test_x)
        test_output_eval = log_evaluation(test_y, test_pred)   
        mlflow.log_metrics(test_output_eval)
        
    return {"loss": -test_output_eval['test_accuracy_score'], 'status': STATUS_OK}

def single_tree():

    space = {"max_depth": hp.randint("max_depth", 1, 15),
            'min_samples_split': hp.randint("min_samples_split", 2, 15),
            'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            }

    best_result = fmin(fn= single_tree_objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=50,
                        trials=Trials()
                        )
    return best_result

def random_forest(): 

    space = {"n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
             'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
             'min_samples_split': hp.randint("min_samples_split", 2, 15),
             'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
             "criterion": hp.choice("criterion", ["gini", "entropy"]),
             }

    best_result = fmin(fn=random_forest_objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=50,
                        trials=Trials()
                        )
    return best_result


In [37]:
single_tree()

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




  2%|▏         | 1/50 [00:03<02:51,  3.50s/trial, best loss: -0.7792946530147895]




  4%|▍         | 2/50 [00:07<03:03,  3.82s/trial, best loss: -0.7792946530147895]




  6%|▌         | 3/50 [00:10<02:46,  3.54s/trial, best loss: -0.7792946530147895]




  8%|▊         | 4/50 [00:14<02:43,  3.56s/trial, best loss: -0.7872582480091013]




 10%|█         | 5/50 [00:18<02:42,  3.61s/trial, best loss: -0.7872582480091013]




 12%|█▏        | 6/50 [00:21<02:42,  3.70s/trial, best loss: -0.7940841865756542]




 14%|█▍        | 7/50 [00:25<02:41,  3.75s/trial, best loss: -0.7940841865756542]




 16%|█▌        | 8/50 [00:29<02:33,  3.65s/trial, best loss: -0.7940841865756542]




 18%|█▊        | 9/50 [00:33<02:32,  3.72s/trial, best loss: -0.7940841865756542]




 20%|██        | 10/50 [00:36<02:24,  3.62s/trial, best loss: -0.7940841865756542]




 22%|██▏       | 11/50 [00:40<02:22,  3.66s/trial, best loss: -0.7940841865756542]




 24%|██▍       | 12/50 [00:43<02:18,  3.65s/trial, best loss: -0.7940841865756542]




 26%|██▌       | 13/50 [00:48<02:21,  3.83s/trial, best loss: -0.7940841865756542]




 28%|██▊       | 14/50 [00:51<02:15,  3.77s/trial, best loss: -0.7940841865756542]




 30%|███       | 15/50 [00:55<02:08,  3.67s/trial, best loss: -0.7940841865756542]




 32%|███▏      | 16/50 [00:58<02:01,  3.56s/trial, best loss: -0.7940841865756542]




 34%|███▍      | 17/50 [01:01<01:54,  3.47s/trial, best loss: -0.7963594994311718]




 36%|███▌      | 18/50 [01:05<01:55,  3.61s/trial, best loss: -0.7963594994311718]




 38%|███▊      | 19/50 [01:09<01:49,  3.53s/trial, best loss: -0.7963594994311718]




 40%|████      | 20/50 [01:12<01:44,  3.47s/trial, best loss: -0.7963594994311718]




 42%|████▏     | 21/50 [01:16<01:44,  3.60s/trial, best loss: -0.7963594994311718]




 44%|████▍     | 22/50 [01:19<01:42,  3.64s/trial, best loss: -0.7963594994311718]




 46%|████▌     | 23/50 [01:23<01:35,  3.52s/trial, best loss: -0.7963594994311718]




 48%|████▊     | 24/50 [01:27<01:35,  3.68s/trial, best loss: -0.7963594994311718]




 50%|█████     | 25/50 [01:30<01:28,  3.55s/trial, best loss: -0.7963594994311718]




 52%|█████▏    | 26/50 [01:33<01:23,  3.48s/trial, best loss: -0.7963594994311718]




 54%|█████▍    | 27/50 [01:37<01:18,  3.41s/trial, best loss: -0.7963594994311718]




 56%|█████▌    | 28/50 [01:41<01:18,  3.57s/trial, best loss: -0.7963594994311718]




 58%|█████▊    | 29/50 [01:45<01:20,  3.82s/trial, best loss: -0.7963594994311718]




 60%|██████    | 30/50 [01:48<01:13,  3.66s/trial, best loss: -0.7963594994311718]




 62%|██████▏   | 31/50 [01:52<01:07,  3.57s/trial, best loss: -0.7963594994311718]




 64%|██████▍   | 32/50 [01:55<01:03,  3.54s/trial, best loss: -0.7963594994311718]




 66%|██████▌   | 33/50 [01:59<01:00,  3.53s/trial, best loss: -0.7963594994311718]




 68%|██████▊   | 34/50 [02:03<00:59,  3.72s/trial, best loss: -0.7963594994311718]




 70%|███████   | 35/50 [02:07<00:59,  3.95s/trial, best loss: -0.7963594994311718]




 72%|███████▏  | 36/50 [02:10<00:52,  3.75s/trial, best loss: -0.7974971558589306]




 74%|███████▍  | 37/50 [02:14<00:46,  3.57s/trial, best loss: -0.7974971558589306]




 76%|███████▌  | 38/50 [02:17<00:41,  3.48s/trial, best loss: -0.7974971558589306]




 78%|███████▊  | 39/50 [02:20<00:37,  3.43s/trial, best loss: -0.7974971558589306]




 80%|████████  | 40/50 [02:23<00:33,  3.38s/trial, best loss: -0.7974971558589306]




 82%|████████▏ | 41/50 [02:27<00:30,  3.35s/trial, best loss: -0.7974971558589306]




 84%|████████▍ | 42/50 [02:30<00:26,  3.32s/trial, best loss: -0.7986348122866894]




 86%|████████▌ | 43/50 [02:33<00:23,  3.29s/trial, best loss: -0.7986348122866894]




 88%|████████▊ | 44/50 [02:37<00:19,  3.30s/trial, best loss: -0.7986348122866894]




 90%|█████████ | 45/50 [02:40<00:16,  3.28s/trial, best loss: -0.7986348122866894]




 92%|█████████▏| 46/50 [02:43<00:13,  3.28s/trial, best loss: -0.7986348122866894]




 94%|█████████▍| 47/50 [02:46<00:09,  3.29s/trial, best loss: -0.7986348122866894]




 96%|█████████▌| 48/50 [02:50<00:06,  3.36s/trial, best loss: -0.7986348122866894]




 98%|█████████▊| 49/50 [02:54<00:03,  3.45s/trial, best loss: -0.7986348122866894]




100%|██████████| 50/50 [02:57<00:00,  3.55s/trial, best loss: -0.7986348122866894]


{'criterion': 1, 'max_depth': 7, 'min_samples_leaf': 9, 'min_samples_split': 5}

In [40]:
random_forest()

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




  2%|▏         | 1/50 [00:09<07:24,  9.06s/trial, best loss: -0.8065984072810012]




  4%|▍         | 2/50 [00:20<08:18, 10.39s/trial, best loss: -0.8100113765642776]




  6%|▌         | 3/50 [00:29<07:33,  9.65s/trial, best loss: -0.8100113765642776]




  8%|▊         | 4/50 [00:40<07:46, 10.14s/trial, best loss: -0.810580204778157] 




 10%|█         | 5/50 [00:48<07:15,  9.68s/trial, best loss: -0.810580204778157]




 12%|█▏        | 6/50 [01:00<07:27, 10.16s/trial, best loss: -0.810580204778157]




 14%|█▍        | 7/50 [01:06<06:22,  8.91s/trial, best loss: -0.810580204778157]




 16%|█▌        | 8/50 [01:16<06:27,  9.22s/trial, best loss: -0.810580204778157]




 18%|█▊        | 9/50 [01:27<06:44,  9.86s/trial, best loss: -0.810580204778157]




 20%|██        | 10/50 [01:40<07:07, 10.69s/trial, best loss: -0.810580204778157]




 22%|██▏       | 11/50 [01:47<06:15,  9.62s/trial, best loss: -0.810580204778157]




 24%|██▍       | 12/50 [01:53<05:32,  8.75s/trial, best loss: -0.810580204778157]




 26%|██▌       | 13/50 [02:03<05:33,  9.01s/trial, best loss: -0.810580204778157]




 28%|██▊       | 14/50 [02:09<04:52,  8.11s/trial, best loss: -0.810580204778157]




 30%|███       | 15/50 [02:14<04:05,  7.01s/trial, best loss: -0.810580204778157]




 32%|███▏      | 16/50 [02:21<04:06,  7.26s/trial, best loss: -0.810580204778157]




 34%|███▍      | 17/50 [02:31<04:22,  7.95s/trial, best loss: -0.810580204778157]




 36%|███▌      | 18/50 [02:42<04:44,  8.88s/trial, best loss: -0.810580204778157]




 38%|███▊      | 19/50 [02:47<03:57,  7.67s/trial, best loss: -0.8156996587030717]




 40%|████      | 20/50 [02:53<03:36,  7.23s/trial, best loss: -0.8156996587030717]




 42%|████▏     | 21/50 [02:58<03:05,  6.41s/trial, best loss: -0.8156996587030717]




 44%|████▍     | 22/50 [03:02<02:42,  5.82s/trial, best loss: -0.8156996587030717]




 46%|████▌     | 23/50 [03:14<03:27,  7.68s/trial, best loss: -0.8156996587030717]




 48%|████▊     | 24/50 [03:19<02:55,  6.73s/trial, best loss: -0.8156996587030717]




 50%|█████     | 25/50 [03:23<02:32,  6.10s/trial, best loss: -0.8156996587030717]




 52%|█████▏    | 26/50 [03:28<02:16,  5.70s/trial, best loss: -0.8156996587030717]




 54%|█████▍    | 27/50 [03:33<02:04,  5.43s/trial, best loss: -0.8156996587030717]




 56%|█████▌    | 28/50 [03:38<01:56,  5.28s/trial, best loss: -0.8156996587030717]




 58%|█████▊    | 29/50 [03:43<01:48,  5.16s/trial, best loss: -0.8156996587030717]




 60%|██████    | 30/50 [03:49<01:52,  5.61s/trial, best loss: -0.8156996587030717]




 62%|██████▏   | 31/50 [03:58<02:06,  6.65s/trial, best loss: -0.8156996587030717]




 64%|██████▍   | 32/50 [04:03<01:47,  5.99s/trial, best loss: -0.8156996587030717]




 66%|██████▌   | 33/50 [04:12<01:58,  6.96s/trial, best loss: -0.8156996587030717]




 68%|██████▊   | 34/50 [04:16<01:39,  6.19s/trial, best loss: -0.8156996587030717]




 70%|███████   | 35/50 [04:23<01:33,  6.23s/trial, best loss: -0.8156996587030717]




 72%|███████▏  | 36/50 [04:32<01:39,  7.13s/trial, best loss: -0.8156996587030717]




 74%|███████▍  | 37/50 [04:39<01:33,  7.18s/trial, best loss: -0.8156996587030717]




 76%|███████▌  | 38/50 [04:44<01:17,  6.48s/trial, best loss: -0.8156996587030717]




 78%|███████▊  | 39/50 [04:57<01:31,  8.32s/trial, best loss: -0.8156996587030717]




 80%|████████  | 40/50 [05:01<01:11,  7.20s/trial, best loss: -0.8156996587030717]




 82%|████████▏ | 41/50 [05:07<01:00,  6.77s/trial, best loss: -0.8156996587030717]




 84%|████████▍ | 42/50 [05:16<00:58,  7.30s/trial, best loss: -0.8156996587030717]




 86%|████████▌ | 43/50 [05:28<01:02,  8.89s/trial, best loss: -0.8156996587030717]




 88%|████████▊ | 44/50 [05:35<00:50,  8.41s/trial, best loss: -0.8156996587030717]




 90%|█████████ | 45/50 [05:46<00:45,  9.04s/trial, best loss: -0.8156996587030717]




 92%|█████████▏| 46/50 [05:51<00:31,  7.76s/trial, best loss: -0.8156996587030717]




 94%|█████████▍| 47/50 [05:55<00:20,  6.78s/trial, best loss: -0.8156996587030717]




 96%|█████████▌| 48/50 [06:03<00:13,  6.98s/trial, best loss: -0.8156996587030717]




 98%|█████████▊| 49/50 [06:08<00:06,  6.61s/trial, best loss: -0.8156996587030717]




100%|██████████| 50/50 [06:19<00:00,  7.58s/trial, best loss: -0.8156996587030717]


{'criterion': 1,
 'max_depth': 59.0,
 'min_samples_leaf': 5,
 'min_samples_split': 3,
 'n_estimators': 0}

In [42]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [53]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.test_f1_score >0.6",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.test_f1_score ASC"]
)

In [54]:
for run in runs:
    print(f"run id: {run.info.run_id}, F1 Score: {run.data.metrics['test_f1_score']:.4f}")

run id: b5eb75916ba045bc99b4902ea507a319, F1 Score: 0.6012


In [55]:
run_id = "b5eb75916ba045bc99b4902ea507a319"

model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="Custormer-churn-models")

model_name = "Custormer-churn-models"
latest_versions = client.get_latest_versions(name=model_name)

2023/07/01 11:26:21 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/07/01 11:26:21 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'Custormer-churn-models'.
2023/07/01 11:26:21 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Custormer-churn-models, version 1
Created version '1' of model 'Custormer-churn-models'.


In [57]:
for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: None


In [58]:
model_version = 1
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1688225181126, current_stage='Staging', description=None, last_updated_timestamp=1688226020087, name='Custormer-churn-models', run_id='b5eb75916ba045bc99b4902ea507a319', run_link=None, source='/home/godwin/Documents/Workflow/Churn-Prediction-in-a-Telecom-Company/mlruns/1/b5eb75916ba045bc99b4902ea507a319/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [59]:
from datetime import datetime

date = datetime.today().date() 

client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1688225181126, current_stage='Staging', description='The model version 1 was transitioned to Staging on 2023-07-01', last_updated_timestamp=1688226074603, name='Custormer-churn-models', run_id='b5eb75916ba045bc99b4902ea507a319', run_link=None, source='/home/godwin/Documents/Workflow/Churn-Prediction-in-a-Telecom-Company/mlruns/1/b5eb75916ba045bc99b4902ea507a319/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [None]:
from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df



def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)


def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [64]:
import mlflow
logged_model = 'runs:/b5eb75916ba045bc99b4902ea507a319/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# # Predict on a Pandas DataFrame.
# import pandas as pd
# loaded_model.predict(pd.DataFrame(data))

 - mlflow (current: 2.4.1, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [63]:
loaded_model.predict(train_x)

array([0, 0, 1, ..., 0, 0, 0])

In [62]:
loaded_model

'runs:/b5eb75916ba045bc99b4902ea507a319/model'

In [65]:
client.delete_model_version(model_name, 1)

In [66]:
client.delete_registered_model(model_name)

In [None]:
train = xgb.DMatrix(train_x, label=train_y)
valid = xgb.DMatrix(test_x, label=test_y)

def xgboost_objective(params):
    with mlflow.start_run():

        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('Model Type', 'Base Model')
        mlflow.set_tag("model", "Xgboost")
        mlflow.log_params(params)
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        train_pred = model.predict(train)
        train_output_eval = evaluation(train_y, train_pred, 'train')
        test_pred = model.predict(valid)
        test_output_eval = evaluation(test_y, test_pred, 'test')   
        data.to_csv('new_data.csv', header=False)    
        mlflow.log_metrics(train_output_eval)
        mlflow.log_metrics(test_output_eval)
        mlflow.log_artifact('new_data.csv', 'data.csv')
    return {'loss': test_output_eval['Test accuracy Score'], 'status': STATUS_OK}