In [None]:
import mlflow
import pandas as pd
from matplotlib import pyplot as plt

from hyperopt.pyll import scope
from hyperopt import hp, STATUS_OK, fmin, Trials, tpe

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from sklearn.pipeline import make_pipeline

from sklearn.metrics import (precision_score, recall_score,
                             f1_score, accuracy_score)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Use the SQLite database at this relative path as the MLflow tracking store.
mlflow.set_tracking_uri('sqlite:///../../databases/mlflow.db')
# Make 'Customer_Churn_Predictions' the active experiment for the runs started below.
mlflow.set_experiment('Customer_Churn_Predictions')

In [None]:
# Load the processed churn dataset.
path = '../../data/churn-data/data/processed_data/churn.csv'
data = pd.read_csv(path)

# Target column.
y = data['churn']

# Features as a list of per-row dicts, the input format DictVectorizer expects.
# The orient must be spelled "records": the abbreviated "record" is rejected
# with a ValueError by pandas >= 2.0.
X = data.drop(columns=['churn']).to_dict(orient="records")

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
(train_x, test_x,
    train_y, test_y) = train_test_split(X, y, test_size=0.3, random_state=1993)

In [None]:
def evaluate_model(y_true, y_pred):
    """Compute the classification metrics that are logged to MLflow.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score``, each holding the value returned by
        the scikit-learn function of the same name called with its default
        settings.
    """
    return {
        "accuracy_score": accuracy_score(y_true, y_pred),
        "precision_score": precision_score(y_true, y_pred),
        "recall_score": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
    }

In [None]:
# Linear model: one MLflow run per regularisation strength C.
c_values = range(1, 50, 5)  # C = 1, 6, 11, ..., 46
for val in c_values:
    with mlflow.start_run():
        # Run metadata.
        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('model', 'Logistic Regression')
        mlflow.log_param('C', val)

        # Vectorize the feature dicts, then fit the classifier.
        lr_pipeline = make_pipeline(
            DictVectorizer(sparse=False),
            LogisticRegression(C=val),
        )
        lr_pipeline.fit(train_x, train_y)

        # Score on the held-out split and record the metrics on the run.
        test_pred = lr_pipeline.predict(test_x)
        test_output_eval = evaluate_model(test_y, test_pred)
        mlflow.log_metrics(test_output_eval)
        # Model logging is currently disabled:
        # mlflow.sklearn.log_model(lr_pipeline, artifact_path="models_mlflow")

In [None]:
def objective(params):
    """Hyperopt objective for the decision-tree search.

    Starts an MLflow run, fits a ``DictVectorizer`` + ``DecisionTreeClassifier``
    pipeline on ``train_x``/``train_y`` with the sampled ``params``, scores it
    on ``test_x``/``test_y`` and logs parameters and metrics to the run.

    Args:
        params: Keyword arguments forwarded to ``DecisionTreeClassifier``.

    Returns:
        dict with ``loss`` (the negated F1 score, so that minimising the loss
        maximises F1) and ``status`` (``STATUS_OK``).
    """
    with mlflow.start_run():
        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag('model', 'DecisionTree')
        mlflow.log_params(params)

        model = make_pipeline(
            DictVectorizer(sparse=False),
            DecisionTreeClassifier(**params),
        )
        model.fit(train_x, train_y)

        scores = evaluate_model(test_y, model.predict(test_x))
        mlflow.log_metrics(scores)
        # Model logging is currently disabled:
        # mlflow.sklearn.log_model(model, artifact_path="models_mlflow")

    return {"loss": -scores['f1_score'], 'status': STATUS_OK}

# Hyperparameter search space for the decision tree.
space = {
    "max_depth": hp.randint("max_depth", 1, 15),
    'min_samples_split': hp.randint("min_samples_split", 2, 15),
    'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

# Run 50 TPE-guided evaluations of `objective`; every evaluation is its own MLflow run.
best_result = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [None]:
def random_forest_objective(params):
    """Hyperopt objective for the random-forest search.

    Starts an MLflow run, fits a ``DictVectorizer`` + ``RandomForestClassifier``
    pipeline on ``train_x``/``train_y`` with the sampled ``params``, scores it
    on ``test_x``/``test_y`` and logs parameters and metrics to the run.

    Args:
        params: Keyword arguments forwarded to ``RandomForestClassifier``.

    Returns:
        dict with ``loss`` (the negated F1 score, so that minimising the loss
        maximises F1) and ``status`` (``STATUS_OK``).
    """
    with mlflow.start_run():
        mlflow.set_tag('Developer', 'Godwin')
        # Tag value spelled correctly ('RandomForest', previously 'RandonForest')
        # so that filtering runs on tags.model finds them.
        mlflow.set_tag('model', 'RandomForest')
        mlflow.log_params(params)

        pipeline = make_pipeline(DictVectorizer(sparse=False),
                                 RandomForestClassifier(**params))
        pipeline.fit(train_x, train_y)

        prediction = pipeline.predict(test_x)
        prediction_eval = evaluate_model(test_y, prediction)

        mlflow.log_metrics(prediction_eval)
        # Model logging is currently disabled:
        # mlflow.sklearn.log_model(pipeline, artifact_path="models_mlflow")

    return {"loss": -prediction_eval['f1_score'], 'status': STATUS_OK}


# Hyperparameter search space for the random forest.
space = {
    "n_estimators": hp.choice("n_estimators", [2, 5, 10, 20, 30, 50, 100]),
    # quniform samples floats, so the value is cast to int for scikit-learn.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 5)),
    # Currently disabled dimensions:
    # 'min_samples_split': hp.randint("min_samples_split", 2, 15),
    # 'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

# Run 50 TPE-guided evaluations of `random_forest_objective`.
best_result = fmin(
    fn=random_forest_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [None]:
class XGBoostTrainer:
    """Small fit/predict wrapper around the native ``xgb.train`` API.

    Feature dicts are turned into a dense array by a ``DictVectorizer``
    (created with ``sparse=False``), wrapped in an ``xgb.DMatrix`` and handed
    to XGBoost.
    """

    def __init__(self, params, num_boost_round=1000, early_stopping_rounds=50):
        """Store the training configuration; nothing is trained here.

        Args:
            params: Parameter dict passed as the first argument to ``xgb.train``.
            num_boost_round: Forwarded to ``xgb.train`` as ``num_boost_round``.
            early_stopping_rounds: Forwarded to ``xgb.train`` as
                ``early_stopping_rounds``.
        """
        self.params = params
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        self.booster = None  # populated by fit()
        self.dict_vectorizer = DictVectorizer(sparse=False)

    def fit(self, X, y):
        """Fit the vectorizer on ``X`` and train the booster.

        Args:
            X: Training features in a form ``DictVectorizer`` accepts
                (the notebook passes a list of feature dicts).
            y: Training labels.

        Returns:
            self, with ``self.booster`` set to the trained booster.
        """
        features = self.dict_vectorizer.fit_transform(X)
        dtrain = xgb.DMatrix(features, label=y)

        # The training matrix is the only evaluation set supplied, so the
        # early-stopping check is computed on the training data itself.
        self.booster = xgb.train(
            self.params,
            dtrain=dtrain,
            num_boost_round=self.num_boost_round,
            early_stopping_rounds=self.early_stopping_rounds,
            evals=[(dtrain, 'train')],
            verbose_eval=50,
        )
        # Model logging is currently disabled:
        # mlflow.xgboost.log_model(self.booster, artifact_path='models_mlflow')

        return self

    def predict(self, X):
        """Return the booster's predictions for ``X``.

        ``X`` is transformed with the vectorizer fitted in ``fit``. The output
        is whatever ``Booster.predict`` returns; no thresholding is applied here.
        """
        features = self.dict_vectorizer.transform(X)
        return self.booster.predict(xgb.DMatrix(features))

In [None]:
def objective(params):
    """Hyperopt objective for the XGBoost search.

    Note: this reuses the name ``objective`` and therefore replaces the
    decision-tree objective defined in an earlier cell.

    Starts an MLflow run, trains an ``XGBoostTrainer`` (wrapped in a one-step
    pipeline) on ``train_x``/``train_y``, thresholds its test-set outputs at
    0.5 and logs parameters and metrics to the run.

    Args:
        params: Parameter dict passed to ``XGBoostTrainer``.

    Returns:
        dict with ``loss`` (the negated F1 score) and ``status`` (``STATUS_OK``).
    """
    with mlflow.start_run():
        mlflow.set_tag('Developer', 'Godwin')
        mlflow.set_tag("model", "Xgboost")
        mlflow.log_params(params)

        booster = make_pipeline(XGBoostTrainer(params=params))
        booster.fit(train_x, train_y)

        # Convert the booster's continuous outputs into 0/1 labels.
        raw_output = booster.predict(test_x)
        prediction = (raw_output >= 0.5).astype('int')

        prediction_eval = evaluate_model(test_y, prediction)
        mlflow.log_metrics(prediction_eval)

    return {'loss': -prediction_eval['f1_score'], 'status': STATUS_OK}

# Hyperparameter search space for XGBoost; the last three entries are fixed values.
search_space = {
    # quniform samples floats, so the value is cast to int.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 3)),
    # loguniform bounds are given in log space: exp(-3) .. exp(0).
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # exp(-1) .. exp(3)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 42,
}

# Run 50 TPE-guided evaluations of `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [None]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [None]:
# Print the shell's working directory; the relative '../../' paths used in this notebook depend on it.
! echo $PWD

In [None]:
!pip install prefect==2.14.9 -q

In [None]:
# Same SQLite tracking database that was passed to mlflow.set_tracking_uri earlier in the notebook.
MLFLOW_TRACKING_URI = "sqlite:///../../databases/mlflow.db"
# Client used by the following cells to query runs and manage the model registry.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [None]:
# Fetch up to five active runs whose F1 score exceeds 0.595.
# Results are ordered by ascending F1, so the last run returned has the highest
# score among those returned.
# NOTE(review): the experiment id '1' is hard-coded; presumably it is the id of
# 'Customer_Churn_Predictions' -- confirm against the tracking database.
runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.f1_score >0.595",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.f1_score ASC"]
)

In [None]:
# List the selected runs with their F1 score (four decimal places).
for run in runs:
    print(f"run id: {run.info.run_id}, F1 Score: {run.data.metrics['f1_score']:.4f}")

In [None]:
# Register one model version per selected run under the same registered-model name.
# NOTE(review): the URI points at the run's 'model' artifact path, but every
# log_model call in the training cells of this notebook is commented out (and
# those calls use artifact_path 'models_mlflow') -- confirm that an artifact
# actually exists at this location before relying on the registered versions.
for run in runs:
    run_id = run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri=model_uri, name="Custormer-churn-models")

In [None]:
# Name of the registered model; must match the name passed to mlflow.register_model.
model_name = "Custormer-churn-models"
# Print the version number and current stage of each entry returned by get_latest_versions.
latest_versions = client.get_latest_versions(name=model_name)
for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

In [None]:
# Move version 5 of the registered model to the "Staging" stage.
# archive_existing_versions=False leaves versions already in that stage untouched.
model_version = 5
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

In [None]:
from datetime import datetime


# Record on the model version which stage it was moved to and on which date.
# These repeat the stage and version number used in the transition cell and
# must be kept in sync with it by hand.
new_stage = 'Staging'
version = 5
date = datetime.today().date() 

client.update_model_version(
    name=model_name,
    version=version,
    description=f"The model version {version}  was transitioned to {new_stage} on {date}"
)