In [None]:
import yaml
import mlflow
import pandas as pd

from hyperopt.pyll import scope
from hyperopt import hp, STATUS_OK, fmin, Trials, tpe

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from sklearn.pipeline import make_pipeline

from sklearn.metrics import (precision_score, recall_score,
                             f1_score, accuracy_score)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the experiment configuration once; later cells index into `configurations`.
with open("../configs/params.yaml") as config:
    configurations = yaml.safe_load(config)

# Shared settings from the "base" section, unpacked in a single statement.
model_name, seed, developer = (
    configurations["base"][key] for key in ("model", "random_state", "developer")
)

In [None]:
# Configure MLflow from the "tracking" section of the config: first the
# tracking URI, then the experiment name.
mlflow.set_tracking_uri(configurations["tracking"]["tracking_url"])
mlflow.set_experiment(configurations["tracking"]["experiment_name"])

In [None]:
# Load the dataset from the path given in the config and preview the first rows.
data = pd.read_csv(configurations['data']['data_path'])
data.head()

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
def process_training_data(data):
    """Split a frame into train/test feature records and `churn` targets.

    The `churn` column is used as the target. Every other column is turned
    into a list of per-row dicts (``orient="records"``). The input frame is
    copied first, so the caller's frame is not modified.

    :param data: DataFrame containing a `churn` column.
    :return: whatever ``train_test_split`` returns for ``(records, target)``,
        using the module-level ``configurations['data']['test_size']`` and
        ``seed``.
    """
    frame = data.copy()
    target = frame["churn"]
    records = frame.drop(columns=["churn"]).to_dict(orient="records")

    return train_test_split(
        records,
        target,
        test_size=configurations["data"]["test_size"],
        random_state=seed,
    )

In [None]:
# Split once at module level; the training and evaluation code below reads
# these four globals directly.
(train_x, test_x, 
        train_y, test_y) = process_training_data(data)

In [None]:
def evaluate_model(y_true, y_pred):
    """Score predictions with accuracy, precision, recall and F1.

    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :return: dict with keys ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score``, in that order.
    """
    scorers = (
        ("accuracy_score", accuracy_score),
        ("precision_score", precision_score),
        ("recall_score", recall_score),
        ("f1_score", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

In [None]:
# Linear Model

def linear_model():
    """Train one logistic-regression pipeline per configured ``C`` and log it to MLflow.

    Reads ``min_c``, ``max_c`` and ``interval`` from
    ``configurations['hyperparameters']['linear_model']`` and iterates over
    ``range(min_c, max_c, interval)``. Each value gets its own MLflow run,
    which records the ``developer`` and ``model_name`` tags, the ``c``
    parameter, the test-set metrics from ``evaluate_model`` and the fitted
    pipeline.

    Uses the module-level ``train_x``/``train_y``/``test_x``/``test_y`` split.

    NOTE(review): the values come from ``range``, so the config entries must
    be integers; scikit-learn requires ``C`` to be positive, so ``min_c``
    should be at least 1 - confirm against the config.
    """
    lr_params = configurations['hyperparameters']['linear_model']
    c_values = range(lr_params['min_c'], lr_params['max_c'], lr_params['interval'])
    artifact_path = f"{configurations['base']['artifact_path']}/models"

    for val in c_values:
        with mlflow.start_run():
            mlflow.set_tag('developer', developer)
            # The estimator is LogisticRegression, so label the run as such
            # (it was previously tagged "linear Regression").
            mlflow.set_tag('model_name', "logisticregression")
            mlflow.log_param('c', val)

            lr_pipeline = make_pipeline(DictVectorizer(sparse=False),
                                        LogisticRegression(C=val))
            lr_pipeline.fit(train_x, train_y)

            test_pred = lr_pipeline.predict(test_x)
            test_output_eval = evaluate_model(test_y, test_pred)
            mlflow.log_metrics(test_output_eval)
            mlflow.sklearn.log_model(lr_pipeline, artifact_path=artifact_path)
    print("Successfully Trained Logistic Regression Models")

In [None]:
class Tree():
    """Hyperopt search over scikit-learn tree models, one MLflow run per trial.

    ``configurations`` is the tree-model hyperparameter section of the config.
    ``inference`` reads the keys ``criterion``, ``min_depth``/``max_depth``,
    ``min_sample_split``/``max_sample_split`` and
    ``min_sample_leaf``/``max_sample_leaf`` from it.

    Training and evaluation use the module-level ``train_x``/``train_y``/
    ``test_x``/``test_y`` split, and the ``developer`` and ``seed`` globals.
    """

    def __init__(self, configurations):
        self.config = configurations

    def _build_pipeline(self, model_name, hyperparams):
        """Return an unfitted DictVectorizer + tree-estimator pipeline.

        :raises ValueError: if ``model_name`` is not ``"decisiontree"`` or
            ``"randomforest"``.
        """
        if model_name == "decisiontree":
            estimator_cls = DecisionTreeClassifier
        elif model_name == "randomforest":
            estimator_cls = RandomForestClassifier
        else:
            # Previously this only printed and then failed on an unbound
            # `pipeline` variable; fail loudly with the same message instead.
            raise ValueError(f"{model_name} does not exist in models")

        # Seed the estimator so a trial is reproducible, as the data split and
        # the XGBoost search already are. An explicit `random_state` in the
        # sampled hyperparameters still wins.
        estimator = estimator_cls(**{"random_state": seed, **hyperparams})
        return make_pipeline(DictVectorizer(sparse=False), estimator)

    def objective(self, params):
        """Hyperopt objective: fit one pipeline, log it to MLflow, return the loss.

        :param params: sampled search point; must contain ``model_name`` plus
            the estimator keyword arguments. The dict is not modified.
        :return: ``{"loss": -f1_score, "status": STATUS_OK}`` (hyperopt
            minimises, so the F1 score is negated).
        :raises ValueError: for an unknown ``model_name``; raised before any
            MLflow run is opened.
        """
        model_name = params["model_name"]
        # Work on a copy so the dict handed over by hyperopt stays intact.
        hyperparams = {key: value for key, value in params.items() if key != "model_name"}
        pipeline = self._build_pipeline(model_name, hyperparams)

        with mlflow.start_run():
            mlflow.set_tag('developer', developer)
            mlflow.set_tag('model_name', model_name)
            mlflow.log_params(hyperparams)

            pipeline.fit(train_x, train_y)
            prediction = pipeline.predict(test_x)
            prediction_eval = evaluate_model(test_y, prediction)

            mlflow.log_metrics(prediction_eval)

        return {"loss": -prediction_eval['f1_score'], 'status': STATUS_OK}

    def inference(self, model_name, max_evals=50):
        """Run the hyperparameter search for ``model_name``.

        Despite the name, this performs training/search, not prediction.

        :param model_name: ``"decisiontree"`` or ``"randomforest"``.
        :param max_evals: number of hyperopt trials (default 50).
        :return: the dict returned by ``fmin``. Hyperopt reports ``hp.choice``
            entries (here ``criterion``) as an index into the option list,
            not as the option itself.
        """
        cfg = self.config

        space = {
            "max_depth": hp.randint("max_depth", cfg['min_depth'], cfg['max_depth']),
            "min_samples_split": hp.randint("min_samples_split",
                                            cfg['min_sample_split'], cfg['max_sample_split']),
            "min_samples_leaf": hp.randint("min_samples_leaf",
                                           cfg['min_sample_leaf'], cfg['max_sample_leaf']),
            "criterion": hp.choice("criterion", cfg['criterion']),
            "model_name": model_name,
        }

        best_result = fmin(fn=self.objective,
                           space=space,
                           algo=tpe.suggest,
                           max_evals=max_evals,
                           trials=Trials())
        return best_result

In [None]:
class XGBoost():
    """XGBoost booster wrapper with a hyperopt search logged to MLflow.

    ``params`` is the XGBoost section of the config. ``inference`` reads the
    keys ``objective``, ``eval_metric``, ``min_learning_rate``/
    ``max_learning_rate``, ``min_depth``/``max_depth`` and
    ``min_child_weight``/``max_child_weight`` from it.

    Training and evaluation inside ``objective`` use the module-level
    ``train_x``/``train_y``/``test_x``/``test_y`` split.
    """

    def __init__(self, params, num_boost_round=1000, early_stopping_rounds=50):
        self.params = params
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        self.booster = None
        self.vectorizer = DictVectorizer(sparse=False)

    def fit(self, x, y, params=None):
        """Fit the vectorizer and train a booster.

        :param x: list of feature dicts.
        :param y: labels.
        :param params: XGBoost training parameters; defaults to
            ``self.params`` when omitted (the previous behaviour).
        """
        train_params = self.params if params is None else params

        # DictVectorizer was built with sparse=False, so this is a dense matrix.
        features = self.vectorizer.fit_transform(x)

        dtrain = xgb.DMatrix(features, label=y)
        # Note: the only evaluation set is the training data itself, so early
        # stopping monitors the training metric.
        self.booster = xgb.train(train_params,
                                 dtrain=dtrain,
                                 num_boost_round=self.num_boost_round,
                                 early_stopping_rounds=self.early_stopping_rounds,
                                 evals=[(dtrain, 'train')],
                                 verbose_eval=50)

    def objective(self, params):
        """Hyperopt objective: train with the sampled point and log the run.

        :param params: sampled search point; must contain ``model_name`` plus
            the XGBoost training parameters. The dict is not modified.
        :return: ``{"loss": -f1_score, "status": STATUS_OK}``.
        """
        model_name = params["model_name"]
        # Work on a copy so the dict handed over by hyperopt stays intact.
        hyperparams = {key: value for key, value in params.items() if key != "model_name"}

        with mlflow.start_run():
            # Same tag keys as the other trainers, so run summaries that read
            # the "model_name" tag can identify these runs.
            mlflow.set_tag('developer', developer)
            mlflow.set_tag('model_name', model_name)
            mlflow.log_params(hyperparams)

            # Train with the sampled hyperparameters. Previously `fit` always
            # used `self.params` (the search bounds), so every trial trained
            # the same model.
            self.fit(train_x, train_y, params=hyperparams)
            prediction = self.predict(test_x)
            # Assumes the booster outputs probabilities (e.g. a logistic
            # objective) - TODO confirm against the configured objective.
            prediction = (prediction >= 0.5).astype('int')

            prediction_eval = evaluate_model(test_y, prediction)
            mlflow.log_metrics(prediction_eval)
        return {'loss': -prediction_eval['f1_score'], 'status': STATUS_OK}

    def inference(self, model_name, max_evals=50):
        """Run the hyperparameter search.

        Despite the name, this performs training/search, not prediction.

        :param model_name: label stored in the ``model_name`` tag of each run.
        :param max_evals: number of hyperopt trials (default 50).
        :return: the dict returned by ``fmin``.
        """
        cfg = self.params

        # NOTE(review): hp.loguniform draws exp(uniform(low, high)), so the
        # learning-rate and child-weight bounds are interpreted in log space -
        # confirm the config values are meant that way.
        search_space = {
            'max_depth': scope.int(hp.quniform('max_depth', cfg['min_depth'], cfg['max_depth'], 3)),
            'learning_rate': hp.loguniform('learning_rate',
                                           cfg["min_learning_rate"], cfg["max_learning_rate"]),
            'min_child_weight': hp.loguniform('min_child_weight',
                                              cfg['min_child_weight'], cfg['max_child_weight']),
            'objective': cfg['objective'],
            'eval_metric': cfg["eval_metric"],
            'seed': seed,
            "model_name": model_name,
        }

        best_result = fmin(fn=self.objective,
                           space=search_space,
                           algo=tpe.suggest,
                           max_evals=max_evals,
                           trials=Trials())
        return best_result

    def predict(self, X):
        """Return the booster's raw predictions for a list of feature dicts.

        :raises RuntimeError: if called before ``fit``.
        """
        if self.booster is None:
            raise RuntimeError("XGBoost model has not been fitted; call fit() first")

        features = self.vectorizer.transform(X)
        dmatrix = xgb.DMatrix(features)
        return self.booster.predict(dmatrix)

In [None]:
# Hyperparameter search for the tree models: build the searcher from the
# "tree_models" section of the config and run it for the random forest.
params = configurations['hyperparameters']['tree_models']
# NOTE(review): `models` is not used by the code visible in this notebook.
models = configurations['base']['model']


tree = Tree(params)
result = tree.inference("randomforest")

In [None]:
# Hyperparameter search for XGBoost. This rebinds `params` and `result` from
# the tree-model cell above.
params = configurations['hyperparameters']['xgboost']

xgboost = XGBoost(params)
result = xgboost.inference("xgboost")

## Model Registry

In [None]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import pandas as pd

# NOTE(review): this URI is hard-coded, while the training cells take the
# tracking URI from configurations["tracking"]["tracking_url"] - confirm both
# point at the same store.
MLFLOW_TRACKING_URI = "sqlite:///../databases/mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Module-level accumulator that extract_top_5() appends one dict per run to.
run_data = []

def extract_top_5(top_n=5):
    """Summarise every active run and delete all but the best ``top_n``.

    Runs from ALL experiments returned by ``client.search_experiments()`` are
    ranked together by ``f1_score`` (descending), then by run duration
    (ascending). Every run outside the top ``top_n`` is deleted through
    ``client.delete_run``.

    :param top_n: number of runs to keep (default 5).
    :return: DataFrame with one row per run seen, including the rows of runs
        that were just deleted. The ``inference_time`` column holds the run's
        wall-clock duration in seconds (end minus start), or NaN when the run
        has no start or end time.
    :raises ValueError: if ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Reset the shared accumulator so repeated calls do not stack duplicate rows.
    run_data.clear()

    for experiment in client.search_experiments():
        experiment_id = experiment.experiment_id

        runs = client.search_runs(
            experiment_ids=[experiment_id],
            filter_string="",
            run_view_type=ViewType.ACTIVE_ONLY
        )

        for run in runs:
            metrics = run.data.metrics
            start_time, end_time = run.info.start_time, run.info.end_time
            if start_time is None or end_time is None:
                # A run that has not finished has no end time; record the
                # duration as missing instead of failing on None arithmetic.
                inference_time = float("nan")
            else:
                inference_time = (end_time - start_time) / 1000

            run_data.append({
                "run_id": run.info.run_id,
                "experiment_id": experiment_id,
                "f1_score": metrics.get("f1_score", 0),
                "accuracy_score": metrics.get("accuracy_score", 0),
                "precision_score": metrics.get("precision_score", 0),
                "recall_score": metrics.get("recall_score", 0),
                "inference_time": inference_time,
                "params": run.data.params,
                "tags": run.data.tags.get("model_name", "unknown")
                })

    df = pd.DataFrame(run_data)
    print(df.shape)

    if df.empty:
        print("No runs found.")
        return df

    # Best F1 first; ties are broken by the shorter duration (NaN sorts last).
    df_sorted = df.sort_values(by=["f1_score", "inference_time"], ascending=[False, True])
    runs_to_keep = set(df_sorted.head(top_n)["run_id"])
    runs_to_delete = set(df["run_id"]) - runs_to_keep

    for run_id in runs_to_delete:
        client.delete_run(run_id)

    print(f"Deleted {len(runs_to_delete)} runs, keeping only the top {top_n}.")
    return df

# Runs the pruning immediately: every run outside the top 5 is deleted through
# the client. `df` still lists all runs that were seen.
df = extract_top_5()

In [None]:
def register_model(modelname, run_id):
    """Register the model logged by ``run_id`` as a new version of ``modelname``.

    The registered model is created on first use. If it already exists, a new
    version is added to it instead of failing.

    NOTE(review): the source URI assumes the run logged its model under the
    artifact path ``models_mlflow``. In this notebook the tree and XGBoost
    ``log_model`` calls are commented out and ``linear_model`` logs under a
    config-derived path - confirm the run actually has this artifact.

    :param modelname: name of the registered model.
    :param run_id: MLflow run whose logged model becomes the new version.
    """
    model_uri = f"runs:/{run_id}/models_mlflow"

    try:
        client.create_registered_model(name=modelname)
    except mlflow.exceptions.MlflowException as exc:
        # Registering a second version of an existing model is the normal
        # case; anything else is a real error.
        if exc.error_code != "RESOURCE_ALREADY_EXISTS":
            raise

    client.create_model_version(name=modelname, source=model_uri, run_id=run_id)
    print("Model registered successfully!")

def model_transition(modelname, modelid, currentstage=None, newstage=None, modelversion=None):
    """
    Point a registry alias at a model version.

    If no version currently holds the "Production" alias, the newest version
    reported by the registry is promoted to "Production" and the function
    returns. Otherwise, when ``currentstage``, ``newstage`` and
    ``modelversion`` are all given, the alias ``newstage`` is set on
    ``modelversion``. If any of the three is missing, nothing happens.

    :param modelname: Name of the registered model.
    :param modelid: Run ID that receives a ``version`` tag when a version is
        promoted directly to Production.
    :param currentstage: The current stage of the model; only used in the printed message.
    :param newstage: The alias to set on ``modelversion``.
    :param modelversion: The version of the model to transition.
    :raises ValueError: if a direct promotion is needed but the model has no versions.
    """
    # Use one spelling for both the lookup and the write. Previously the
    # lookup used "production" and the write "Production", so the promoted
    # version was never found again.
    production_alias = "Production"

    try:
        production_model = client.get_model_version_by_alias(modelname, production_alias)
    except mlflow.exceptions.MlflowException:
        # A missing alias is reported as an MLflow error; treat it as
        # "nothing in Production yet". Other exception types propagate.
        production_model = None

    if not production_model:
        versions = client.get_latest_versions(modelname)
        if not versions:
            raise ValueError(f"Registered model {modelname} has no versions to promote")
        # get_latest_versions may return one entry per stage, so pick the
        # highest version number rather than whatever comes first.
        latest_version = max(versions, key=lambda mv: int(mv.version)).version

        # Record on the run which model version was promoted.
        client.set_tag(modelid, "version", latest_version)
        client.set_registered_model_alias(modelname, production_alias, latest_version)

        print(f"Model version {latest_version} transitioned directly to Production.")
        return

    if currentstage and newstage and modelversion:
        client.set_registered_model_alias(modelname, newstage, modelversion)

        print(f"Model version {modelversion} transitioned from {currentstage} to {newstage}.")



In [None]:
# Set the registry URI for the module-level `mlflow` API; it is the same
# SQLite URI string as MLFLOW_TRACKING_URI above.
mlflow.set_registry_uri("sqlite:///../databases/mlflow.db")

In [None]:
def load_model(modelname, alias="Production"):
    """Load the registered model version that ``alias`` points to.

    :param modelname: name of the registered model.
    :param alias: registry alias to resolve (default ``"Production"``).
    :return: the object returned by ``mlflow.pyfunc.load_model`` for the URI
        ``models:/<modelname>@<alias>``.
    """
    return mlflow.pyfunc.load_model(f"models:/{modelname}@{alias}")