In [9]:
import numpy as np
import pandas as pd
from azureml.core import Workspace
import os
import io
from azure.storage.blob import BlobServiceClient
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
from sklearn.metrics import f1_score

from dotenv import load_dotenv
load_dotenv()

True

In [3]:
def save_dataframe_to_blob(dataframe, container_name, blob_name):
    # Get connection string from environment variables
    connection_string = os.getenv('connection_string')
    if not connection_string:
        raise ValueError("connection_string is not set in the .env file")
    # Initialize BlobServiceClient
    try:
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        print("Successfully connected to Azure Blob Storage.")
    except ValueError as e:
        print(f"Error initializing BlobServiceClient: {e}")
        raise
    # Ensure the container exists
    try:
        container_client = blob_service_client.get_container_client(container_name)
        if not container_client.exists():
            container_client.create_container()
            print(f"Created container: {container_name}")
        else:
            print(f"Container {container_name} already exists.")
    except Exception as e:
        print(f"Error creating/getting container client: {e}")
        raise
    # Convert dataframe to CSV string
    csv_data = dataframe.to_csv(index=False)
    # Upload CSV string to blob storage
    try:
        blob_client = container_client.get_blob_client(blob_name)
        blob_client.upload_blob(csv_data, overwrite=True)
        print(f"Uploaded {blob_name} to blob storage in container {container_name}")
    except Exception as e:
        print(f"Error uploading blob: {e}")
        raise


def load_dataframe_from_blob(container_name, blob_name):
    """
    Loads a CSV file from Azure Blob Storage into a Pandas DataFrame.
    Args:
        container_name (str): The name of the Azure Blob Storage container.
        blob_name (str): The name of the blob to download.
    Returns:
        pandas.DataFrame: The loaded DataFrame.
    """
    # Get connection string from environment variables
    connection_string = os.getenv('connection_string')
    if not connection_string:
        raise ValueError("connection_string is not set in the .env file")
    # Initialize BlobServiceClient
    try:
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        print("Successfully connected to Azure Blob Storage.")
    except ValueError as e:
        print(f"Error initializing BlobServiceClient: {e}")
        raise
    # Get blob client
    try:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    except Exception as e:
        print(f"Error getting blob client: {e}")
        raise
    # Download blob content to a byte stream
    download_stream = blob_client.download_blob()
    blob_data = download_stream.readall()
    # Create a Pandas DataFrame from the byte stream
    df = pd.read_csv(io.BytesIO(blob_data))
    return df

In [4]:
connection_string = os.getenv('connection_string')
container_name = os.getenv('container_name')
blob_name = os.getenv('train_blob_name')

In [5]:
df_train_processed = load_dataframe_from_blob('processed-files','processed_train_df.csv')
print(df_train_processed.shape)
print('')
df_train_processed.head()

Successfully connected to Azure Blob Storage.
(455401, 11)



Unnamed: 0,UserID,basket_icon_click,basket_add_list,basket_add_detail,image_picker,list_size_dropdown,closed_minibasket_click,sign_in,saw_checkout,saw_homepage,ordered
0,a720-6b732349-a720-4862-bd21-644732,0,0,0,0,0,0,0,0,0,0
1,a0c0-6b73247c-a0c0-4bd9-8baa-797356,0,0,0,0,0,0,0,0,0,0
2,86a8-6b735c67-86a8-407b-ba24-333055,0,0,0,0,0,0,0,0,0,0
3,6a3d-6b736346-6a3d-4085-934b-396834,0,0,0,0,0,0,0,0,0,0
4,b74a-6b737717-b74a-45c3-8c6a-421140,0,1,0,0,1,0,1,1,1,1


In [5]:
df_train_processed["ordered"].value_counts

<bound method IndexOpsMixin.value_counts of 0         0
1         0
2         0
3         0
4         1
         ..
455396    0
455397    0
455398    0
455399    0
455400    0
Name: ordered, Length: 455401, dtype: int64>

In [6]:
X = df_train_processed.drop(columns=['UserID','ordered'])
y = df_train_processed['ordered']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [7]:
# Check class distribution in training and validation sets
def check_class_distribution(y_train, y_val):
    unique_train, counts_train = np.unique(y_train, return_counts=True)
    unique_val, counts_val = np.unique(y_val, return_counts=True)
    print("Training set class distribution:", dict(zip(unique_train, counts_train)))
    print("Validation set class distribution:", dict(zip(unique_val, counts_val)))

check_class_distribution(y_train, y_val)

Training set class distribution: {0: 349046, 1: 15274}
Validation set class distribution: {0: 87262, 1: 3819}


### Vanilla Models

In [8]:
# Defining the function to apply models
def apply_model(model, X_train, y_train, X_val, y_val, drop_id_col_list):
    # Fit the model
    model.fit(X_train.drop(drop_id_col_list, axis=1, errors='ignore'), y_train)

    # Make predictions
    y_train_pred = model.predict(X_train.drop(drop_id_col_list, axis=1, errors='ignore'))
    y_pred = model.predict(X_val.drop(drop_id_col_list, axis=1, errors='ignore'))

    # Calculate performance metrics
    accuracy_train = accuracy_score(y_train, y_train_pred)
    accuracy_val = accuracy_score(y_val, y_pred)
    f1_train = f1_score(y_train,y_train_pred)
    f1_val = f1_score(y_val, y_pred)
    return accuracy_train, accuracy_val,f1_train,f1_val


In [10]:
# Defining the models
vanila_models = [
    ("Logistic Regression", LogisticRegression(random_state=321)),
    ("Decision Tree", DecisionTreeClassifier(random_state=321)),
    ("Random Forest", RandomForestClassifier(random_state=321)),
    ("XGB Classifier", XGBClassifier(random_state=321))
]

In [11]:
#id col not to be considered while training
drop_id_col_list = ['UserID','ordered']

# Applying the models and storing the results
results_model_name = []
results_accuracy_val = []
results_f1_score_val = []
results_accuracy_train = []
results_f1_score_train = []

for name, model in vanila_models:
    accuracy_train, accuracy_val,f1_train,f1_val = apply_model(model, X_train, y_train, X_val, y_val,drop_id_col_list)
    results_model_name.append(name)
    results_accuracy_train.append(accuracy_train)
    results_accuracy_val.append(accuracy_val)
    results_f1_score_train.append(f1_train)
    results_f1_score_val.append(f1_val)





In [12]:
results_df = pd.DataFrame(columns=['Model_Name',
                                   'Accuracy_Train','Accuracy_Val',
                                   'F1_Score_Train','F1_Score_Val'])

results_df['Model_Name'] = results_model_name
results_df['Accuracy_Train'] = results_accuracy_train
results_df['Accuracy_Val'] = results_accuracy_val
results_df['F1_Score_Train'] = results_f1_score_train
results_df['F1_Score_Val'] = results_f1_score_val

print(results_df.to_string())

            Model_Name  Accuracy_Train  Accuracy_Val  F1_Score_Train  F1_Score_Val
0  Logistic Regression        0.973210      0.973167        0.681960      0.679853
1        Decision Tree        0.975220      0.975253        0.738576      0.736805
2        Random Forest        0.975220      0.975297        0.738698      0.737395
3       XGB Classifier        0.975217      0.975330        0.738963      0.738325


In [13]:
# models = {
#     "XGBoost": XGBClassifier(objective='binary:logistic', random_state=42),
#     "RandomForest": RandomForestClassifier(random_state=42),
#     "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
# }

# # Evaluate each model and log results with MLflow
# for model_name, model in models.items():
#     with mlflow.start_run(run_name=model_name):
#         # Train the model
#         model.fit(X_train, y_train)
        
#         # Make predictions
#         y_train_pred = model.predict(X_train)
#         y_val_pred = model.predict(X_val)
        
#         # Training metrics
#         train_accuracy = accuracy_score(y_train, y_train_pred)
#         train_precision = precision_score(y_train, y_train_pred)
#         train_recall = recall_score(y_train, y_train_pred)
#         train_f1 = f1_score(y_train, y_train_pred)

#         # Validation metrics
#         val_accuracy = accuracy_score(y_val, y_val_pred)
#         val_precision = precision_score(y_val, y_val_pred)
#         val_recall = recall_score(y_val, y_val_pred)
#         val_f1 = f1_score(y_val, y_val_pred)
#         report = classification_report(y_val, y_val_pred)
        
#         # Log model, parameters, and metrics
#         mlflow.sklearn.log_model(model, "model")
#         mlflow.log_params(model.get_params())
        
#         # Log training metrics
#         mlflow.log_metric("train_accuracy", train_accuracy)
#         mlflow.log_metric("train_precision", train_precision)
#         mlflow.log_metric("train_recall", train_recall)
#         mlflow.log_metric("train_f1_score", train_f1)
        
#         # Log validation metrics
#         mlflow.log_metric("val_accuracy", val_accuracy)
#         mlflow.log_metric("val_precision", val_precision)
#         mlflow.log_metric("val_recall", val_recall)
#         mlflow.log_metric("val_f1_score", val_f1)
        
#         # Log the classification report as an artifact
#         report_path = "classification_report.txt"
#         with open(report_path, "w") as f:
#             f.write(report)
#         mlflow.log_artifact(report_path)
    
#         # Print the evaluation report
#         print(f"Model: {model_name}")
#         print(f"Train Accuracy: {train_accuracy}")
#         print(f"Train Precision: {train_precision}")
#         print(f"Train Recall: {train_recall}")
#         print(f"Train F1 Score: {train_f1}")
#         print(f"Validation Accuracy: {val_accuracy}")
#         print(f"Validation Precision: {val_precision}")
#         print(f"Validation Recall: {val_recall}")
#         print(f"Validation F1 Score: {val_f1}")
#         print(report)
#         print("="*80)

In [14]:
# def objective(params):
#     with mlflow.start_run(nested=True):
#         model = XGBClassifier(**params, objective='binary:logistic', random_state=42, use_label_encoder=False)
#         model.fit(X_train, y_train)
        
#         # Predictions
#         y_train_pred = model.predict(X_train)
#         y_val_pred = model.predict(X_val)

#         # Training metrics
#         train_accuracy = accuracy_score(y_train, y_train_pred)
#         train_precision = precision_score(y_train, y_train_pred)
#         train_recall = recall_score(y_train, y_train_pred)
#         train_f1 = f1_score(y_train, y_train_pred)

#         # Validation metrics
#         val_accuracy = accuracy_score(y_val, y_val_pred)
#         val_precision = precision_score(y_val, y_val_pred)
#         val_recall = recall_score(y_val, y_val_pred)
#         val_f1 = f1_score(y_val, y_val_pred)
#         report = classification_report(y_val, y_val_pred)
        
#         # Log model, parameters, and metrics
#         mlflow.sklearn.log_model(model, "model")
#         mlflow.log_params(params)
        
#         # Log training metrics
#         mlflow.log_metric("train_accuracy", train_accuracy)
#         mlflow.log_metric("train_precision", train_precision)
#         mlflow.log_metric("train_recall", train_recall)
#         mlflow.log_metric("train_f1_score", train_f1)

#         # Log validation metrics
#         mlflow.log_metric("val_accuracy", val_accuracy)
#         mlflow.log_metric("val_precision", val_precision)
#         mlflow.log_metric("val_recall", val_recall)
#         mlflow.log_metric("val_f1_score", val_f1)
        
#         # Log the classification report as an artifact
#         report_path = "classification_report.txt"
#         with open(report_path, "w") as f:
#             f.write(report)
#         mlflow.log_artifact(report_path)

#         return {'loss': -val_f1, 'status': STATUS_OK, 'val_accuracy': val_accuracy, 'val_precision': val_precision, 'val_recall': val_recall, 'val_f1': val_f1}

# # Define the search space
# space = {
#     'n_estimators': hp.choice('n_estimators', range(50, 500)),
#     'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
#     'max_depth': hp.choice('max_depth', range(3, 15)),
#     'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
#     'subsample': hp.uniform('subsample', 0.5, 1.0),
#     'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
# }

# # Run the optimization
# trials = Trials()
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=50,
#             trials=trials)

# # Convert hyperopt results to real parameter values
# best_params = {
#     'n_estimators': best['n_estimators'],
#     'learning_rate': best['learning_rate'],
#     'max_depth': best['max_depth'] + 3,  # adding the minimum value of range
#     'min_child_weight': best['min_child_weight'] + 1,
#     'subsample': best['subsample'],
#     'colsample_bytree': best['colsample_bytree']
# }

# print("Best parameters found: ", best_params)

In [21]:
def evaluate_model(model, X_train, y_train, X_val, y_val, drop_id_col_list):
   model.fit(X_train.drop(drop_id_col_list, axis=1, errors='ignore'), y_train)
   y_pred_train = model.predict(X_train.drop(drop_id_col_list, axis=1, errors='ignore'))
   y_pred_val = model.predict(X_val.drop(drop_id_col_list, axis=1, errors='ignore'))

   f1_train = f1_score(y_train, y_pred_train)
   accuracy_train = accuracy_score(y_train, y_pred_train)

   f1_val = f1_score(y_val, y_pred_val)
   accuracy_val = accuracy_score(y_val, y_pred_val)

   print("Training Set:")
   print("F1-score:", f1_train)
   print("Accuracy:", accuracy_train)
   print("\nValidation Set:")
   print("F1-score:", f1_val)
   print("Accuracy:", accuracy_val)

### Experiment Tracking

In [15]:
# Experiment name
experiment_name = "Azure_Propensity_Model"
mlflow.set_experiment(experiment_name)

2024/08/10 12:42:04 INFO mlflow.tracking.fluent: Experiment with name 'Azure_Propensity_Model' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1723293725726, experiment_id='e50b013a-522b-4df2-b552-1787132ac6e8', last_update_time=None, lifecycle_stage='active', name='Azure_Propensity_Model', tags={}>

**Experiment 1: Logistic Regression**
- Set Search Space
- Write Objective function
- Train LR and fetch best Params
- fit the final model
- Evaluate Results
- Log Model to MLflow

In [35]:
# Define hyperparameter spaces for each classifier
space_lr = {
    'C': hp.loguniform('C', np.log(0.01), np.log(10)),
    'max_iter': hp.uniform('max_iter', 1,5000)
}

In [36]:
from sklearn.metrics import f1_score

def train_evaluate(params):
    # Define columns to drop from the dataset
    drop_id_col_list = ['UserID','ordered']

    # Create a LogisticRegression model with the given parameters
    model = LogisticRegression(**params, random_state=321)
    
    # Train the model
    model.fit(X_train.drop(drop_id_col_list, axis=1, errors='ignore'), y_train)
    
    # Evaluate the model
    y_pred = model.predict(X_val.drop(drop_id_col_list, axis=1, errors='ignore'))
    score = f1_score(y_val, y_pred)
    
    return -score  # Minimize the negative of F1-score


In [37]:
# Initialize Trials for hyperparameter optimization
trials = Trials()

# Use Hyperopt to search for the best hyperparameters
best_LR_param = fmin(fn=train_evaluate, space=space_lr, algo=tpe.suggest, max_evals=10, trials=trials)

print("Best hyperparameters:", best_LR_param)


100%|██████████| 10/10 [00:11<00:00,  1.13s/trial, best loss: -0.6798532879224521]
Best hyperparameters: {'C': 2.3330541258369792, 'max_iter': 1470.762397734738}


In [38]:
model_lr = LogisticRegression(**best_LR_param)
model_lr.fit(X_train.drop(drop_id_col_list, axis=1, errors='ignore'), y_train)

In [41]:
evaluate_model(model_lr,X_train,y_train,X_val,y_val,drop_id_col_list)

Training Set:
F1-score: 0.68198110133594
Accuracy: 0.9732103645147123

Validation Set:
F1-score: 0.6798532879224521
Accuracy: 0.9731667416914614


In [42]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Start an MLflow run
mlflow.set_experiment('Azure_Propensity_Model')

with mlflow.start_run(run_name="LogisticReg_classifier_azure_mlops") as run:
    # Log the best hyperparameters
    mlflow.log_params(best_LR_param)
    
    # Train the final model with the best hyperparameters
    best_model = LogisticRegression(**best_LR_param, random_state=321)
    best_model.fit(X_train, y_train)
    
    # Evaluate the model on the test dataset
    y_pred = best_model.predict(X_val)
    
    # Log model performance metrics
    f1 = f1_score(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    y_pred_prob = best_model.predict_proba(X_val)[:, 1]
    auc_roc = roc_auc_score(y_val, y_pred_prob)

    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("auc_roc", auc_roc)
    
    # Log the shapes of training and testing data
    mlflow.log_param("train_shape", X_train.shape)
    mlflow.log_param("test_shape", X_val.shape)
    
    # Plot and log the ROC curve
    fpr, tpr, _ = roc_curve(y_val, y_pred_prob)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC-ROC (area = {auc_roc:.2f})")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.grid(True)
    
    # Save the plot as a temporary file
    plt_path = "roc_curve.png"
    plt.savefig(plt_path)
    
    # Log the ROC curve
    mlflow.log_artifact(plt_path)
    
    # Log the trained model
    mlflow.sklearn.log_model(best_model, "LogisticReg_classifier_azure_mlops")

    plt.close()

print("Model and metrics logged in MLflow successfully.")




Model and metrics logged in MLflow successfully.
