In [1]:
import numpy as np
import pandas as pd
from azureml.core import Workspace
import os
import io
from azure.storage.blob import BlobServiceClient
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
from sklearn.metrics import f1_score

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
def save_dataframe_to_blob(dataframe, container_name, blob_name):
    # Get connection string from environment variables
    connection_string = os.getenv('connection_string')
    if not connection_string:
        raise ValueError("connection_string is not set in the .env file")
    # Initialize BlobServiceClient
    try:
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        print("Successfully connected to Azure Blob Storage.")
    except ValueError as e:
        print(f"Error initializing BlobServiceClient: {e}")
        raise
    # Ensure the container exists
    try:
        container_client = blob_service_client.get_container_client(container_name)
        if not container_client.exists():
            container_client.create_container()
            print(f"Created container: {container_name}")
        else:
            print(f"Container {container_name} already exists.")
    except Exception as e:
        print(f"Error creating/getting container client: {e}")
        raise
    # Convert dataframe to CSV string
    csv_data = dataframe.to_csv(index=False)
    # Upload CSV string to blob storage
    try:
        blob_client = container_client.get_blob_client(blob_name)
        blob_client.upload_blob(csv_data, overwrite=True)
        print(f"Uploaded {blob_name} to blob storage in container {container_name}")
    except Exception as e:
        print(f"Error uploading blob: {e}")
        raise


def load_dataframe_from_blob(container_name, blob_name):
    """
    Loads a CSV file from Azure Blob Storage into a Pandas DataFrame.
    Args:
        container_name (str): The name of the Azure Blob Storage container.
        blob_name (str): The name of the blob to download.
    Returns:
        pandas.DataFrame: The loaded DataFrame.
    """
    # Get connection string from environment variables
    connection_string = os.getenv('connection_string')
    if not connection_string:
        raise ValueError("connection_string is not set in the .env file")
    # Initialize BlobServiceClient
    try:
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        print("Successfully connected to Azure Blob Storage.")
    except ValueError as e:
        print(f"Error initializing BlobServiceClient: {e}")
        raise
    # Get blob client
    try:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    except Exception as e:
        print(f"Error getting blob client: {e}")
        raise
    # Download blob content to a byte stream
    download_stream = blob_client.download_blob()
    blob_data = download_stream.readall()
    # Create a Pandas DataFrame from the byte stream
    df = pd.read_csv(io.BytesIO(blob_data))
    return df

In [3]:
connection_string = os.getenv('connection_string')
container_name = os.getenv('container_name')
blob_name = os.getenv('train_blob_name')

In [4]:
df_train_processed = load_dataframe_from_blob('processed-files','processed_train_df.csv')
print(df_train_processed.shape)
print('')
df_train_processed.head()

Successfully connected to Azure Blob Storage.
(455401, 11)



Unnamed: 0,UserID,basket_icon_click,basket_add_list,basket_add_detail,image_picker,list_size_dropdown,closed_minibasket_click,sign_in,saw_checkout,saw_homepage,ordered
0,a720-6b732349-a720-4862-bd21-644732,0,0,0,0,0,0,0,0,0,0
1,a0c0-6b73247c-a0c0-4bd9-8baa-797356,0,0,0,0,0,0,0,0,0,0
2,86a8-6b735c67-86a8-407b-ba24-333055,0,0,0,0,0,0,0,0,0,0
3,6a3d-6b736346-6a3d-4085-934b-396834,0,0,0,0,0,0,0,0,0,0
4,b74a-6b737717-b74a-45c3-8c6a-421140,0,1,0,0,1,0,1,1,1,1


In [5]:
df_train_processed["ordered"].value_counts

<bound method IndexOpsMixin.value_counts of 0         0
1         0
2         0
3         0
4         1
         ..
455396    0
455397    0
455398    0
455399    0
455400    0
Name: ordered, Length: 455401, dtype: int64>

In [6]:
X = df_train_processed.drop(columns=['UserID','ordered'])
y = df_train_processed['ordered']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)


In [7]:
# Check class distribution in training and validation sets
def check_class_distribution(y_train, y_val):
    unique_train, counts_train = np.unique(y_train, return_counts=True)
    unique_val, counts_val = np.unique(y_val, return_counts=True)
    print("Training set class distribution:", dict(zip(unique_train, counts_train)))
    print("Validation set class distribution:", dict(zip(unique_val, counts_val)))

check_class_distribution(y_train, y_val)

Training set class distribution: {0: 349046, 1: 15274}
Validation set class distribution: {0: 87262, 1: 3819}


In [8]:
models = {
    "XGBoost": XGBClassifier(objective='binary:logistic', random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
}

# Evaluate each model and log results with MLflow
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        
        # Training metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_precision = precision_score(y_train, y_train_pred)
        train_recall = recall_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred)

        # Validation metrics
        val_accuracy = accuracy_score(y_val, y_val_pred)
        val_precision = precision_score(y_val, y_val_pred)
        val_recall = recall_score(y_val, y_val_pred)
        val_f1 = f1_score(y_val, y_val_pred)
        report = classification_report(y_val, y_val_pred)
        
        # Log model, parameters, and metrics
        mlflow.sklearn.log_model(model, "model")
        mlflow.log_params(model.get_params())
        
        # Log training metrics
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("train_precision", train_precision)
        mlflow.log_metric("train_recall", train_recall)
        mlflow.log_metric("train_f1_score", train_f1)
        
        # Log validation metrics
        mlflow.log_metric("val_accuracy", val_accuracy)
        mlflow.log_metric("val_precision", val_precision)
        mlflow.log_metric("val_recall", val_recall)
        mlflow.log_metric("val_f1_score", val_f1)
        
        # Log the classification report as an artifact
        report_path = "classification_report.txt"
        with open(report_path, "w") as f:
            f.write(report)
        mlflow.log_artifact(report_path)
    
        # Print the evaluation report
        print(f"Model: {model_name}")
        print(f"Train Accuracy: {train_accuracy}")
        print(f"Train Precision: {train_precision}")
        print(f"Train Recall: {train_recall}")
        print(f"Train F1 Score: {train_f1}")
        print(f"Validation Accuracy: {val_accuracy}")
        print(f"Validation Precision: {val_precision}")
        print(f"Validation Recall: {val_recall}")
        print(f"Validation F1 Score: {val_f1}")
        print(report)
        print("="*80)

Model: XGBoost
Train Accuracy: 0.9752140974967062
Train Precision: 0.6616444030237134
Train Recall: 0.8366505172188032
Train F1 Score: 0.738926795420377
Validation Accuracy: 0.97532965162877
Validation Precision: 0.6649181703734788
Validation Recall: 0.8297983765383609
Validation F1 Score: 0.7382644146767618
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     87262
           1       0.66      0.83      0.74      3819

    accuracy                           0.98     91081
   macro avg       0.83      0.91      0.86     91081
weighted avg       0.98      0.98      0.98     91081

Model: RandomForest
Train Accuracy: 0.9752195871761089
Train Precision: 0.6618637918523893
Train Recall: 0.8360612806075685
Train F1 Score: 0.7388336033325619
Validation Accuracy: 0.9753406308670305
Validation Precision: 0.6651270207852193
Validation Recall: 0.8295365278868814
Validation F1 Score: 0.738289443020275
              precision    recall  f1-score   



In [9]:
def objective(params):
    with mlflow.start_run(nested=True):
        model = XGBClassifier(**params, objective='binary:logistic', random_state=42, use_label_encoder=False)
        model.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Training metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_precision = precision_score(y_train, y_train_pred)
        train_recall = recall_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred)

        # Validation metrics
        val_accuracy = accuracy_score(y_val, y_val_pred)
        val_precision = precision_score(y_val, y_val_pred)
        val_recall = recall_score(y_val, y_val_pred)
        val_f1 = f1_score(y_val, y_val_pred)
        report = classification_report(y_val, y_val_pred)
        
        # Log model, parameters, and metrics
        mlflow.sklearn.log_model(model, "model")
        mlflow.log_params(params)
        
        # Log training metrics
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("train_precision", train_precision)
        mlflow.log_metric("train_recall", train_recall)
        mlflow.log_metric("train_f1_score", train_f1)

        # Log validation metrics
        mlflow.log_metric("val_accuracy", val_accuracy)
        mlflow.log_metric("val_precision", val_precision)
        mlflow.log_metric("val_recall", val_recall)
        mlflow.log_metric("val_f1_score", val_f1)
        
        # Log the classification report as an artifact
        report_path = "classification_report.txt"
        with open(report_path, "w") as f:
            f.write(report)
        mlflow.log_artifact(report_path)

        return {'loss': -val_f1, 'status': STATUS_OK, 'val_accuracy': val_accuracy, 'val_precision': val_precision, 'val_recall': val_recall, 'val_f1': val_f1}

# Define the search space
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 500)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', range(3, 15)),
    'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
}

# Run the optimization
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

# Convert hyperopt results to real parameter values
best_params = {
    'n_estimators': best['n_estimators'],
    'learning_rate': best['learning_rate'],
    'max_depth': best['max_depth'] + 3,  # adding the minimum value of range
    'min_child_weight': best['min_child_weight'] + 1,
    'subsample': best['subsample'],
    'colsample_bytree': best['colsample_bytree']
}

print("Best parameters found: ", best_params)

100%|██████████| 50/50 [14:58<00:00, 17.97s/trial, best loss: -0.7418724870763929]
Best parameters found:  {'n_estimators': 9, 'learning_rate': 0.28216411908964284, 'max_depth': 12, 'min_child_weight': 4, 'subsample': 0.5001641583202635, 'colsample_bytree': 0.8169180452710694}


Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_encoder" } are not used.




Parameters: { "use_label_enc