In [1]:
import numpy as np
import pandas as pd
from azureml.core import Workspace
import os
import io
from azure.storage.blob import BlobServiceClient
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn
from sklearn.metrics import f1_score

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
def save_dataframe_to_blob(dataframe, container_name, blob_name):
    # Get connection string from environment variables
    connection_string = os.getenv('connection_string')
    if not connection_string:
        raise ValueError("connection_string is not set in the .env file")
    # Initialize BlobServiceClient
    try:
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        print("Successfully connected to Azure Blob Storage.")
    except ValueError as e:
        print(f"Error initializing BlobServiceClient: {e}")
        raise
    # Ensure the container exists
    try:
        container_client = blob_service_client.get_container_client(container_name)
        if not container_client.exists():
            container_client.create_container()
            print(f"Created container: {container_name}")
        else:
            print(f"Container {container_name} already exists.")
    except Exception as e:
        print(f"Error creating/getting container client: {e}")
        raise
    # Convert dataframe to CSV string
    csv_data = dataframe.to_csv(index=False)
    # Upload CSV string to blob storage
    try:
        blob_client = container_client.get_blob_client(blob_name)
        blob_client.upload_blob(csv_data, overwrite=True)
        print(f"Uploaded {blob_name} to blob storage in container {container_name}")
    except Exception as e:
        print(f"Error uploading blob: {e}")
        raise


def load_dataframe_from_blob(container_name, blob_name):
    """
    Loads a CSV file from Azure Blob Storage into a Pandas DataFrame.
    Args:
        container_name (str): The name of the Azure Blob Storage container.
        blob_name (str): The name of the blob to download.
    Returns:
        pandas.DataFrame: The loaded DataFrame.
    """
    # Get connection string from environment variables
    connection_string = os.getenv('connection_string')
    if not connection_string:
        raise ValueError("connection_string is not set in the .env file")
    # Initialize BlobServiceClient
    try:
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        print("Successfully connected to Azure Blob Storage.")
    except ValueError as e:
        print(f"Error initializing BlobServiceClient: {e}")
        raise
    # Get blob client
    try:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    except Exception as e:
        print(f"Error getting blob client: {e}")
        raise
    # Download blob content to a byte stream
    download_stream = blob_client.download_blob()
    blob_data = download_stream.readall()
    # Create a Pandas DataFrame from the byte stream
    df = pd.read_csv(io.BytesIO(blob_data))
    return df

In [3]:
connection_string = os.getenv('connection_string')
container_name = os.getenv('container_name')
blob_name = os.getenv('train_blob_name')

In [4]:
df_train_processed = load_dataframe_from_blob('processed-files','processed_test_df.csv')
print(df_train_processed.shape)
print('')
df_train_processed.head()

Successfully connected to Azure Blob Storage.
(151655, 11)



Unnamed: 0,UserID,basket_icon_click,basket_add_list,basket_add_detail,image_picker,list_size_dropdown,closed_minibasket_click,sign_in,saw_checkout,saw_homepage,ordered
0,9d24-25k4-47889d24-25k4-494b-398124,0,0,0,0,0,0,0,0,0,0
1,7732-1k58-47887732-1k58-4475-679678,0,0,0,0,0,0,0,0,0,0
2,94k2-632j-471394k2-632j-4b4j-228160,0,0,0,0,0,0,0,0,0,0
3,jdd8-419d-4714jdd8-419d-4198-674376,0,0,1,0,0,0,0,0,0,0
4,7473-7595-47147473-7595-4757-227547,0,0,0,0,0,0,0,0,0,0


In [8]:
df_train_processed["ordered"].value_counts

<bound method IndexOpsMixin.value_counts of 0         0
1         0
2         0
3         0
4         0
         ..
151650    0
151651    0
151652    0
151653    0
151654    0
Name: ordered, Length: 151655, dtype: int64>

In [5]:
X = df_train_processed.drop(columns=['UserID','ordered'])
y = df_train_processed['ordered']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)


In [6]:
# Check class distribution in training and validation sets
def check_class_distribution(y_train, y_val):
    unique_train, counts_train = np.unique(y_train, return_counts=True)
    unique_val, counts_val = np.unique(y_val, return_counts=True)
    print("Training set class distribution:", dict(zip(unique_train, counts_train)))
    print("Validation set class distribution:", dict(zip(unique_val, counts_val)))

check_class_distribution(y_train, y_val)

Training set class distribution: {0: 121324}
Validation set class distribution: {0: 30331}


In [7]:
# List of models to evaluate
models = {
    "XGBoost": XGBClassifier(objective='binary:logistic', random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(random_state=42)
}

# Evaluate each model and log results with MLflow
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_val)
        
        # Evaluate the model
        accuracy = accuracy_score(y_val, y_pred)
        report = classification_report(y_val, y_pred)
        
        # Log model, parameters, and metrics
        mlflow.sklearn.log_model(model, "model")
        mlflow.log_params(model.get_params())
        mlflow.log_metric("accuracy", accuracy)
        
        # Print the evaluation report
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy}")
        print(report)
        print("="*80)

Model: XGBoost
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30331

    accuracy                           1.00     30331
   macro avg       1.00      1.00      1.00     30331
weighted avg       1.00      1.00      1.00     30331

Model: RandomForest
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30331

    accuracy                           1.00     30331
   macro avg       1.00      1.00      1.00     30331
weighted avg       1.00      1.00      1.00     30331





ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0