In [9]:
import pandas as pd

train_data = pd.read_csv(r"F:\Guvi\Clickstream_customer_conversion\train_data.csv")
test_data = pd.read_csv(r"F:\Guvi\Clickstream_customer_conversion\test_data.csv")

In [10]:
import pickle
le1 = pickle.load(open("le1_clothing_model.pkl", "rb"))
le2 = pickle.load(open("le2_clothing_model.pkl", "rb"))

In [11]:
train_data

Unnamed: 0,year,month,day,order,country,session_id,page1_main_category,page2_clothing_model,colour,location,model_photography,price,price_2,page
0,2008,6,22,21,29,15648,3,C20,13,1,2,48,1,2
1,2008,5,19,6,29,10018,2,B26,13,3,1,57,1,2
2,2008,7,15,2,29,19388,3,C13,9,5,1,48,1,1
3,2008,5,2,2,29,7181,2,B11,2,4,1,43,2,1
4,2008,6,9,16,29,13493,2,B31,9,5,1,57,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132374,2008,7,4,3,29,17622,4,P19,2,1,1,48,1,2
132375,2008,6,19,9,29,15165,3,C26,14,3,1,28,2,2
132376,2008,7,15,4,29,19359,1,A4,3,2,2,38,2,1
132377,2008,7,28,16,29,21454,3,C50,9,5,2,20,2,3


In [12]:
train_data['page2_clothing_model'] = le1.transform(train_data['page2_clothing_model'])
test_data['page2_clothing_model'] = le2.transform(test_data['page2_clothing_model'])

In [13]:
train_data

Unnamed: 0,year,month,day,order,country,session_id,page1_main_category,page2_clothing_model,colour,location,model_photography,price,price_2,page
0,2008,6,22,21,29,15648,3,88,13,1,2,48,1,2
1,2008,5,19,6,29,10018,2,60,13,3,1,57,1,2
2,2008,7,15,2,29,19388,3,80,9,5,1,48,1,1
3,2008,5,2,2,29,7181,2,45,2,4,1,43,2,1
4,2008,6,9,16,29,13493,2,66,9,5,1,57,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132374,2008,7,4,3,29,17622,4,145,2,1,1,48,1,2
132375,2008,6,19,9,29,15165,3,94,14,3,1,28,2,2
132376,2008,7,15,4,29,19359,1,33,3,2,2,38,2,1
132377,2008,7,28,16,29,21454,3,121,9,5,2,20,2,3


In [14]:
from sklearn.preprocessing import StandardScaler
train_features = train_data[['page1_main_category', 'page2_clothing_model', 'colour', 'order', 'price', 'location', 'model_photography']]
train_target = train_data['price_2']

test_features = test_data[['page1_main_category', 'page2_clothing_model', 'colour', 'order', 'price', 'location', 'model_photography']]
test_target = test_data['price_2']

scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [16]:
model_params = {
    "Logistic_Regression": (LogisticRegression(), {
        "C": [0.01, 0.1, 1, 10, 100],  
        "solver": ["liblinear", "lbfgs"] 
    }),
    
    "Random_Forest": (RandomForestClassifier(), {
        "n_estimators": [50, 100, 200],  
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10] 
    }),
    
    "Decision_Tree": (DecisionTreeClassifier(), {
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "criterion": ["gini", "entropy"]  
    })
}

In [17]:
reports = []

for name, (model, param_grid) in model_params.items():
    if param_grid: 
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
        grid_search.fit(train_features, train_target)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        best_model = model
        best_model.fit(train_features, train_target)
        best_params = "Default Parameters"

    predictions = best_model.predict(test_features)
    accuracy = accuracy_score(test_target, predictions)
    report = classification_report(test_target, predictions)
    confusion = confusion_matrix(test_target, predictions)

    reports.append((name, best_model, best_params, accuracy, report, confusion))

In [18]:
for name, model, best_params, accuracy, report, confusion in reports:
    print(f"Model: {name}")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{report}")
    print(f"Confusion Matrix:\n{confusion}\n")

Model: Logistic_Regression
Best Parameters: {'C': 10, 'solver': 'liblinear'}
Accuracy: 0.9985
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     16981
           2       1.00      1.00      1.00     16114

    accuracy                           1.00     33095
   macro avg       1.00      1.00      1.00     33095
weighted avg       1.00      1.00      1.00     33095

Confusion Matrix:
[[16930    51]
 [    0 16114]]

Model: Random_Forest
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     16981
           2       1.00      1.00      1.00     16114

    accuracy                           1.00     33095
   macro avg       1.00      1.00      1.00     33095
weighted avg       1.00      1.00      1.00     33095

Confusion Matrix:
[[16981     0]
 [  

In [20]:
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

In [21]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Purchase_Classification_Models")

for name, model, best_params, accuracy, report, confusion in reports:
    with mlflow.start_run(run_name=name) as run:
        mlflow.sklearn.log_model(model, f"{name}_model")
        mlflow.log_params(best_params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_text(report, "classification_report.txt")
        mlflow.log_text(str(confusion), "confusion_matrix.txt")
        

2025/02/08 14:03:33 INFO mlflow.tracking.fluent: Experiment with name 'Purchase_Classification_Models' does not exist. Creating a new experiment.


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



🏃 View run Logistic_Regression at: http://127.0.0.1:5000/#/experiments/645988250444269521/runs/1ea1abafcd4a4e7888fcc0bd95fb7412
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/645988250444269521




🏃 View run Random_Forest at: http://127.0.0.1:5000/#/experiments/645988250444269521/runs/3b5189a832674e90a790d650c4bfbb38
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/645988250444269521




🏃 View run Decision_Tree at: http://127.0.0.1:5000/#/experiments/645988250444269521/runs/6e2b8c64337c44eba419cd13c7f8cd40
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/645988250444269521


In [22]:
model_name ='Random Forest Classifier'
run_id = '3b5189a832674e90a790d650c4bfbb38'
model_uri = f'runs:/{run_id}/Random_Forest_model'

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri= model_uri , name= model_name)

Successfully registered model 'Random Forest Classifier'.
2025/02/08 14:06:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Classifier, version 1


🏃 View run Random_Forest at: http://127.0.0.1:5000/#/experiments/645988250444269521/runs/3b5189a832674e90a790d650c4bfbb38
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/645988250444269521


Created version '1' of model 'Random Forest Classifier'.


In [23]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
model_name = "Random Forest Classifier"
model_version = "1"
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.pyfunc.load_model(model_uri)

In [24]:
import pickle

with open("random_forest_classifier_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("classification_standard_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)