In [14]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.figure
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve

import joblib

pd.options.display.max_columns = None

## Get the data

In [17]:
# path of the folder with .parquet
#folder_path = Path("D:/Users/maick/Desktop/Codigos/zrive-ds/data/box_builder_dataset/feature_frame.csv")
folder_path = Path("C:/Users/AULA04/Desktop/Codes/zrive-ds/data/box_builder_dataset/feature_frame.csv")


In [30]:
def plot_metrics(
        model_name: str,
        y_pred: pd.Series,
        y_test: pd.Series,
        figure: tuple[matplotlib.figure.Figure, np.array] = None
):
    precision_, recall_, _ = precision_recall_curve(y_test, y_pred)
    pr_auc = auc(recall_, precision_)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    if figure is None:
        fig, ax = plt.subplots(1,2, figsize=(14,7))
    else:
        fig, ax = figure
    
    ax[0].plot(recall_, precision_, label=f"{model_name}: AUC:{pr_auc:.2f}")
    ax[0].set_xlabel("recall")
    ax[0].set_ylabel("precision")
    ax[0].set_title(F"precision-recall Curve")
    ax[0].legend()

    ax[1].plot(fpr,tpr,label=f"AUC: {roc_auc:.2f}")
    ax[1].set_xlabel("FPR")
    ax[1].set_ylabel("TPR")
    ax[1].set_title(f"ROC Curve")
    ax[1].legend()

In [3]:
feature_frame = pd.read_csv(folder_path)
feature_frame

Unnamed: 0,variant_id,product_type,order_id,user_id,created_at,order_date,user_order_seq,outcome,ordered_before,abandoned_before,active_snoozed,set_as_regular,normalised_price,discount_pct,vendor,global_popularity,count_adults,count_children,count_babies,count_pets,people_ex_baby,days_since_purchase_variant_id,avg_days_to_buy_variant_id,std_days_to_buy_variant_id,days_since_purchase_product_type,avg_days_to_buy_product_type,std_days_to_buy_product_type
0,33826472919172,ricepastapulses,2807985930372,3482464092292,2020-10-05 16:46:19,2020-10-05 00:00:00,3,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
1,33826472919172,ricepastapulses,2808027644036,3466586718340,2020-10-05 17:59:51,2020-10-05 00:00:00,2,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
2,33826472919172,ricepastapulses,2808099078276,3481384026244,2020-10-05 20:08:53,2020-10-05 00:00:00,4,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
3,33826472919172,ricepastapulses,2808393957508,3291363377284,2020-10-06 08:57:59,2020-10-06 00:00:00,2,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.038462,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
4,33826472919172,ricepastapulses,2808429314180,3537167515780,2020-10-06 10:37:05,2020-10-06 00:00:00,3,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.038462,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2880544,33826439594116,healthcarevitamins,3643254800516,3893722808452,2021-03-03 13:19:28,2021-03-03 00:00:00,3,0.0,0.0,0.0,0.0,0.0,0.417186,0.114360,colief,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392
2880545,33826439594116,healthcarevitamins,3643274788996,3883757174916,2021-03-03 13:57:35,2021-03-03 00:00:00,4,0.0,0.0,0.0,0.0,0.0,0.417186,0.114360,colief,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392
2880546,33826439594116,healthcarevitamins,3643283734660,3874925314180,2021-03-03 14:14:24,2021-03-03 00:00:00,7,0.0,0.0,0.0,0.0,0.0,0.417186,0.114360,colief,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392
2880547,33826439594116,healthcarevitamins,3643294515332,3906490826884,2021-03-03 14:30:30,2021-03-03 00:00:00,2,0.0,0.0,0.0,0.0,0.0,0.417186,0.114360,colief,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392


## Columns

Filter for >5 products

In [18]:
size_of_order = feature_frame.groupby("order_id").outcome.sum()
size_of_order = size_of_order[size_of_order>=5]
feature_frame = feature_frame[feature_frame["order_id"].isin(size_of_order.index)]
feature_frame

Unnamed: 0,variant_id,product_type,order_id,user_id,created_at,order_date,user_order_seq,outcome,ordered_before,abandoned_before,active_snoozed,set_as_regular,normalised_price,discount_pct,vendor,global_popularity,count_adults,count_children,count_babies,count_pets,people_ex_baby,days_since_purchase_variant_id,avg_days_to_buy_variant_id,std_days_to_buy_variant_id,days_since_purchase_product_type,avg_days_to_buy_product_type,std_days_to_buy_product_type
0,33826472919172,ricepastapulses,2807985930372,3482464092292,2020-10-05 16:46:19,2020-10-05,3,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
1,33826472919172,ricepastapulses,2808027644036,3466586718340,2020-10-05 17:59:51,2020-10-05,2,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
2,33826472919172,ricepastapulses,2808099078276,3481384026244,2020-10-05 20:08:53,2020-10-05,4,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
3,33826472919172,ricepastapulses,2808393957508,3291363377284,2020-10-06 08:57:59,2020-10-06,2,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.038462,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
5,33826472919172,ricepastapulses,2808434524292,3479090790532,2020-10-06 10:50:23,2020-10-06,3,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.038462,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2880541,33826439594116,healthcarevitamins,3643241300100,3864791220356,2021-03-03 12:56:04,2021-03-03,2,0.0,0.0,0.0,0.0,0.0,0.417186,0.114360,colief,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392
2880544,33826439594116,healthcarevitamins,3643254800516,3893722808452,2021-03-03 13:19:28,2021-03-03,3,0.0,0.0,0.0,0.0,0.0,0.417186,0.114360,colief,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392
2880545,33826439594116,healthcarevitamins,3643274788996,3883757174916,2021-03-03 13:57:35,2021-03-03,4,0.0,0.0,0.0,0.0,0.0,0.417186,0.114360,colief,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392
2880546,33826439594116,healthcarevitamins,3643283734660,3874925314180,2021-03-03 14:14:24,2021-03-03,7,0.0,0.0,0.0,0.0,0.0,0.417186,0.114360,colief,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392


pass to datetime

In [19]:
feature_frame["created_at"] = pd.to_datetime(feature_frame["created_at"])
feature_frame["order_date"] = pd.to_datetime(feature_frame["order_date"]).dt.date

In [20]:
(feature_frame["outcome"]).value_counts()

outcome
0.0    2132624
1.0      31329
Name: count, dtype: int64

In [None]:
#Delete the half of zeros:
#feature_frame = feature_frame.drop(feature_frame["outcome"][feature_frame["outcome"]== 0].sample(frac=0.5, random_state=42).index)

### Product type

In [29]:
#Product type: need to be categorical encoding
feature_frame = pd.concat([feature_frame, pd.get_dummies(feature_frame["product_type"])],axis=1)
feature_frame = feature_frame.drop("product_type",axis=1)
#feature_frame = feature_frame.drop("index",axis=1)

### order_ID

In [30]:
#Count encoding
feature_frame['order_id'] = feature_frame.order_id.map(feature_frame.order_id.value_counts())

### vendor

In [31]:
#Count encoding
feature_frame['vendor'] = feature_frame.vendor.map(feature_frame.vendor.value_counts())

#categorical encoding
#feature_frame = pd.concat([feature_frame, pd.get_dummies(feature_frame["vendor"])],axis=1)
#feature_frame = feature_frame.drop("vendor",axis=1)
#feature_frame = feature_frame.drop("index",axis=1)

### Delete: 
* user_id
* variant_id
* order_date
* created_at

In [None]:
feature_frame = feature_frame.drop("variant_id",axis=1)
feature_frame = feature_frame.drop("user_id",axis=1)
feature_frame = feature_frame.drop("order_date",axis=1)
feature_frame = feature_frame.drop("created_at",axis=1)
feature_frame = feature_frame.drop("order_id",axis=1)



In [22]:
feature_frame.head(1)

Unnamed: 0,product_type,user_order_seq,outcome,ordered_before,abandoned_before,active_snoozed,set_as_regular,normalised_price,discount_pct,vendor,global_popularity,count_adults,count_children,count_babies,count_pets,people_ex_baby,days_since_purchase_variant_id,avg_days_to_buy_variant_id,std_days_to_buy_variant_id,days_since_purchase_product_type,avg_days_to_buy_product_type,std_days_to_buy_product_type
0,ricepastapulses,3,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.0,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618


### Separate the dataframe

In [26]:
feature_frame_to_train = feature_frame[
    ["ordered_before","abandoned_before","active_snoozed",
     "set_as_regular","normalised_price","discount_pct",
     "global_popularity","count_adults","count_children",
     "count_babies","count_pets","people_ex_baby",
     "days_since_purchase_variant_id","avg_days_to_buy_variant_id",
     "std_days_to_buy_variant_id","days_since_purchase_product_type",
     "avg_days_to_buy_product_type","std_days_to_buy_product_type","outcome"]
]
feature_frame_to_train

Unnamed: 0,ordered_before,abandoned_before,active_snoozed,set_as_regular,normalised_price,discount_pct,global_popularity,count_adults,count_children,count_babies,count_pets,people_ex_baby,days_since_purchase_variant_id,avg_days_to_buy_variant_id,std_days_to_buy_variant_id,days_since_purchase_product_type,avg_days_to_buy_product_type,std_days_to_buy_product_type,outcome
0,0.0,0.0,0.0,0.0,0.081052,0.053512,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180,0.0
1,0.0,0.0,0.0,0.0,0.081052,0.053512,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180,0.0
2,0.0,0.0,0.0,0.0,0.081052,0.053512,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180,0.0
3,0.0,0.0,0.0,0.0,0.081052,0.053512,0.038462,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180,0.0
5,0.0,0.0,0.0,0.0,0.081052,0.053512,0.038462,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.276180,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2880541,0.0,0.0,0.0,0.0,0.417186,0.114360,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392,0.0
2880544,0.0,0.0,0.0,0.0,0.417186,0.114360,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392,0.0
2880545,0.0,0.0,0.0,0.0,0.417186,0.114360,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392,0.0
2880546,0.0,0.0,0.0,0.0,0.417186,0.114360,0.000000,2.0,0.0,0.0,0.0,2.0,33.0,34.0,27.693045,30.0,34.0,27.451392,0.0


In [27]:
feature_frame_to_train = feature_frame_to_train*1
feature_frame_to_train.columns = feature_frame_to_train.columns.astype(str)

scaler = MinMaxScaler()
feature_frame_normalized = scaler.fit_transform(feature_frame_to_train)
feature_frame_normalized = pd.DataFrame(feature_frame_normalized, columns=feature_frame_to_train.columns)
feature_frame_normalized

Unnamed: 0,ordered_before,abandoned_before,active_snoozed,set_as_regular,normalised_price,discount_pct,global_popularity,count_adults,count_children,count_babies,count_pets,people_ex_baby,days_since_purchase_variant_id,avg_days_to_buy_variant_id,std_days_to_buy_variant_id,days_since_purchase_product_type,avg_days_to_buy_product_type,std_days_to_buy_product_type,outcome
0,0.0,0.0,0.0,0.0,0.066116,0.068601,0.000000,0.25,0.0,0.0,0.0,0.25,0.222973,0.500000,0.518891,0.202703,0.707692,0.653626,0.0
1,0.0,0.0,0.0,0.0,0.066116,0.068601,0.000000,0.25,0.0,0.0,0.0,0.25,0.222973,0.500000,0.518891,0.202703,0.707692,0.653626,0.0
2,0.0,0.0,0.0,0.0,0.066116,0.068601,0.000000,0.25,0.0,0.0,0.0,0.25,0.222973,0.500000,0.518891,0.202703,0.707692,0.653626,0.0
3,0.0,0.0,0.0,0.0,0.066116,0.068601,0.090404,0.25,0.0,0.0,0.0,0.25,0.222973,0.500000,0.518891,0.202703,0.707692,0.653626,0.0
4,0.0,0.0,0.0,0.0,0.066116,0.068601,0.090404,0.25,0.0,0.0,0.0,0.25,0.222973,0.500000,0.518891,0.202703,0.707692,0.653626,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2163948,0.0,0.0,0.0,0.0,0.407713,0.113164,0.000000,0.25,0.0,0.0,0.0,0.25,0.222973,0.404762,0.458813,0.202703,0.830769,0.750392,0.0
2163949,0.0,0.0,0.0,0.0,0.407713,0.113164,0.000000,0.25,0.0,0.0,0.0,0.25,0.222973,0.404762,0.458813,0.202703,0.830769,0.750392,0.0
2163950,0.0,0.0,0.0,0.0,0.407713,0.113164,0.000000,0.25,0.0,0.0,0.0,0.25,0.222973,0.404762,0.458813,0.202703,0.830769,0.750392,0.0
2163951,0.0,0.0,0.0,0.0,0.407713,0.113164,0.000000,0.25,0.0,0.0,0.0,0.25,0.222973,0.404762,0.458813,0.202703,0.830769,0.750392,0.0


## Divide the dataset

In [28]:
y = feature_frame_normalized["outcome"]
x = feature_frame_normalized.drop(columns=["outcome"])

In [39]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.20, stratify=y_train, random_state=42)

print(f"Train: {len(X_train)} ")
print(f"Valid: {len(X_valid)} ")
print(f"Test: {len(X_test)} ") 

Train: 1558045 
Valid: 389512 
Test: 216396 


### LogisticRegression

In [34]:
LogReg_model = LogisticRegression(
    class_weight="balanced", 
    random_state=42, 
    max_iter=500
    )

LogReg_model.fit(X_train, y_train)

In [41]:
y_pred_LogReg = LogReg_model.predict(X_valid)
print("Validación:\n", classification_report(y_valid, y_pred_LogReg))

Validación:
               precision    recall  f1-score   support

         0.0       0.99      0.85      0.92    383873
         1.0       0.06      0.63      0.11      5639

    accuracy                           0.85    389512
   macro avg       0.53      0.74      0.51    389512
weighted avg       0.98      0.85      0.91    389512



In [36]:
#error log-loss logaritmic loss o binary cross entropy

y_train_probs = LogReg_model.predict_proba(X_train)[:, 1]  # Probabilidad de la clase 1
train_loss = log_loss(y_train, y_train_probs)
print("Log Loss en entrenamiento:", train_loss)

Log Loss en entrenamiento: 0.5167879039319374


In [37]:
y_test_probs = LogReg_model.predict_proba(X_test)[:, 1]
test_loss = log_loss(y_test, y_test_probs)
print("Log Loss en validación:", test_loss)

Log Loss en validación: 0.5151003625474884


In [None]:
plot_metrics("LogisticRegression", y_pred=X_train, y_test=y_train)

ValueError: Found input variables with inconsistent numbers of samples: [1558045, 389512]

### RandomForestClassifier

In [None]:
# Definir el modelo con ajuste de pesos
model = RandomForestClassifier(
    n_estimators=100,   # Número de árboles
    max_depth=10,       # Profundidad máxima (ajústalo si es necesario)
    class_weight="balanced",  # Manejo del desbalance
    random_state=42,
    n_jobs=-1  # Usa todos los núcleos disponibles
    )

# Entrenar el modelo
model.fit(X_train, y_train)

### Test the model

In [None]:
y_valid_pred = model.predict(X_valid)
print("Validación:\n", classification_report(y_valid, y_valid_pred))

In [None]:
y_test_pred = model.predict(X_test)
print("Validación:\n", classification_report(y_test, y_test_pred))

### Save the model

In [None]:
joblib.dump(model, "models/model_MVP_RandomForestClassifier_01.pkl")