In [None]:
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option("display.max_columns", None)

In [None]:
# Path to the feature-frame CSV file (a single file, not a folder of parquet files).
# Defaults to the original local location; set the FEATURE_FRAME_PATH environment
# variable to run the notebook on another machine without editing this cell.
folder_path = Path(
    os.environ.get(
        "FEATURE_FRAME_PATH",
        "D:/Users/maick/Desktop/Codigos/zrive-ds/data/box_builder_dataset/feature_frame.csv",
    )
)

In [None]:
# Read the CSV at `folder_path` into a DataFrame and display it.
feature_frame = pd.read_csv(filepath_or_buffer=folder_path)
feature_frame

# Columns
* variant_id............................: -> Not included; the model will use the product characteristics instead
* product_type..........................: -> One Hot encoding
* order_id..............................:
* user_id...............................:
* created_at..........................: Same information as order_date plus the hour; it's unnecessary -> *Delete*
* order_date..........................: Nothing here depends on time, so I think it's unnecessary -> *Delete*
* user_order_seq.....................:
* outcome............................:
* ordered_before.....................:
* abandoned_before...................:
* active_snoozed.....................:
* set_as_regular.....................:
* normalised_price......................:
* discount_pct..........................:
* vendor................................:
* global_popularity..................:
* count_adults..........................:|
* count_children........................:|
* count_babies..........................:|-Imputed data
* count_pets............................:|
* people_ex_baby........................:|
* days_since_purchase_variant_id.....:
* avg_days_to_buy_variant_id.........:
* std_days_to_buy_variant_id.......:
* days_since_purchase_product_type.:
* avg_days_to_buy_product_type.....:
* std_days_to_buy_product_type.....:


Filter for orders with at least 5 products (>= 5)

In [None]:
# Sum `outcome` per order and keep only the orders whose total is at least 5
# (presumably outcome == 1 marks a purchased product, so this is the basket size - TODO confirm).
size_of_order = feature_frame.groupby("order_id").outcome.sum()
size_of_order = size_of_order[size_of_order>=5]
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments done in later cells do not trigger SettingWithCopyWarning.
feature_frame = feature_frame[feature_frame["order_id"].isin(size_of_order.index)].copy()
feature_frame

# Parse `created_at` and truncate it to midnight (drop the time of day).
# .assign returns a new frame instead of writing into a filtered slice, and
# .dt.normalize() replaces the datetime -> date -> datetime round trip.
feature_frame = feature_frame.assign(
    created_at=pd.to_datetime(feature_frame["created_at"]).dt.normalize()
)

In [None]:
# Class balance of the target column.
feature_frame["outcome"].value_counts()

In [None]:
# Undersample the negative class: drop a random half of the rows with outcome == 0.
# NOTE(review): the cell reassigns `feature_frame`, so re-running it removes
# another half of the remaining zeros - run it exactly once per kernel session.
zero_rows = feature_frame.loc[feature_frame["outcome"] == 0, "outcome"]
rows_to_drop = zero_rows.sample(frac=0.5, random_state=42).index
feature_frame = feature_frame.drop(index=rows_to_drop)

### Product type

In [None]:
# Product type: one-hot encode (one indicator column per product type) and
# drop the original categorical column.
product_type_dummies = pd.get_dummies(feature_frame["product_type"])
feature_frame = pd.concat([feature_frame, product_type_dummies], axis=1)
feature_frame = feature_frame.drop(columns="product_type")
# Drop a leftover "index" column if the CSV carries one; errors="ignore" keeps
# the cell from raising KeyError when that column is absent.
feature_frame = feature_frame.drop(columns="index", errors="ignore")

### order_ID

In [None]:
# Count encoding: replace each order_id with the number of rows that share it.
order_row_counts = feature_frame["order_id"].value_counts()
feature_frame["order_id"] = feature_frame["order_id"].map(order_row_counts)

In [None]:
# Categorical encoding: one indicator column per distinct order_id value,
# then remove the original column.
order_id_dummies = pd.get_dummies(feature_frame["order_id"])
feature_frame = pd.concat([feature_frame, order_id_dummies], axis=1).drop(columns="order_id")
feature_frame

### vendor

In [None]:
# Categorical encoding: one indicator column per vendor, then remove the
# original `vendor` column.
vendor_dummies = pd.get_dummies(feature_frame["vendor"])
feature_frame = pd.concat([feature_frame, vendor_dummies], axis=1).drop(columns="vendor")

In [None]:
# Count encoding: replace each vendor with the number of rows that share it.
# The one-hot cell above already drops `vendor`, so on a top-to-bottom run the
# column no longer exists here; the guard keeps Restart & Run All from failing
# and applies the count encoding only when `vendor` is still present.
if "vendor" in feature_frame.columns:
    counts = feature_frame["vendor"].value_counts()
    feature_frame["vendor"] = feature_frame["vendor"].map(counts)

### Delete: 
* user_id
* variant_id
* order_date
* created_at

In [None]:
# Remove the identifier and date columns listed in the markdown above in one call.
columns_to_delete = ["variant_id", "user_id", "order_date", "created_at"]
feature_frame = feature_frame.drop(columns=columns_to_delete)

### Preparing the Dataframe

In [None]:
# Multiply every column by 1 - presumably to turn boolean dummy columns into
# 0/1 integers (TODO confirm the dtypes produced by get_dummies in this environment).
feature_frame = feature_frame*1
# Cast every column label to str: the order_id dummy columns are labelled with
# the count values themselves rather than with strings.
feature_frame.columns = feature_frame.columns.astype(str)

In [None]:
# Min-max scale every column (the `outcome` column included) and rebuild a
# DataFrame with the original column labels.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation/test split below, so test-set minima/maxima influence the
# training features - consider fitting on the training split only.
scaler = MinMaxScaler()
feature_frame_normalized = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
)
feature_frame_normalized

## Divide the dataset

70 - 20 - 10


In [None]:
# Separate the target column from the feature matrix.
target_column = "outcome"
x = feature_frame_normalized.drop(target_column, axis=1)
y = feature_frame_normalized[target_column]

In [None]:
# Target proportions of the full dataset: 70% train / 20% validation / 10% test.
test_fraction = 0.10
valid_fraction = 0.20

# First hold out the test set (stratified on the target) ...
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    x, y, test_size=test_fraction, stratify=y, random_state=42
)
# ... then split the remainder. test_size is relative to the data passed in, so
# 20% of the whole dataset is 0.20 / 0.90 of the remaining 90% (a plain 0.20
# here would give a 72 / 18 / 10 split instead of 70 / 20 / 10).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    test_size=valid_fraction / (1 - test_fraction),
    stratify=y_train_valid,
    random_state=42,
)

print(f"Train: {len(X_train)} ")
print(f"Valid: {len(X_valid)} ")
print(f"Test: {len(X_test)} ")

### LogisticRegression

In [None]:
# Logistic regression with balanced class weights, fitted on the training split.
log_reg_params = {
    "class_weight": "balanced",
    "random_state": 42,
    "max_iter": 500,
}
model = LogisticRegression(**log_reg_params)

model.fit(X_train, y_train)

In [None]:
# Log loss (logarithmic loss, i.e. binary cross-entropy) on the training data.
train_probabilities = model.predict_proba(X_train)
y_train_probs = train_probabilities[:, 1]  # probability of class 1
train_loss = log_loss(y_train, y_train_probs)
print("Log Loss en entrenamiento:", train_loss)

In [None]:
# Log loss on the held-out test set. The printed label names the test set
# because the loss is computed on X_test / y_test, not on the validation split.
y_test_probs = model.predict_proba(X_test)[:, 1]  # probability of class 1
test_loss = log_loss(y_test, y_test_probs)
print("Log Loss en test:", test_loss)

In [None]:
# Predicted probability of class 1 on the test set
y_probs = model.predict_proba(X_test)[:, 1]

# ROC curve points and the area under the curve
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the chance diagonal
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # diagonal line (random guessing)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (Recall)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend()
plt.show()


In [None]:
# Precision and recall at every decision threshold (test set, using `y_probs`)
precision, recall, _ = precision_recall_curve(y_test, y_probs)

# Plot precision against recall
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.', color='green', label='Precision-Recall curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

In [None]:

# Compute precision and recall for different thresholds
# NOTE(review): this cell repeats the computation and plot of the
# precision-recall cell directly above it - confirm whether another split
# (e.g. validation) was intended here.
precision, recall, _ = precision_recall_curve(y_test, y_probs)

# Plot
plt.figure(figsize=(8,6))
plt.plot(recall, precision, marker='.', color='green', label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()


### RandomForestClassifier

In [None]:
# Definir el modelo con ajuste de pesos
model = RandomForestClassifier(
    n_estimators=100,   # Número de árboles
    max_depth=10,       # Profundidad máxima (ajústalo si es necesario)
    class_weight="balanced",  # Manejo del desbalance
    random_state=42,
    n_jobs=-1  # Usa todos los núcleos disponibles
    )

# Entrenar el modelo
model.fit(X_train, y_train)

### Test the model

In [None]:
# Classification report on the validation split.
y_valid_pred = model.predict(X_valid)
valid_report = classification_report(y_valid, y_valid_pred)
print("Validación:\n", valid_report)

In [None]:
# Classification report on the held-out test split; the label says "Test" so
# this output cannot be confused with the validation report.
y_test_pred = model.predict(X_test)
print("Test:\n", classification_report(y_test, y_test_pred))

### Save the model

In [None]:
# Persist the fitted model. The output directory is created first because
# joblib.dump does not create missing parent folders.
model_file = "models/model_MVP_RandomForestClassifier_01.pkl"
Path(model_file).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_file)