In [1]:
import pandas as pd
import numpy as np
import random

# --- Étape 1 : Création de products_data ---
num_products = 50
products_data = []

departments = ["Department_25", "Department_31", "Department_10"]
classes = {"Department_25": ["Class_2570", "Class_2571"], "Department_31": ["Class_3123", "Class_3124"], "Department_10": ["Class_1050"]}
subclasses = {cls: [f"SubClass_{i}" for i in range(10000, 10005)] for cls in sum(classes.values(), [])}
brands = ["Brand_A", "Brand_B", "Brand_C", "Brand_D"]

for i in range(num_products):
    department = random.choice(departments)
    class_key = random.choice(classes[department])
    subclass_key = random.choice(subclasses[class_key])
    brand = random.choice(brands)
    product_id = f"Product_{10000 + i}"
    description = f"{random.randint(100, 1000)}G {random.choice(['Biscuits', 'Pâtes', 'Savons'])}"
    products_data.append({
        "product_id": product_id,
        "product_description": description,
        "department_key": department,
        "class_key": class_key,
        "subclass_key": subclass_key,
        "sector": "PGC",
        "brand_key": brand,
        "shelf_level1": department,
        "shelf_level2": class_key,
        "shelf_level3": subclass_key,
        "shelf_level4": None,
        "bio": random.choice([0, 1]),
        "sugar_free": random.choice([0, 1]),
        "gluten_free": random.choice([0, 1]),
        "vegan": random.choice([0, 1]),
        "lactose_free": random.choice([0, 1]),
        "carrefour_brand": random.choice([0, 1])
    })

products_data = pd.DataFrame(products_data)

# --- Étape 2 : Création des préférences clients ---
n_customers = 500
customer_ids = [f"Household_{i}" for i in range(1, n_customers + 1)]

# Associer des clients à leurs produits préférés
customer_preferences = {
    customer: random.sample(products_data['product_id'].tolist(), random.randint(5, 10)) for customer in customer_ids
}

# --- Étape 3 : Générer train_data ---
n_train = 10000
start_date = pd.to_datetime("2022-01-01")
end_date = pd.to_datetime("2023-12-31")

train_data = []
for _ in range(n_train):
    customer_id = random.choice(customer_ids)
    product_id = random.choice(customer_preferences[customer_id])
    date = pd.to_datetime(np.random.randint(start_date.value, end_date.value), unit='ns')
    train_data.append({
        "date": date.strftime('%Y-%m-%d'),
        "transaction_id": f"Transaction_{random.randint(1, 1000000)}",
        "customer_id": customer_id,
        "product_id": product_id,
        "has_loyalty_card": random.choice([0, 1]),
        "store_id": f"Store_{random.randint(1, 10)}",
        "is_promo": random.choice([0, 1]),
        "quantity": random.randint(1, 5),
        "format": random.choice(["DRIVE", "DELIVERY", "PICKUP"]),
        "order_channel": random.choice(["MOBILE_APP", "WEBSITE", "IN_STORE"])
    })

train_data = pd.DataFrame(train_data)

# --- Étape 4 : Générer test_data basé sur train_data ---
n_test = 2000
test_data = []

for _ in range(n_test):
    customer_id = random.choice(customer_ids)
    if random.random() < 0.8:
        product_id = random.choice(customer_preferences[customer_id])
    else:
        product_id = random.choice(products_data['product_id'].tolist())
    test_data.append({
        "transaction_id": f"Transaction_{random.randint(1, 1000000)}",
        "customer_id": customer_id,
        "product_id": product_id
    })

test_data = pd.DataFrame(test_data)

# --- Validation ---
print("Products Data Sample:")
print(products_data.head())
print("\nTrain Data Sample:")
print(train_data.head())
print("\nTest Data Sample:")
print(test_data.head())


Products Data Sample:
      product_id product_description department_key   class_key  \
0  Product_10000       841G Biscuits  Department_10  Class_1050   
1  Product_10001          629G Pâtes  Department_31  Class_3123   
2  Product_10002       831G Biscuits  Department_25  Class_2570   
3  Product_10003       102G Biscuits  Department_25  Class_2570   
4  Product_10004       477G Biscuits  Department_31  Class_3124   

     subclass_key sector brand_key   shelf_level1 shelf_level2  \
0  SubClass_10000    PGC   Brand_C  Department_10   Class_1050   
1  SubClass_10003    PGC   Brand_D  Department_31   Class_3123   
2  SubClass_10003    PGC   Brand_B  Department_25   Class_2570   
3  SubClass_10001    PGC   Brand_B  Department_25   Class_2570   
4  SubClass_10002    PGC   Brand_A  Department_31   Class_3124   

     shelf_level3 shelf_level4  bio  sugar_free  gluten_free  vegan  \
0  SubClass_10000         None    1           0            1      1   
1  SubClass_10003         None    0 

In [2]:
!pip install scikit-learn --upgrade



In [3]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical


# Séparer X et y
X = train_data.drop(columns=["product_id"])  # Toutes les colonnes sauf product_id
y = train_data["product_id"]  # La cible (product_id)

# Encodage de la variable cible (product_id)
product_encoder = LabelEncoder()
y_encoded = product_encoder.fit_transform(y)

# Encodage des colonnes catégorielles dans X
categorical_columns = X.select_dtypes(include=["object"]).columns

# Appliquer OneHotEncoder sur les colonnes catégorielles
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore") # sparse was deprecated in 1.2 and replaced with sparse_output
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_columns]))

# Ajouter les colonnes numériques
numerical_columns = X.select_dtypes(exclude=["object"]).columns
X_encoded = X_encoded.astype(np.float32)

# Conversion en matrices numpy
X_train = X_encoded.to_numpy()
y_train = to_categorical(y_encoded, num_classes=len(product_encoder.classes_))

print(f"Dimensions de X_train : {X_train.shape}")
print(f"Dimensions de y_train : {y_train.shape}")


Dimensions de X_train : (10000, 11199)
Dimensions de y_train : (10000, 50)


In [4]:
# Encodage des données de test
X_test = test_data[['transaction_id', 'customer_id']]  # Select only the columns present in test_data and used for training

# Get categorical columns present in both train and test data
categorical_columns_test = list(set(categorical_columns) & set(X_test.columns))

# Create missing columns in X_test and fill with a placeholder value (e.g., 0)
# We fill with 0 because the numerical columns are related to the training data and do not exist in the test data.
# Instead of 'missing', we use 0 to represent the absence of these features in the test data.
for col in numerical_columns:
    if col not in X_test.columns:
        X_test[col] = 0

# Create missing columns in X_test and fill with a placeholder value (e.g., 'missing')
for col in categorical_columns:
    if col not in X_test.columns:
        X_test[col] = 'missing'  # Or any other suitable placeholder

# Now you can safely apply the transform
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_columns]))

# Continue with the rest of your code...
x_test_final = pd.concat([X_test_encoded, X_test[numerical_columns].reset_index(drop=True)], axis=1)
X_test_final = x_test_final.to_numpy()

In [6]:
def predict_product(model, top_n=10):
    """
    Génère les produits recommandés pour chaque instance dans X_train.

    Args:
        model: Le modèle entraîné.
        top_n: Le nombre de produits recommandés (par défaut : 10).

    Returns:
        Une liste de produits recommandés pour chaque exemple d'entraînement.
    """
    # Prédire les probabilités pour les données d'entraînement
    prediction = model.predict(X_train)

    # Obtenir les indices des produits les plus probables
    top_products = np.argsort(prediction, axis=1)[:, -top_n:][:, ::-1]

    # Décoder les produits pour retourner les identifiants originaux
    return [product_encoder.inverse_transform(top_products[i]) for i in range(len(top_products))]


In [11]:
def hitrate_at_k(test_data, predicted_products, k=10):
    """
    Calcule le Hitrate@K.

    Args:
        test_data: Le jeu de données de test contenant les produits réellement achetés.
        predicted_products: Liste des produits recommandés par le modèle.
        k: Le nombre de recommandations à considérer (par défaut : 10).

    Returns:
        Le score Hitrate@K.
    """
    hits = 0

    for i, true_product in enumerate(test_data["product_id"]):
        if true_product in predicted_products[i][:k]:  # Vérifier si le vrai produit est dans les top K recommandations
            hits += 1

    return hits / len(test_data)


# Calcul du Hitrate@10
hitrate = hitrate_at_k(test_data, predicted_products, k=10)
print(f"Hitrate@10 : {hitrate:.2f}")


Hitrate@10 : 0.56


In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

# Assuming X_train and X_test are your DataFrames

# Convert X_train and X_test back to DataFrames if they are numpy arrays
# Make sure to provide the original columns if they were lost during the conversion to numpy
# For example:
# original_columns = [...]  # list of your original column names
# X_train = pd.DataFrame(X_train, columns=original_columns)
# X_test = pd.DataFrame(X_test, columns=original_columns)

# 1. Identify categorical and numerical columns:
categorical_cols = X.select_dtypes(include=['object']).columns # Use the original DataFrame 'X'
numerical_cols = X.select_dtypes(exclude=['object']).columns # Use the original DataFrame 'X'

# 2. Encode categorical features:
# Create a OneHotEncoder instance with handle_unknown='ignore' to avoid errors on new categories in the test data
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X[categorical_cols]) # Use the original DataFrame 'X'
X_test_encoded = encoder.transform(X_test[categorical_cols]) # Use transform, not fit_transform, for the test data

# 3. Combine numerical and encoded categorical features:
X_train_final = np.concatenate([X[numerical_cols].values, X_train_encoded], axis=1) # Use the original DataFrame 'X'
X_test_final = np.concatenate([X_test[numerical_cols].values, X_test_encoded], axis=1)

# 4. Now apply PCA:
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(X_train_final)
X_test_pca = pca.transform(X_test_final)

### LGBM

In [8]:
import lightgbm as lgb

model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=len(product_encoder.classes_),
    max_depth=4,
    learning_rate=0.2,
    n_estimators=50,
    n_jobs=-1
)
model.fit(X_train_pca, y_encoded)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 100
[LightGBM] [Info] Start training from score -3.825845
[LightGBM] [Info] Start training from score -4.062846
[LightGBM] [Info] Start training from score -3.763603
[LightGBM] [Info] Start training from score -4.104395
[LightGBM] [Info] Start training from score -3.789805
[LightGBM] [Info] Start training from score -3.932226
[LightGBM] [Info] Start training from score -3.947650
[LightGBM] [Info] Start training from score -3.858482
[LightGBM] [Info] Start training from score -3.968593
[LightGBM] [Info] Start training from score -3.887330
[LightGBM] [Info] Start training from score -3.937341
[LightGBM] [Info] Start training from score -4.098353
[LightGBM] [Info] Start training from score -4.104395
[LightG



In [9]:
proba = model.predict_proba(X_test_pca)
top_k_products = np.argsort(proba, axis=1)[:, -10:][:, ::-1]
predicted_products = [
    product_encoder.inverse_transform(top_k_products[i])
    for i in range(len(top_k_products))
]




In [12]:
hitrate = hitrate_at_k(test_data, predicted_products, k=10)
print(f"Hitrate@10 : {hitrate:.2f}")


Hitrate@10 : 0.56


### XGboost

In [13]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Créez un modèle XGBoost
model_xgb = xgb.XGBClassifier(
    objective="multi:softmax",  # Classification multi-classes
    num_class=len(product_encoder.classes_),  # Le nombre de classes (produits)
    max_depth=4,  # Profondeur réduite pour éviter le surapprentissage
    learning_rate=0.2,  # Taux d'apprentissage
    n_estimators=50,  # Nombre d'arbres
    n_jobs=-1,  # Utiliser plusieurs cœurs pour l'entraînement
    tree_method="hist",  # Méthode optimisée pour les grands ensembles de données
    use_label_encoder=False  # Désactive l'encodeur de labels de XGBoost, car ce n'est plus nécessaire
)

# Entraînement avec XGBoost sur les données PCA
model_xgb.fit(X_train_pca, y_encoded)

# Prédictions pour les données de test
proba_xgb = model_xgb.predict_proba(X_test_pca)

# Obtenez les indices des produits les plus probables
top_k_products_xgb = np.argsort(proba_xgb, axis=1)[:, -10:][:, ::-1]

# Décoder les produits pour obtenir les identifiants originaux
predicted_products_xgb = [
    product_encoder.inverse_transform(top_k_products_xgb[i])
    for i in range(len(top_k_products_xgb))
]

# Calculer le Hitrate@10
hitrate_xgb = hitrate_at_k(test_data, predicted_products_xgb, k=10)
print(f"Hitrate@10 avec XGBoost : {hitrate_xgb:.2f}")


Parameters: { "use_label_encoder" } are not used.



Hitrate@10 avec XGBoost : 0.58
