In [2]:
%pip install scikit-learn





[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

# Charger et nettoyer les données comme précédemment
df_train = pd.read_csv("https://raw.githubusercontent.com/dataafriquehub/donnee_vente/refs/heads/main/train.csv")
df_submission = pd.read_csv("https://raw.githubusercontent.com/dataafriquehub/donnee_vente/refs/heads/main/submission.csv")

# Nettoyage des données
df_train = df_train.dropna()

# Feature Engineering avancé
df_train['date'] = pd.to_datetime(df_train['date'])
df_train['year'] = df_train['date'].dt.year
df_train['month'] = df_train['date'].dt.month
df_train['day'] = df_train['date'].dt.day
df_train['dayofweek'] = df_train['date'].dt.dayofweek
df_train['is_weekend'] = df_train['date'].dt.dayofweek.isin([5, 6]).astype(int)
df_train['quarter'] = df_train['date'].dt.quarter

# Ajout d'interactions entre variables
df_train['region_weather'] = df_train['region'] + "_" + df_train['condition_meteo']
df_train['season'] = pd.cut(df_train['month'], bins=[0,3,6,9,12], labels=['Winter', 'Spring', 'Summer', 'Fall'])

df_train.drop(['date', 'id_produit'], axis=1, inplace=True)

# Encodage des variables catégoriques
df_train = pd.get_dummies(df_train, columns=['categorie', 'marque', 'condition_meteo', 'region', 
                                            'moment_journee', 'region_weather', 'season'], drop_first=True)

# Séparation des caractéristiques et de la cible
X = df_train.drop(['quantite_vendue'], axis=1)
y = df_train['quantite_vendue']

# Normalisation des caractéristiques
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Split des données
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Définition des hyperparamètres pour la recherche
params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Recherche des meilleurs hyperparamètres
rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=params,
                                 n_iter=20, cv=5, random_state=42, n_jobs=-1,
                                 scoring='neg_mean_absolute_percentage_error')
random_search.fit(X_train, y_train)

# Utilisation du meilleur modèle
best_rf = random_search.best_estimator_

# Entraînement des autres modèles
gbr = GradientBoostingRegressor(random_state=42)
abr = AdaBoostRegressor(random_state=42)

gbr.fit(X_train, y_train)
abr.fit(X_train, y_train)

# Ensemble learning
ensemble_model = VotingRegressor(estimators=[('rf', best_rf), ('gbr', gbr), ('abr', abr)])
ensemble_model.fit(X_train, y_train)

# Prédictions
y_pred = ensemble_model.predict(X_test)
print(f"MAPE avec l'ensemble learning: {mean_absolute_percentage_error(y_test, y_pred):.2f}")

# Importance des caractéristiques
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 des caractéristiques les plus importantes:")
print(feature_importance.head(10))

# Préparation des données de soumission
submission_ids = df_submission['id_produit'].copy()
df_submission['date'] = pd.to_datetime(df_submission['date'])
df_submission['year'] = df_submission['date'].dt.year
df_submission['month'] = df_submission['date'].dt.month
df_submission['day'] = df_submission['date'].dt.day
df_submission['dayofweek'] = df_submission['date'].dt.dayofweek
df_submission['is_weekend'] = df_submission['date'].dt.dayofweek.isin([5, 6]).astype(int)
df_submission['quarter'] = df_submission['date'].dt.quarter
df_submission['region_weather'] = df_submission['region'] + "_" + df_submission['condition_meteo']
df_submission['season'] = pd.cut(df_submission['month'], bins=[0,3,6,9,12], labels=['Winter', 'Spring', 'Summer', 'Fall'])

df_submission.drop(['date', 'id_produit'], axis=1, inplace=True)
df_submission = pd.get_dummies(df_submission, columns=['categorie', 'marque', 'condition_meteo', 'region', 
                                                      'moment_journee', 'region_weather', 'season'], drop_first=True)

# Aligner les colonnes avec l'ensemble d'entraînement
X_submission = df_submission.reindex(columns=X.columns, fill_value=0)
X_submission_scaled = scaler.transform(X_submission)

# Prédictions finales
y_submission_pred = ensemble_model.predict(X_submission_scaled)

# Création du fichier de soumission
final_submission = pd.DataFrame({
    'id_produit': submission_ids,
    'quantite_vendue': y_submission_pred
})

final_submission.to_csv('submission.csv', index=False)

MAPE avec l'ensemble learning: 0.23

Meilleurs paramètres trouvés: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}

Top 10 des caractéristiques les plus importantes:
                             feature  importance
2                          promotion    0.229353
32                      region_Rural    0.105679
33                     region_Urbain    0.060902
5                   stock_disponible    0.053758
0                         Unnamed: 0    0.050505
1                      prix_unitaire    0.048558
8                                day    0.041814
43      region_weather_Rural_Orageux    0.027849
7                              month    0.025662
45  region_weather_Urbain_Ensoleillé    0.023190


