<a href="https://colab.research.google.com/github/NassimZahri/Data_Mining/blob/main/03_transformation_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 03 — Transformation & Feature Engineering
Encodage catégoriel, scaling, binning, variables temporelles, gestion des outliers, transformations log, etc.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load directly from GitHub raw URLs
base_url = 'https://raw.githubusercontent.com/NassimZahri/Data_Mining/main/data/'

# Load all CSV files; the `date` column of the sales table is parsed as datetime.
ventes = pd.read_csv(base_url + 'ventes.csv', parse_dates=['date'])
produits = pd.read_csv(base_url + 'produits.csv')
clients = pd.read_csv(base_url + 'clients.csv')
avis = pd.read_csv(base_url + 'avis.csv')

# Create the merged dataframe: attach each product's category to its sales rows.
# validate='many_to_one' makes the merge raise a MergeError if `product_id` is
# duplicated in `produits`, instead of silently duplicating sales rows.
# merge() already returns a new frame, so no extra .copy() is needed.
df = ventes.merge(
    produits[['product_id', 'category']],
    on='product_id',
    how='left',
    validate='many_to_one',
)

# Derived features: line total and calendar parts of the sale date.
# NOTE(review): the displayed column order suggests `ventes` already ships a
# `total` column; this assignment overwrites it — confirm that is intended.
df['total'] = df['price'] * df['quantity']
df['month'] = df['date'].dt.month
df['dow'] = df['date'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df.head()

Unnamed: 0,date,store,city,product_id,price,quantity,promo,total,category,month,dow
0,2023-01-01,Magasin_17,Marrakech,89,140.93,4,0,563.72,Bricolage,1,6
1,2023-01-01,Magasin_08,Fès,88,32.44,2,1,64.88,Technologie,1,6
2,2023-01-01,Magasin_20,Fès,42,35.78,4,0,143.12,Hygiène,1,6
3,2023-01-01,Magasin_19,Rabat,40,157.74,2,0,315.48,Textile,1,6
4,2023-01-01,Magasin_15,Agadir,45,175.49,4,0,701.96,Bricolage,1,6


## 1. Binning & Winsorization

In [2]:
# Bucket prices into fixed ranges; include_lowest=True keeps the first
# edge (0) inside the lowest bin.
price_edges = [0, 20, 50, 100, 200, np.inf]
df['price_bin'] = pd.cut(df['price'], bins=price_edges, include_lowest=True)

# Simple winsorization: cap `total` at its 1st and 99th percentiles.
total_bounds = df['total'].quantile([0.01, 0.99])
p1 = total_bounds.iloc[0]
p99 = total_bounds.iloc[1]
df['total_cap'] = df['total'].clip(lower=p1, upper=p99)
df[['total', 'total_cap']].head()


Unnamed: 0,total,total_cap
0,563.72,563.72
1,64.88,64.88
2,143.12,143.12
3,315.48,315.48
4,701.96,701.96


## 2. Encodage + Scaling via Pipeline

In [3]:
# Columns routed to each branch of the preprocessing step.
num_cols = ['price', 'quantity', 'month', 'dow']
cat_cols = ['category']

# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen at fit time ignored at transform time.
scale_step = ('num', StandardScaler(), num_cols)
onehot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocess = ColumnTransformer([scale_step, onehot_step])

# Single-step pipeline wrapping the preprocessing, fitted on the selected columns.
pipe = Pipeline([('prep', preprocess)])
X = df[[*num_cols, *cat_cols]]
Xt = pipe.fit_transform(X)
Xt.shape


(2000, 10)

## 3. EXERCICE
- Créez une variable `is_weekend` (1 si samedi/dimanche; 0 sinon) et mesurez son impact sur `total` (moyenne par groupe).
- Créez `price_per_unit = total / quantity` (attention aux divisions par 0).
- Ajoutez un encodage One-Hot pour la ville et refaites le `ColumnTransformer`.

### 1) Variable `is_weekend` et impact sur `total`

In [4]:
# Flag weekend days: `dow` comes from dt.dayofweek (Monday=0), so 5 and 6
# are Saturday and Sunday.
weekend_days = [5, 6]
weekend_mask = df['dow'].isin(weekend_days)
df['is_weekend'] = weekend_mask.astype(int)
df[['dow', 'is_weekend']].head()

Unnamed: 0,dow,is_weekend
0,6,1
1,6,1
2,6,1
3,6,1
4,6,1


In [5]:
# Mean `total` for weekdays (0) versus weekend days (1).
totals_by_weekend = df.groupby('is_weekend')['total']
impact_weekend = totals_by_weekend.mean()
print("Impact du week-end sur le total des ventes (0: semaine, 1: week-end) :")
print(impact_weekend)

Impact du week-end sur le total des ventes (0: semaine, 1: week-end) :
is_weekend
0    472.901220
1    481.384487
Name: total, dtype: float64


### 2) Variable `price_per_unit`

In [6]:
# Unit price recovered from the line total. Rows whose quantity is not
# strictly positive get 0 instead of the raw ratio, so the result of a
# division by zero is never kept.
unit_ratio = df['total'] / df['quantity']
df['price_per_unit'] = unit_ratio.where(df['quantity'] > 0, 0)
df[['quantity', 'total', 'price_per_unit']].head()

Unnamed: 0,quantity,total,price_per_unit
0,4,563.72,140.93
1,2,64.88,32.44
2,4,143.12,35.78
3,2,315.48,157.74
4,4,701.96,175.49


### 3) Encodage One-Hot de la ville et nouveau `ColumnTransformer`

In [9]:
# 'city' is already present in `df` (it comes from `ventes`), so no merge
# with `clients` is needed: `ventes` has no 'client_id' column to join on.

# Feature lists extended with the variables created in the exercise.
num_cols = ['price', 'quantity', 'month', 'dow', 'is_weekend', 'price_per_unit']
cat_cols = ['category', 'city']

# Rebuild the ColumnTransformer so that 'city' is one-hot encoded as well.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocess = ColumnTransformer([numeric_step, categorical_step])

# Fit the pipeline and transform the selected columns.
pipe = Pipeline([('prep', preprocess)])
X = df[[*num_cols, *cat_cols]]
Xt = pipe.fit_transform(X)

print(f"Forme de la matrice transformée : {Xt.shape}")
print("Colonnes utilisées :", num_cols + cat_cols)

Forme de la matrice transformée : (2000, 21)
Colonnes utilisées : ['price', 'quantity', 'month', 'dow', 'is_weekend', 'price_per_unit', 'category', 'city']
