Imports & chargement

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Listings CSV, addressed relative to the current working directory.
DATA_PATH = Path("..").joinpath("data", "clean_data", "location_all_sources.csv")

# The "utf-8-sig" codec strips a leading byte-order mark when the file has one.
df = pd.read_csv(DATA_PATH, encoding="utf-8-sig")

# Show the first rows as a quick sanity check of the load.
df.head()


Unnamed: 0,ville,prix,surface,quartier,type_bien,nb_chambres,nb_salle_de_bain,url_annonce,source
0,Casablanca,20000.0,48.0,Maarif,Appartement,5,1,https://www.avito.ma/fr/maarif/appartements/__...,Avito
1,Tanger,6500.0,90.0,Castilla,Appartement,4,1,https://www.avito.ma/fr/castilla/appartements/...,Avito
2,Casablanca,17000.0,155.0,Racine,Appartement,3,1,https://www.avito.ma/fr/racine/appartements/CM...,Avito
3,Casablanca,2300.0,56.0,Oulfa,Appartement,3,1,https://www.avito.ma/fr/oulfa/appartements/App...,Avito
4,Casablanca,5500.0,90.0,Route d'Azemmour,Appartement,3,1,https://www.avito.ma/fr/route_d_azemmour/appar...,Avito


Normalisation

In [2]:
# Lower-cased spelling variants (Latin and Arabic script) -> canonical city label.
# Kept outside the function so the table is built once, not once per row.
_CITY_ALIASES = {
    "casablanca": "Casablanca", "الدار البيضاء": "Casablanca",
    "rabat": "Rabat", "الرباط": "Rabat",
    "marrakech": "Marrakech", "مراكش": "Marrakech",
    "tanger": "Tanger", "طنجة": "Tanger"
}


def normalize_city(v):
    """Return a canonical city label for a raw city value.

    Parameters
    ----------
    v : scalar
        Raw city value; converted with ``str`` before matching.

    Returns
    -------
    str or None
        ``None`` when ``v`` is missing (``pd.isna``) or contains only
        whitespace. Otherwise the canonical name of a known alias, or the
        title-cased cleaned value when no alias matches.
    """
    if pd.isna(v):
        return None

    # Trim the ends and collapse internal runs of whitespace so that
    # multi-word aliases still match when the source has stray spaces.
    key = " ".join(str(v).split()).lower()

    # A blank string is not null for pandas, so it would survive a later
    # dropna() as an empty city label; report it as missing instead.
    if not key:
        return None

    return _CITY_ALIASES.get(key, key.title())

# Overwrite the city column with its normalized form so that spelling and
# script variants of the same city share a single label.
df["ville"] = df["ville"].apply(normalize_city)


Préparation ML

In [4]:
# Name of the column the models will learn to predict.
target = "prix"

# Model inputs; "source" is the newly added variable.
features = ["ville", "type_bien", "surface", "nb_chambres", "nb_salle_de_bain", "source"]

# Columns that must be numeric; values that cannot be parsed become NaN.
numeric_cols = ["surface", "nb_chambres", "nb_salle_de_bain", "prix"]

selected = df[[*features, target]]
coerced = {col: pd.to_numeric(selected[col], errors="coerce") for col in numeric_cols}

# assign() returns a new frame, so `df` itself is left untouched.
# Rows with any missing value (including failed conversions) are dropped.
df_ml = selected.assign(**coerced).dropna().reset_index(drop=True)


Filtres + log(prix)

In [5]:
# NOTE: this cell reads and rebinds `df_ml`; running it twice in a row trims
# the already-filtered frame again because the percentile bounds are recomputed.

# Business filters: keep prices within [500, 100_000] ...
price_in_range = df_ml["prix"].between(500, 100_000)
df_ml = df_ml.loc[price_in_range].assign(prix_m2=lambda d: d["prix"] / d["surface"])

# ... and a price-to-surface ratio (`prix_m2`) within [10, 2000].
df_ml = df_ml.loc[df_ml["prix_m2"].between(10, 2000)]

# Statistical outliers: keep prices between the 1st and 99th percentiles
# of the rows that passed the business filters.
q1, q99 = df_ml["prix"].quantile([0.01, 0.99])
df_ml = df_ml.loc[df_ml["prix"].between(q1, q99)]

# Target: log(1 + prix).
df_ml = df_ml.assign(log_prix=np.log1p(df_ml["prix"]))


Split

In [6]:
# Model inputs and log-price target taken from the filtered frame.
X = df_ml.loc[:, features]
y = df_ml.loc[:, "log_prix"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


Preprocessing

In [7]:
# Columns routed to one-hot encoding vs. standard scaling.
cat_features = ["ville", "type_bien", "source"]
num_features = ["surface", "nb_chambres", "nb_salle_de_bain"]

# handle_unknown="ignore": categories absent at fit time do not raise at
# transform time.
column_steps = [
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


Modèles & évaluation

In [8]:
# Candidate regressors, keyed by the label shown in the results table.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    RandomForest=RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
)

# One record per model; both metrics are computed on the log-price target.
results = []

for name, model in models.items():
    # Every pipeline is built around the same `preprocessor` object, which is
    # refitted on the training data each time fit() is called.
    pipe = Pipeline(steps=[("prep", preprocessor), ("model", model)]).fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    mse_log = mean_squared_error(y_test, y_pred)
    record = {"Model": name, "R2_log": r2_score(y_test, y_pred), "RMSE_log": mse_log ** 0.5}
    results.append(record)

# Best model first (highest R² on the held-out set).
results_df_multi = pd.DataFrame(results).sort_values("R2_log", ascending=False)
results_df_multi


Unnamed: 0,Model,R2_log,RMSE_log
3,GradientBoosting,0.666564,0.388366
2,RandomForest,0.653399,0.395958
1,Ridge,0.487496,0.481485
0,LinearRegression,0.487489,0.481488
