step 1 : Importation des bibliothèques et chargement des données

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
df = pd.read_csv("appartements-data-db-6872f0ba853ec096170787.csv")
df.head()

Unnamed: 0,title,price,city_name,salon,nb_rooms,nb_baths,surface_area,equipment,link
0,CMN-MA-1752 - Appartement à vendre à Palmier,2 000 000 DH,Casablanca,,2.0,2.0,168.0,Ascenseur/Balcon/Parking/Terrasse,https://www.avito.ma/fr/palmier/appartements/C...
1,66370-Vente Appt à Casablanca Hay Hassani de 1...,1 195 000 DH,Casablanca,,2.0,2.0,98.0,Ascenseur/Balcon/Chauffage/Climatisation/Cuisi...,https://www.avito.ma/fr/hay_hassani/appartemen...
2,Appartement à vendre 81 m² à Dar Bouazza,1 350 000 DH,Dar Bouazza,1.0,2.0,2.0,81.0,Ascenseur/Balcon/Chauffage/Climatisation/Conci...,https://www.avito.ma/fr/dar_bouazza/appartemen...
3,63860-Vente Appt à Casablanca Quartier Bd Med ...,900 000 DH,Casablanca,,1.0,1.0,56.0,Ascenseur/Chauffage/Climatisation/Cuisine Équi...,https://www.avito.ma/fr/centre_ville/apparteme...
4,Appartement à Rabat Agdal,3 100 000 DH,Rabat,2.0,3.0,2.0,200.0,Ascenseur/Balcon/Concierge/Parking/Sécurité,https://www.avito.ma/fr/agdal/appartements/App...


step 2 : Aperçu de la structure du dataset

In [2]:
df.info()
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1773 entries, 0 to 1772
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1772 non-null   object 
 1   price         1490 non-null   object 
 2   city_name     1772 non-null   object 
 3   salon         1620 non-null   float64
 4   nb_rooms      1490 non-null   float64
 5   nb_baths      1480 non-null   float64
 6   surface_area  1742 non-null   float64
 7   equipment     1402 non-null   object 
 8   link          1773 non-null   object 
dtypes: float64(4), object(5)
memory usage: 124.8+ KB


Unnamed: 0,title,price,city_name,salon,nb_rooms,nb_baths,surface_area,equipment,link
count,1772,1490,1772,1620.0,1490.0,1480.0,1742.0,1402,1773
unique,1471,355,77,,,,,321,1732
top,appartement à vendre,850 000 DH,Casablanca,,,,,Ascenseur/Balcon/Chauffage/Climatisation/Conci...,https://www.avito.ma/vi/55595022.htm
freq,33,34,626,,,,,84,4
mean,,,,1.267284,2.379195,2.307432,174.93341,,
std,,,,0.557539,0.667159,7.629128,2969.500693,,
min,,,,0.0,1.0,0.0,1.0,,
25%,,,,1.0,2.0,1.0,71.0,,
50%,,,,1.0,2.0,2.0,89.0,,
75%,,,,1.0,3.0,2.0,114.75,,


step 3 : Suppression des doublons et détection des valeurs manquantes

In [3]:
df.isnull().sum()
df.duplicated().sum()
df.drop_duplicates(inplace=True)

In [4]:
df.price.dtype

dtype('O')

step 4 : Conversion de la colonne `price` en float

In [5]:
df["price"] = df["price"].replace('[\Dh\s,]', '', regex=True).astype(float)

  df["price"] = df["price"].replace('[\Dh\s,]', '', regex=True).astype(float)


step 5 Nettoyage et uniformisation de `city_name`

In [6]:
df["city_name"].fillna("Unknown", inplace=True)

replace_dict = {
    "الدار البيضاء": "Casablanca",
    "الرباط": "Rabat",
    "فاس": "Fès",
    "مراكش": "Marrakech",
    # Ajouter d'autres villes si nécessaire
}
df["city_name"] = df["city_name"].replace(replace_dict)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["city_name"].fillna("Unknown", inplace=True)


step 6 Transformation de `equipment` en colonnes booléennes

In [7]:
equipment_dummies = df["equipment"].str.get_dummies(sep="/")
df = pd.concat([df, equipment_dummies], axis=1)
df.drop(["equipment", "link"], axis=1, inplace=True)


step 7 Remplir les valeurs manquantes

In [8]:
for col in df.select_dtypes(include='number').columns:
    df[col].fillna(df[col].median(), inplace=True)

# 
for col in df.select_dtypes(include='object').columns:
    df[col].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

step 8 Suppression des valeurs aberrantes 

In [9]:
def remove_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return data[(data[column] >= lower) & (data[column] <= upper)]

df = remove_outliers_iqr(df, "price")
df = remove_outliers_iqr(df, "surface_area")

step 9 Encodage de `city_name` et mise à l’échelle des variable

In [10]:
le = LabelEncoder()
df["city_name"] = le.fit_transform(df["city_name"])

scaler = StandardScaler()
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = scaler.fit_transform(df[num_cols])

Step 10  Définir les variables x (features) et y (target)

On définit :
- y comme variable cible = la colonne price
- x comme les variables explicatives (features), toutes les colonnes sauf price

In [11]:
y = df["price"]
X = df.drop("price", axis=1)

step 11  Séparation des données en train/test (80% / 20%)

Utilisation de train_test_split pour diviser les données en :
- `X_train`, `X_test` (features)
- `y_train`, `y_test` (target)

In [12]:
from sklearn.model_selection import train_test_split
X = X.select_dtypes(include=['number'])  

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

step 12 : Entraînement de plusieurs modèles de régression

Modèles testés :
. Régression Linéaire
. Random Forest Regressor
. SVR
. Gradient Boosting Regressor

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "SVR": SVR(),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

trained_models = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model


step 13 Évaluation des performances avec des métriques

Métriques utilisées :
. MSE (Mean Squared Error)
. RMSE (Root Mean Squared Error)
. MAE (Mean Absolute Error)
. R² Score

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"🔹 {name}")
    print(f"  R²: {r2:.4f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  MAE: {mae:.2f}")
    print("-" * 30)

🔹 Linear Regression
  R²: 0.2817
  RMSE: 0.81
  MAE: 0.61
------------------------------
🔹 Random Forest
  R²: 0.4139
  RMSE: 0.73
  MAE: 0.49
------------------------------
🔹 SVR
  R²: 0.3036
  RMSE: 0.80
  MAE: 0.58
------------------------------
🔹 Gradient Boosting
  R²: 0.4149
  RMSE: 0.73
  MAE: 0.50
------------------------------


step 14 Validation croisée 


In [15]:
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X, y, scoring="r2", cv=5)
    print(f"🔄 {name} - Moyenne R²: {scores.mean():.4f}")


🔄 Linear Regression - Moyenne R²: 0.2709
🔄 Random Forest - Moyenne R²: 0.3269


KeyboardInterrupt: 

step 15 Optimisation avec GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2')
grid.fit(X_train, y_train)

print("✅ Best parameters:", grid.best_params_)
print("📈 Best R² score:", grid.best_score_)


✅ Best parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 150}
📈 Best R² score: 0.34708376986549705


step 16 Sauvegarde du meilleur modèle (avec pickle)

In [None]:
import pickle

best_model = grid.best_estimator_
with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)


step 17 Charger le modèle sauvegardé (`model.pkl`)

In [None]:
import pickle

with open("model.pkl", "rb") as f:
    model = pickle.load(f)

step 18  Créer une fonction de prédiction  "predict_price(data)"

Cette fonction :
- Prend en entrée un dictionnaire contenant les caractéristiques d'un appartement
- Le transforme en DataFrame
- Applique le même prétraitement (LabelEncoding, scaling, etc.)
- Puis retourne le prix prédit

In [None]:
import pandas as pd

def predict_price(input_data):
    # Exemple: input_data = {"surface_area": 80, "rooms": 3, "city_name": "Casablanca", ...}
    
    # Transformer en DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Appliquer les mêmes transformations qu'avant (attention aux encodages!)
    # Exemple: encoder la ville si vous l’avez utilisée dans votre modèle
    # Assurez-vous que les colonnes correspondent exactement à celles utilisées pour l’entraînement
    
    # Pour simplifier : on suppose ici que input_df est déjà prêt (avec mêmes colonnes et scaling)
    
    prediction = model.predict(input_df)
    return prediction[0]

step 19  Tester la fonction avec des données exemples

On simule l'entrée d'un utilisateur (par exemple dans une application web) et on affiche le prix prédit.

In [None]:
sample = {
    "city_name": 4,  # déjà encodé
    "salon": 0,      # valeur par défaut
    "nb_rooms": 3,
    "nb_baths": 2,
    "surface_area": 90,
    "Ascenseur": 0,
    "Balcon": 0,
    "Chauffage": 0,
    "Climatisation": 0,
    "Concierge": 0,
    "Cuisine Équipée": 0,
    "Duplex": 0,
    "Meublé": 0,
    "Parking": 0,
    "Sécurité": 0,
    "Terrasse": 0
}

predicted_price = predict_price(sample)
print(f"💰 Prix estimé : {predicted_price:.2f} MAD")

step 20  Afficher un graphique : Prédiction    Versus     Réalité

Permet de visualiser la performance globale du modèle

In [None]:
import matplotlib.pyplot as plt

y_pred = model.predict(X_test)

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color="teal")
plt.xlabel("Prix réel")
plt.ylabel("Prix prédit")
plt.title("📊 Comparaison : Réel vs Prédit")
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--")
plt.grid(True)
plt.show()
