In [29]:
import pandas as pd
import numpy as np

# Load Data
file_path = 'tayara_v3.csv' 
df = pd.read_csv(file_path)

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

df.loc[(df['Nature'].isnull()) & (df['Prix'] > 100000), 'Nature'] = 'À Vendre'

# Step 4: Check missing values after the update (optional, for verification)
print("Missing values after:", df['Nature'].isnull().sum())

Missing values per column:
Région                     0
Localisation            2070
Type de bien            1319
Nature                   105
Superficie               335
Nb_Salles De bain        315
Nb_Chambres              221
Prix                      15
Date_De_Modification    2240
Texte Annonce              0
dtype: int64
Missing values after: 31


In [30]:
# Step 2: Check missing values for 'Type de bien' (optional, for verification)
print("Missing values before:", df['Type de bien'].isnull().sum())

# Convert 'Nb_Chambres' to numeric, forcing errors to NaN
df['Nb_Chambres'] = pd.to_numeric(df['Nb_Chambres'], errors='coerce')

# Step 3: Fill missing values in 'Type de bien' based on the given rules
df.loc[(df['Type de bien'].isnull()) & 
       (df['Nb_Chambres'].between(1, 3)) & 
       (df['Nb_Salles De bain'].between(1, 2)), 'Type de bien'] = 'Appartement'

df.loc[(df['Type de bien'].isnull()) & 
       (df['Nb_Chambres'].between(3, 4)) & 
       (df['Nb_Salles De bain'].between(1, 2)), 'Type de bien'] = 'Maison'

df.loc[(df['Type de bien'].isnull()) & 
    (df['Nb_Chambres'] > 4) & 
    (df['Nb_Salles De bain'] >= 2), 'Type de bien'] = 'Villa'

# Step 4: Check missing values after the update (optional, for verification)
print("Missing values after:", df['Type de bien'].isnull().sum())

Missing values before: 1319
Missing values after: 295


In [35]:

from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# 1️⃣ Load Data
file_path = 'tayara_v3.csv' 
df = pd.read_csv(file_path)

# 2️⃣ Drop 'Date_De_Modification'
df.drop(columns=['Date_De_Modification'], inplace=True)

# 3️⃣ Handle Missing Values for 'Localisation' using KNN
df['Localisation_num'] = df['Localisation'].astype('category').cat.codes  # Convert 'Localisation' to numeric
localisation_imputer = KNNImputer(n_neighbors=5)
df['Localisation_num'] = localisation_imputer.fit_transform(df[['Localisation_num']])
localisation_mapping = dict(enumerate(df['Localisation'].astype('category').cat.categories))
df['Localisation'] = df['Localisation_num'].round().astype(int).map(localisation_mapping)
df.drop(columns=['Localisation_num'], inplace=True)

# 4️⃣ Handle Missing Values for 'Superficie'
df['Superficie'] = df.groupby('Type de bien')['Superficie'].transform(lambda x: x.fillna(x.median()))


df['Nb_Chambres'] = pd.to_numeric(df['Nb_Chambres'], errors='coerce')

# 5️⃣ Handle Missing Values for 'Nb_Chambres'
knn_imputer = KNNImputer(n_neighbors=5)
df[['Nb_Chambres']] = knn_imputer.fit_transform(df[['Nb_Chambres']])

# 6️⃣ Handle Missing Values for 'Nb_Salles De bain'
df['Nb_Salles De bain'] = df.groupby(['Type de bien', 'Localisation'])['Nb_Salles De bain'].transform(lambda x: x.fillna(x.mode().iloc[0]) if not x.mode().empty else x.fillna(1))

# 7️⃣ Drop rows where 'Type de bien' or 'Localisation' are missing
df.dropna(subset=['Type de bien', 'Localisation'], inplace=True)

# 🔍 Recheck for NaNs in critical columns
print(df["Type de bien"].count())
print(df['Type de bien'].value_counts())
print(df[['Superficie', 'Nb_Chambres', 'Nb_Salles De bain', 'Type de bien', 'Localisation']].isnull().sum())

# 8️⃣ Prepare data for regression
X = df.dropna(subset=['Prix'])[['Superficie', 'Nb_Chambres', 'Nb_Salles De bain', 'Type de bien', 'Localisation']]
y = df.dropna(subset=['Prix'])['Prix']

# 9️⃣ Convert categorical columns to numeric (Type de bien, Localisation)
X = pd.get_dummies(X, columns=['Type de bien', 'Localisation'], drop_first=True)

# 🔟 Train the regression model
regressor = LinearRegression()
regressor.fit(X, y)

# 1️⃣1️⃣ Prepare X_missing_prix and predict missing 'Prix'
X_missing_prix = df.loc[df['Prix'].isnull(), ['Superficie', 'Nb_Chambres', 'Nb_Salles De bain', 'Type de bien', 'Localisation']]

# Encode categorical features (Type de bien, Localisation) into dummies
X_missing_prix = pd.get_dummies(X_missing_prix, columns=['Type de bien', 'Localisation'], drop_first=True)

# Fill NaN in X_missing_prix
X_missing_prix.fillna({
    'Superficie': df['Superficie'].median(), 
    'Nb_Chambres': df['Nb_Chambres'].median(), 
    'Nb_Salles De bain': df['Nb_Salles De bain'].mode().iloc[0]
}, inplace=True)

# Align columns to match the training data
X_missing_prix = X_missing_prix.reindex(columns=X.columns, fill_value=0)

# Predict missing 'Prix'
predicted_prix = regressor.predict(X_missing_prix)

# Update missing prices in the dataframe
df.loc[df['Prix'].isnull(), 'Prix'] = predicted_prix

# ✅ Final Cleaned Data
print("All missing data handled successfully.")


# 🔥 Final Cleaned Data
df.reset_index(drop=True, inplace=True)

# 🗂️ Save cleaned data to CSV
df.to_csv('cleaned_tayara.csv', index=False)



1589
Type de bien
villa          1072
Maison          481
Appartement      36
Name: count, dtype: int64
Superficie           0
Nb_Chambres          0
Nb_Salles De bain    0
Type de bien         0
Localisation         0
dtype: int64
All missing data handled successfully.
