In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming df_encoded is your dataframe with one-hot encoded features
df = pd.read_csv('../Amine/cleaned_tayara_final.csv')

df_encoded = pd.get_dummies(df, columns=['Type de bien', 'Localisation'])

# Convert boolean columns to integers (if applicable)
df_encoded = df_encoded.astype({col: int for col in df_encoded.select_dtypes(include=['bool']).columns})


df_encoded = df_encoded.apply(pd.to_numeric, errors='coerce')

# Step 1: Prepare the features and target
X = df_encoded.drop(columns=['Prix'])  # Features
y = df_encoded['Prix']  # Target

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Create the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Step 4: Train the model
rf_model.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = rf_model.predict(X_test)

# Evaluate the performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")


# Step 6: Make predictions (for new data, if you have any)
new_data = {
    'Superficie': [200],  # Square meters
    'Nb_Salles De bain': [2],  # Number of bathrooms
    'Nb_Chambres': [4],  # Number of bedrooms
    'Type de bien_Appartement': [0],  # 0 for "Appartement"
    'Type de bien_Maison': [0],  # 1 for "Maison"
    'Type de bien_villa': [1],  # 0 for "Villa"
    'Localisation_Ariana': [0],
    'Localisation_Ben_Arous': [1],
    'Localisation_Bizerte': [0],
    'Localisation_Bja': [0],
    'Localisation_Gabs': [0],
    'Localisation_Gafsa': [0],
    'Localisation_Jendouba': [0],
    'Localisation_Kairouan': [0],
    'Localisation_Kasserine': [0],
    'Localisation_La_Manouba': [0],
    'Localisation_Le_Kef': [0],
    'Localisation_Mahdia': [0],
    'Localisation_Mdenine': [0],
    'Localisation_Monastir': [0],
    'Localisation_Nabeul': [0],
    'Localisation_Sfax': [0],
    'Localisation_Sidi_Bouzid': [0],
    'Localisation_Siliana': [0],
    'Localisation_Sousse': [0],
    'Localisation_Tozeur': [0],
    'Localisation_Tunis': [0],  # 1 for "Tunis"
    'Localisation_Zaghouan': [0]
}
predicted = rf_model.predict(pd.DataFrame(new_data))
print(f"Predicted price: {predicted[0]:.2f} TND")
# predictions = rf_model.predict(new_data)


Mean Absolute Error (MAE): 139023.91
Mean Squared Error (MSE): 38153546786.69
Predicted price: 483437.57 TND


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm

X = df_encoded.drop(['Prix'], axis=1)
y = df_encoded['Prix']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

X = sm.add_constant(X)  
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Prix   R-squared:                       0.574
Model:                            OLS   Adj. R-squared:                  0.566
Method:                 Least Squares   F-statistic:                     71.07
Date:                Thu, 19 Dec 2024   Prob (F-statistic):          3.47e-232
Time:                        01:22:07   Log-Likelihood:                -18863.
No. Observations:                1396   AIC:                         3.778e+04
Df Residuals:                    1369   BIC:                         3.792e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [31]:
def predict_prix(superficie, nb_salles_de_bain, nb_chambres, type_bien, localisation):
    # Coefficients from the OLS regression results
    const = 1.027e+05
    coef_superficie = 258.1053
    coef_sdb = 4.932e+04
    coef_chambres = 2.119e+04

    # Dummy variables for property type (Appartement, Maison, Villa)
    coef_type_appartement = -3.89e+04
    coef_type_maison = -4.91e+04
    coef_type_villa = 1.907e+05

    # Coefficients for localisation (you can add more as needed)
    localisations = {
        "Ariana": 7.158e+04,
        "Ben Arous": 1.601e+04,
        "Bizerte": -8355.8896,
        "Bja": 1.51e+05,
        "Gabs": -1.407e+05,
        "Gafsa": -1.63e+05,
        "Nabeul": 1.187e+05,
        "Sousse": 1.146e+05,
        "Tunis": 7.635e+04,
        "Tozeur": 3.565e+05,
        "Sidi Bouzid": -5.53e+04,
        "Siliana": 1.211e+05,
        "Sfax": -3.256e+04,
        "Monastir": -1.834e+05,
        "Mdenine": -6.167e+04,
        "Mahdia": 1.986e+04,
        "Le Kef": -1.934e+05,
        "La Manouba": -5.263e+04,
        "Kasserine": 5.63e+04,
        "Kairouan": -1.181e+04,
        "Jendouba": -2.138e+04
        
    }

    # Start with the constant
    prix = const

    # Add the effect of each predictor
    prix += coef_superficie * superficie
    prix += coef_sdb * nb_salles_de_bain
    prix += coef_chambres * nb_chambres

    # Handle the type of property
    if type_bien == "Appartement":
        prix += coef_type_appartement
    elif type_bien == "Maison":
        prix += coef_type_maison
    elif type_bien == "Villa":
        prix += coef_type_villa

    # Add the effect of localisation
    if localisation in localisations:
        prix += localisations[localisation]
    else:
        print(f"Localisation {localisation} not recognized, no adjustment made.")

    return prix

# Example usage:
predicted_price = predict_prix(superficie=150, nb_salles_de_bain=2, nb_chambres=3, type_bien="Appartement", localisation="Tunis")
print(f"Predicted price: {predicted_price:.2f} TND")

Predicted price: 341075.79 TND


In [None]:
# Try degree=2 or higher for more complexity
poly = PolynomialFeatures(degree=4, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Step 4: Optionally scale the polynomial features (important for regularization later)
scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

# Step 5: Fit a linear regression model with the polynomial features
model = LinearRegression()
model.fit(X_train_poly_scaled, y_train)

# Step 6: Make predictions
y_train_pred = model.predict(X_train_poly_scaled)
y_test_pred = model.predict(X_test_poly_scaled)

# Step 7: Evaluate the model
print("Training RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("R-squared (Training):", r2_score(y_train, y_train_pred))
print("R-squared (Test):", r2_score(y_test, y_test_pred))

In [3]:
df = pd.read_csv('../Amine/cleaned_tayara_final.csv')
df.isnull().sum()


Localisation         0
Type de bien         0
Superficie           0
Nb_Salles De bain    0
Nb_Chambres          0
Prix                 0
dtype: int64

In [5]:
from sklearn.preprocessing import OneHotEncoder

s = (df.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)
print('No. of. categorical features: ', 
      len(object_cols))

Categorical variables:
['Localisation', 'Type de bien']
No. of. categorical features:  2


In [6]:
from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
OH_cols = pd.DataFrame(OH_encoder.fit_transform(df[object_cols]))
OH_cols.index = df.index
OH_cols.columns = OH_encoder.get_feature_names_out()
df_final = df.drop(object_cols, axis=1)
df_final = pd.concat([df_final, OH_cols], axis=1)


In [7]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X = df_final.drop(['Prix'], axis=1)
Y = df_final['Prix']

# Split the training set into 
# training and validation set
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=0)

In [8]:
#svm
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_percentage_error

model_SVR = svm.SVR()
model_SVR.fit(X_train,Y_train)
Y_pred = model_SVR.predict(X_valid)

print(mean_absolute_percentage_error(Y_valid, Y_pred))

0.6695806433450651


In [9]:
from sklearn.ensemble import RandomForestRegressor

model_RFR = RandomForestRegressor(n_estimators=10)
model_RFR.fit(X_train, Y_train)
Y_pred = model_RFR.predict(X_valid)

mean_absolute_percentage_error(Y_valid, Y_pred)

0.36044972885913956

In [10]:
from sklearn.linear_model import LinearRegression

model_LR = LinearRegression()
model_LR.fit(X_train, Y_train)
Y_pred = model_LR.predict(X_valid)

print(mean_absolute_percentage_error(Y_valid, Y_pred))

0.30578248185297796


In [14]:
# Catboost

from catboost import CatBoostRegressor
from sklearn.metrics import r2_score

cb_model = CatBoostRegressor()
cb_model.fit(X_train, Y_train)
preds = cb_model.predict(X_valid) 

cb_r2_score = r2_score(Y_valid, preds)
cb_r2_score


Learning rate set to 0.041659
0:	learn: 270243.9410242	total: 744us	remaining: 744ms
1:	learn: 264982.3609828	total: 1.64ms	remaining: 817ms
2:	learn: 259962.4897983	total: 2.66ms	remaining: 885ms
3:	learn: 254952.4966659	total: 3.55ms	remaining: 883ms
4:	learn: 250443.9349025	total: 4.63ms	remaining: 921ms
5:	learn: 246095.7541889	total: 5.75ms	remaining: 952ms
6:	learn: 241961.1187710	total: 6.92ms	remaining: 982ms
7:	learn: 238021.6991938	total: 7.88ms	remaining: 978ms
8:	learn: 234499.7793675	total: 8.91ms	remaining: 981ms
9:	learn: 231036.7169649	total: 9.9ms	remaining: 980ms
10:	learn: 227825.6548791	total: 10.9ms	remaining: 981ms
11:	learn: 224908.0447108	total: 11.8ms	remaining: 976ms
12:	learn: 222018.7396072	total: 12.9ms	remaining: 978ms
13:	learn: 219518.8553309	total: 13.9ms	remaining: 978ms
14:	learn: 217033.1874037	total: 14.9ms	remaining: 976ms
15:	learn: 214829.7903227	total: 15.7ms	remaining: 968ms
16:	learn: 212944.7439407	total: 16.3ms	remaining: 943ms
17:	learn: 21

0.5998955700742011

In [17]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

linear_reg = LinearRegression()
linear_reg.fit(X_train_scaled, y_train)

y_pred_linear = linear_reg.predict(X_valid_scaled)

mape_linear = mean_absolute_percentage_error(Y_valid, y_pred_linear)
print(f"MAPE for Multivariate Regression: {mape_linear * 100:.2f}%")


MAPE for Multivariate Regression: 65.49%


In [20]:

for degree in range(2, 5):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_valid_poly = poly.transform(X_valid_scaled)

    poly_reg = LinearRegression()
    poly_reg.fit(X_train_poly, y_train)

    y_pred_poly = poly_reg.predict(X_valid_poly)

    mape_poly = mean_absolute_percentage_error(Y_valid, y_pred_poly)
    print(f"MAPE for Polynomial Regression (degree={degree}): {mape_poly * 100:.2f}%")

MAPE for Polynomial Regression (degree=2): 3015393678487.27%
MAPE for Polynomial Regression (degree=3): 128305429572.22%
MAPE for Polynomial Regression (degree=4): 2780106803269.84%
