In [665]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor


In [666]:
df = pd.read_csv("./data/clean_data_with_region.csv")

In [667]:
y = df[['price']]
X = df.drop(columns=['price'])
X.shape

(9393, 24)

In [668]:
# delete all columns with more than 30% missing values
for column in X:
    if X[column].isnull().sum(axis = 0) > len(X) * 0.3:
        X = X.drop(columns=[column])

In [669]:
# delete non relevant columns

X = X.drop(columns=['Property ID', 'Locality name', 'Energy class', 'region'])

In [670]:
X.columns

Index(['Postal code', 'Type of property', 'Construction year',
       'Number of rooms', 'Living area', 'kitchen', 'State of builing',
       'Primary energy consumption', 'Heating type', 'Double glazing'],
      dtype='object')

In [671]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Postal code                 9393 non-null   int64  
 1   Type of property            9393 non-null   object 
 2   Construction year           6710 non-null   float64
 3   Number of rooms             9318 non-null   float64
 4   Living area                 8955 non-null   float64
 5   kitchen                     7393 non-null   float64
 6   State of builing            7792 non-null   object 
 7   Primary energy consumption  7877 non-null   float64
 8   Heating type                6753 non-null   object 
 9   Double glazing              7473 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 734.0+ KB


In [672]:
# Impute missing data
imputer_most_frequent = SimpleImputer(strategy='most_frequent')
columns_to_impute_most_frequent = ['Number of rooms', 'kitchen', 'State of builing', 'Heating type', 'Double glazing']
imputer_mean = SimpleImputer(strategy='mean')
columns_to_impute_mean = ['Construction year', 'Living area', 'Primary energy consumption']

X[columns_to_impute_most_frequent] = imputer_most_frequent.fit_transform(X[columns_to_impute_most_frequent])
X[columns_to_impute_mean] = imputer_mean.fit_transform(X[columns_to_impute_mean])

In [673]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Postal code                 9393 non-null   int64  
 1   Type of property            9393 non-null   object 
 2   Construction year           9393 non-null   float64
 3   Number of rooms             9393 non-null   object 
 4   Living area                 9393 non-null   float64
 5   kitchen                     9393 non-null   object 
 6   State of builing            9393 non-null   object 
 7   Primary energy consumption  9393 non-null   float64
 8   Heating type                9393 non-null   object 
 9   Double glazing              9393 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 734.0+ KB


In [674]:
# transform the State of building column to ordinal data
categorie = [['To restore', 'To renovate', 'To be done up', 'Good', 'Just renovated', 'As new']]

encoder = OrdinalEncoder(categories=categorie)
X['State_encoded'] = encoder.fit_transform(X[['State of builing']])

X = X.drop(columns=['State of builing'])

In [675]:
# create encoder object
enc = OneHotEncoder(sparse_output=False, drop='first').set_output(transform="pandas")

# apply fit method to the data frame

encoded_data = enc.fit_transform(X[['Heating type', 'Type of property']])

X = pd.concat([X.drop(columns=['Heating type', 'Type of property']).reset_index(drop=True), encoded_data.reset_index(drop=True)], axis=1)

In [676]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9393 entries, 0 to 9392
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Postal code                 9393 non-null   int64  
 1   Construction year           9393 non-null   float64
 2   Number of rooms             9393 non-null   object 
 3   Living area                 9393 non-null   float64
 4   kitchen                     9393 non-null   object 
 5   Primary energy consumption  9393 non-null   float64
 6   Double glazing              9393 non-null   object 
 7   State_encoded               9393 non-null   float64
 8   Heating type_Electric       9393 non-null   float64
 9   Heating type_Fuel oil       9393 non-null   float64
 10  Heating type_Gas            9393 non-null   float64
 11  Heating type_Pellet         9393 non-null   float64
 12  Heating type_Solar          9393 non-null   float64
 13  Heating type_Wood           9393 

In [677]:
X.columns

Index(['Postal code', 'Construction year', 'Number of rooms', 'Living area',
       'kitchen', 'Primary energy consumption', 'Double glazing',
       'State_encoded', 'Heating type_Electric', 'Heating type_Fuel oil',
       'Heating type_Gas', 'Heating type_Pellet', 'Heating type_Solar',
       'Heating type_Wood', 'Type of property_house'],
      dtype='object')

In [678]:
'''X = X.drop(columns=['Heating type_Electric', 'Heating type_Fuel oil',
       'Heating type_Gas', 'Heating type_Pellet', 'Heating type_Solar',
       'Heating type_Wood','Double glazing','kitchen'])'''

"X = X.drop(columns=['Heating type_Electric', 'Heating type_Fuel oil',\n       'Heating type_Gas', 'Heating type_Pellet', 'Heating type_Solar',\n       'Heating type_Wood','Double glazing','kitchen'])"

In [679]:
X['Number of rooms'] = X['Number of rooms'].astype('int')

In [680]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

In [681]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [682]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# train and test the linear model
train_score_linear = regressor.score(X_train, y_train)
test_score_linear = regressor.score(X_test, y_test)
print("LinearRegression Training Score:", train_score_linear)
print("LinearRegression Test Score:", test_score_linear)

y_pred_linear = regressor.predict(X_test)

mae_linear = mean_absolute_error(y_test, y_pred_linear)
print("Mean Absolute Error on Test Set (MAE):", mae_linear)

LinearRegression Training Score: 0.2390977323527772
LinearRegression Test Score: 0.23778081714711485
Mean Absolute Error on Test Set (MAE): 148245.9947995116


In [684]:
# Initialize the RandomForestRegressor model
rf_model = RandomForestRegressor(
    n_estimators=100,     # Number of trees in the forest
    max_depth=10,         # Maximum depth of each tree
    random_state=42
)

# Train the model
rf_model.fit(X_train, y_train)

# Evaluate the model
train_score = rf_model.score(X_train, y_train)
test_score = rf_model.score(X_test, y_test)
print("Random Forest Training Score:", train_score)
print("Random Forest Test Score:", test_score)

# Make predictions using the trained model
y_pred = rf_model.predict(X_test)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error on Test Set (MAE):", mae)

  return fit_method(estimator, *args, **kwargs)


Random Forest Training Score: 0.895983276848868
Random Forest Test Score: 0.6568423654702393
Mean Absolute Error on Test Set (MAE): 92252.37020141377


In [685]:
# Initialize the XGBRegressor model
xgb_model = XGBRegressor(
    objective='reg:squarederror',  # Use square error for regression
    n_estimators=100,              # Number of boosting rounds
    learning_rate=0.2,             # Learning rate
    max_depth=4,                   # Maximum depth of a tree
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Evaluate the model
train_score = xgb_model.score(X_train, y_train)
test_score = xgb_model.score(X_test, y_test)
print("XGBoost Training Score:", train_score)
print("XGBoost Test Score:", test_score)

# Make predictions using the trained model
y_pred_xgb = xgb_model.predict(X_test)

# Calculate Mean Absolute Error
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print("Mean Absolute Error on Test Set (MAE):", mae_xgb)

XGBoost Training Score: 0.8484861254692078
XGBoost Test Score: 0.7346958518028259
Mean Absolute Error on Test Set (MAE): 87485.03473423363
