# Immoeliza Regression Project

## Data cleaning

### Import Necessary Librairies

In [1465]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

pd.options.display.max_columns = 45            #just extending the number  of columns that can be seen
pd.options.display.max_colwidth = 120  

# pd.options.mode.chained_assignment = None

### Setting the dataset in a variable called "data"

In [1466]:
# load json as a dataframe with pandas
data = pd.read_json("data/final_dataset.json")

In [1467]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118714 entries, 2 to 181792
Data columns (total 32 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Url                118714 non-null  object 
 1   BathroomCount      109112 non-null  float64
 2   BedroomCount       118714 non-null  int64  
 3   ConstructionYear   68898 non-null   float64
 4   Country            118714 non-null  object 
 5   District           118707 non-null  object 
 6   Fireplace          4123 non-null    float64
 7   FloodingZone       58342 non-null   object 
 8   Furnished          30325 non-null   float64
 9   Garden             22503 non-null   float64
 10  GardenArea         22503 non-null   float64
 11  Kitchen            67134 non-null   object 
 12  LivingArea         104539 non-null  float64
 13  Locality           118712 non-null  object 
 14  MonthlyCharges     13650 non-null   float64
 15  NumberOfFacades    76942 non-null   float64
 16  PEB    

In [1468]:
data.groupby('TypeOfProperty')['TypeOfProperty'].agg('count')

TypeOfProperty
1    58234
2    60480
Name: TypeOfProperty, dtype: int64

In [1469]:
data.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Url,118714.0,118714.0,https://www.immoweb.be/en/classified/apartment/for-rent/woluwe-saint-lambert/1200/11370116,1.0,,,,,,,
BathroomCount,109112.0,,,,1.232898,1.092045,0.0,1.0,1.0,1.0,145.0
BedroomCount,118714.0,,,,2.708383,1.855343,0.0,2.0,3.0,3.0,200.0
ConstructionYear,68898.0,,,,1987.345496,47.311922,1753.0,1964.0,2000.0,2023.0,8071.0
Country,118714.0,1.0,Belgium,118714.0,,,,,,,
District,118707.0,43.0,Brussels,14098.0,,,,,,,
Fireplace,4123.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
FloodingZone,58342.0,9.0,NON_FLOOD_ZONE,55998.0,,,,,,,
Furnished,30325.0,,,,0.070041,0.255221,0.0,0.0,0.0,0.0,1.0
Garden,22503.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [1470]:
data.drop_duplicates("PropertyId",inplace=True)
data.drop_duplicates("Url",inplace=True)

In [1471]:
data.update(data[["BathroomCount","Fireplace","Furnished","Garden","GardenArea","SwimmingPool","Terrace","ToiletCount"]].fillna(0))

In [1472]:
sale_data = data[data.TypeOfSale == "residential_sale"]

In [1473]:
sale_data = sale_data.drop(["Url", "MonthlyCharges", "TypeOfSale"], axis=1)
sale_data.dropna(axis=0, inplace=True, subset=['LivingArea'])

In [1474]:
sale_data.drop(data[data.ConstructionYear > 2033].index,inplace=True)

In [1475]:
sale_data.isnull().sum()

BathroomCount            0
BedroomCount             0
ConstructionYear     35549
Country                  0
District                 6
Fireplace                0
FloodingZone         45116
Furnished                0
Garden                   0
GardenArea               0
Kitchen              39728
LivingArea               0
Locality                 2
NumberOfFacades      31952
PEB                  26768
PostalCode               0
Price                    0
PropertyId               0
Province                 6
Region                   6
RoomCount            68313
ShowerCount          48855
StateOfBuilding      29237
SubtypeOfProperty        0
SurfaceOfPlot        46393
SwimmingPool             0
Terrace                  0
ToiletCount              0
TypeOfProperty           0
dtype: int64

In [1476]:
sale_data.drop(sale_data[sale_data.ConstructionYear > 2033].index,inplace=True)
sale_data.drop(sale_data[sale_data.PostalCode < 1000].index,inplace=True)

In [1477]:
sale_data = sale_data.drop_duplicates()

In [1478]:
sale_data.isnull().sum()

BathroomCount            0
BedroomCount             0
ConstructionYear     35549
Country                  0
District                 6
Fireplace                0
FloodingZone         45116
Furnished                0
Garden                   0
GardenArea               0
Kitchen              39728
LivingArea               0
Locality                 2
NumberOfFacades      31952
PEB                  26768
PostalCode               0
Price                    0
PropertyId               0
Province                 6
Region                   6
RoomCount            68313
ShowerCount          48855
StateOfBuilding      29237
SubtypeOfProperty        0
SurfaceOfPlot        46393
SwimmingPool             0
Terrace                  0
ToiletCount              0
TypeOfProperty           0
dtype: int64

In [1479]:
sale_data.groupby('StateOfBuilding')['StateOfBuilding'].agg('count')

StateOfBuilding
AS_NEW            13203
GOOD              35627
JUST_RENOVATED     3707
TO_BE_DONE_UP      5033
TO_RENOVATE        6442
TO_RESTORE          379
Name: StateOfBuilding, dtype: int64

In [1480]:
keep_PEB = ['A++', 'A+', 'B', 'C', 'D', 'E', 'F', 'G']
sale_data = sale_data[sale_data['PEB'].isin(keep_PEB)]

In [1481]:
sale_data["SwimmingPool"] = sale_data["SwimmingPool"].astype(int)
sale_data["Terrace"] = sale_data["Terrace"].astype(int)
sale_data["LivingArea"] = sale_data["LivingArea"].astype(int)


In [1482]:
sale_data.groupby('StateOfBuilding')['StateOfBuilding'].agg('count')

StateOfBuilding
AS_NEW             8710
GOOD              20027
JUST_RENOVATED     3074
TO_BE_DONE_UP      4819
TO_RENOVATE        6037
TO_RESTORE          309
Name: StateOfBuilding, dtype: int64

In [1483]:
X = sale_data[["BedroomCount", "LivingArea", "SwimmingPool", "StateOfBuilding", "Terrace", "PEB", "SubtypeOfProperty"]]
y = pd.DataFrame()
y['Price'] = sale_data['Price'].values


In [1484]:
X.isnull().sum()

BedroomCount             0
LivingArea               0
SwimmingPool             0
StateOfBuilding      11371
Terrace                  0
PEB                      0
SubtypeOfProperty        0
dtype: int64

In [1485]:
sale_data.dropna(subset=['StateOfBuilding'], inplace=True)

In [1486]:
sale_data.groupby('StateOfBuilding')['StateOfBuilding'].agg('count')

StateOfBuilding
AS_NEW             8710
GOOD              20027
JUST_RENOVATED     3074
TO_BE_DONE_UP      4819
TO_RENOVATE        6037
TO_RESTORE          309
Name: StateOfBuilding, dtype: int64

In [1487]:
print(X.dtypes)
print(X.shape)
print(y.shape)
X.head()

BedroomCount          int64
LivingArea            int64
SwimmingPool          int64
StateOfBuilding      object
Terrace               int64
PEB                  object
SubtypeOfProperty    object
dtype: object
(54347, 7)
(54347, 1)


Unnamed: 0,BedroomCount,LivingArea,SwimmingPool,StateOfBuilding,Terrace,PEB,SubtypeOfProperty
6,13,391,0,GOOD,0,D,apartment_block
8,4,111,0,GOOD,0,B,house
11,2,92,0,AS_NEW,1,B,apartment
14,1,50,0,AS_NEW,1,E,apartment
24,2,73,0,GOOD,0,C,apartment


In [1488]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54347 entries, 6 to 181791
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   BedroomCount       54347 non-null  int64 
 1   LivingArea         54347 non-null  int64 
 2   SwimmingPool       54347 non-null  int64 
 3   StateOfBuilding    42976 non-null  object
 4   Terrace            54347 non-null  int64 
 5   PEB                54347 non-null  object
 6   SubtypeOfProperty  54347 non-null  object
dtypes: int64(4), object(3)
memory usage: 3.3+ MB


In [1489]:
print(X.shape)
print(y.shape)

(54347, 7)
(54347, 1)


In [1490]:
# Get dummies for object type
X_columns_name = ['StateOfBuilding', 'SubtypeOfProperty', "PEB"]
X = pd.get_dummies(X, columns=X_columns_name, prefix=X_columns_name)

In [1491]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54347 entries, 6 to 181791
Data columns (total 42 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   BedroomCount                            54347 non-null  int64
 1   LivingArea                              54347 non-null  int64
 2   SwimmingPool                            54347 non-null  int64
 3   Terrace                                 54347 non-null  int64
 4   StateOfBuilding_AS_NEW                  54347 non-null  bool 
 5   StateOfBuilding_GOOD                    54347 non-null  bool 
 6   StateOfBuilding_JUST_RENOVATED          54347 non-null  bool 
 7   StateOfBuilding_TO_BE_DONE_UP           54347 non-null  bool 
 8   StateOfBuilding_TO_RENOVATE             54347 non-null  bool 
 9   StateOfBuilding_TO_RESTORE              54347 non-null  bool 
 10  SubtypeOfProperty_apartment             54347 non-null  bool 
 11  SubtypeOfProperty_a

In [1492]:
print(X.shape)
print(y.shape)

(54347, 42)
(54347, 1)


In [1502]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [1503]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [1504]:
reg = RandomForestRegressor(n_estimators=500, random_state=5)

In [1505]:
reg.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [1506]:
print(f"Score training House : {reg.score(X_train, y_train)}")

Score training House : 0.8990704783627396


In [1507]:
y_pred = reg.predict(X_test)

In [1508]:
print(f"Score test House : {reg.score(X_test, y_test)}")

Score test House : 0.49803310971912806


In [1509]:
new_data = [[4, 302, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]]

In [1510]:
prediction = est.predict(new_data)
print("Prédiction:", prediction[0].round(0))

Prédiction: 1445540.0




In [None]:
####################################

In [884]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [686]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [687]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [688]:
regressor.score(X_train, y_train)

0.3330442728331785

In [689]:
regressor.predict(X_test)

array([[265238.97751025],
       [444963.72423767],
       [663933.49303111],
       ...,
       [557310.89432305],
       [225784.31238027],
       [355466.21010873]])

In [690]:
regressor.score(X_test, y_test)

0.35755638468382545

In [None]:
#################################

In [1416]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [1417]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85, random_state=65)

In [1418]:
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [1419]:
est = GradientBoostingRegressor(n_estimators=1500, max_depth=4, min_samples_split=2, learning_rate=0.5, loss='huber')

In [1420]:
est.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


In [1421]:
print(f"Score training House : {est.score(X_train, y_train)}")

Score training House : 0.7076176893882237


In [1422]:
y_pred = est.predict(X_test)
print(f"Score test House : {est.score(X_test, y_test)}")

Score test House : 0.4623981119654721


In [1423]:
X_train.shape

(46194, 42)

In [1424]:
X_test.shape

(8153, 42)

In [1425]:
X_test.head()

Unnamed: 0,BedroomCount,LivingArea,SwimmingPool,Terrace,StateOfBuilding_AS_NEW,StateOfBuilding_GOOD,StateOfBuilding_JUST_RENOVATED,StateOfBuilding_TO_BE_DONE_UP,StateOfBuilding_TO_RENOVATE,StateOfBuilding_TO_RESTORE,SubtypeOfProperty_apartment,SubtypeOfProperty_apartment_block,SubtypeOfProperty_bungalow,SubtypeOfProperty_castle,SubtypeOfProperty_chalet,SubtypeOfProperty_country_cottage,SubtypeOfProperty_duplex,SubtypeOfProperty_exceptional_property,SubtypeOfProperty_farmhouse,SubtypeOfProperty_flat_studio,SubtypeOfProperty_ground_floor,SubtypeOfProperty_house,SubtypeOfProperty_kot,SubtypeOfProperty_loft,SubtypeOfProperty_manor_house,SubtypeOfProperty_mansion,SubtypeOfProperty_mixed_use_building,SubtypeOfProperty_other_property,SubtypeOfProperty_pavilion,SubtypeOfProperty_penthouse,SubtypeOfProperty_service_flat,SubtypeOfProperty_town_house,SubtypeOfProperty_triplex,SubtypeOfProperty_villa,PEB_A+,PEB_A++,PEB_B,PEB_C,PEB_D,PEB_E,PEB_F,PEB_G
39424,3,160,0,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True
27375,6,370,0,1,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
101926,3,95,0,1,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
83951,5,222,0,1,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
53304,3,225,1,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False


In [1426]:
new_data = [[4, 302, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]]

In [1427]:
prediction = est.predict(new_data)
print("Prédiction:", prediction[0].round(0))

Prédiction: 1445540.0




In [278]:
# data.drop(data[data.BathroomCount > data.BedroomCount].index,inplace=True)
# data.drop(data[data.GardenArea > data.SurfaceOfPlot].index,inplace=True)
# data.drop(data[data.PostalCode < 1000].index,inplace=True)
# data.drop(data[data.NumberOfFacades > 4].index,inplace=True)
# data.drop(data[data.Price > 15000000].index,inplace=True)
# data.drop(data[data.ToiletCount > 58].index,inplace=True)
# data.drop(data[data.ShowerCount > 58].index,inplace=True)
# data.drop(data[data.LivingArea > 8800].index, inplace=True)

In [279]:
data.drop(data[data.ConstructionYear > 2033].index,inplace=True)
data.drop(data[data.TypeOfSale == "annuity_monthly_amount"].index,inplace=True)
data.drop(data[data.TypeOfSale == "annuity_without_lump_sum"].index,inplace=True)
data.drop(data[data.TypeOfSale == "annuity_lump_sum"].index,inplace=True)
data.drop(data[data.TypeOfSale == "homes_to_build"].index,inplace=True)
data.drop(data[data.PostalCode < 1000].index,inplace=True)

In [281]:
data_test = sale_data[["LivingArea", "Price", "PEB", "BedroomCount"]]

In [282]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 104946 entries, 2 to 181791
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   LivingArea    93628 non-null   float64
 1   Price         104946 non-null  int64  
 2   PEB           74404 non-null   object 
 3   BedroomCount  104946 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 4.0+ MB


In [283]:
data_test.dropna(axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test.dropna(axis=0, inplace=True)


In [284]:
keep_PEB = ['A++', 'A+', 'B', 'C', 'D', 'E', 'F', 'G']
data_test = data_test[data_test['PEB'].isin(keep_PEB)]
data_test.groupby('PEB')['PEB'].agg('count')

PEB
A+      1685
A++      544
B      13779
C      11347
D       9196
E       6137
F       7687
G       3972
Name: PEB, dtype: int64

In [285]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")
ohetransform = ohe.fit_transform(data_test[['PEB']])
data_test = pd.concat([data_test,ohetransform],axis=1).drop(columns=["PEB"])
data_test.head()

Unnamed: 0,LivingArea,Price,BedroomCount,PEB_A+,PEB_A++,PEB_B,PEB_C,PEB_D,PEB_E,PEB_F,PEB_G
6,391.0,765000,13,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,111.0,399000,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11,92.0,198000,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
14,50.0,215000,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
24,73.0,360000,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [286]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54347 entries, 6 to 181791
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LivingArea    54347 non-null  float64
 1   Price         54347 non-null  int64  
 2   BedroomCount  54347 non-null  int64  
 3   PEB_A+        54347 non-null  float64
 4   PEB_A++       54347 non-null  float64
 5   PEB_B         54347 non-null  float64
 6   PEB_C         54347 non-null  float64
 7   PEB_D         54347 non-null  float64
 8   PEB_E         54347 non-null  float64
 9   PEB_F         54347 non-null  float64
 10  PEB_G         54347 non-null  float64
dtypes: float64(9), int64(2)
memory usage: 5.0 MB


In [287]:
data_test = data_test.interpolate(method='linear')

In [288]:
columns_to_convert = ['LivingArea', 'PEB_A+', 'PEB_A++', 'PEB_B', 'PEB_C', 'PEB_D', 'PEB_E', 'PEB_F', 'PEB_G']
for column in columns_to_convert:
    data_test[column] = data_test[column].astype(int)

data_test.head()

Unnamed: 0,LivingArea,Price,BedroomCount,PEB_A+,PEB_A++,PEB_B,PEB_C,PEB_D,PEB_E,PEB_F,PEB_G
6,391,765000,13,0,0,0,0,1,0,0,0
8,111,399000,4,0,0,1,0,0,0,0,0
11,92,198000,2,0,0,1,0,0,0,0,0
14,50,215000,1,0,0,0,0,0,1,0,0
24,73,360000,2,0,0,0,1,0,0,0,0


In [289]:
y = data_test['Price'].values
X = data_test[['LivingArea', 'PEB_A++', 'PEB_A+', 'PEB_B', 'PEB_C', 'PEB_D', 'PEB_E', 'PEB_F', 'PEB_G']].values

In [290]:
print(type(X))
print(type(y))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [291]:
from sklearn.model_selection import  train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

In [292]:
X_train

array([[118,   0,   0, ...,   1,   0,   0],
       [165,   0,   0, ...,   0,   0,   0],
       [ 49,   0,   0, ...,   0,   0,   0],
       ...,
       [157,   0,   0, ...,   0,   0,   1],
       [175,   0,   0, ...,   0,   1,   0],
       [178,   0,   0, ...,   0,   0,   0]])

In [293]:
X_test

array([[158,   0,   0, ...,   1,   0,   0],
       [121,   0,   0, ...,   0,   1,   0],
       [476,   0,   0, ...,   1,   0,   0],
       ...,
       [255,   0,   0, ...,   0,   0,   0],
       [ 87,   0,   0, ...,   0,   0,   0],
       [100,   0,   0, ...,   1,   0,   0]])

In [294]:
print(X_train.shape)

(43477, 9)


In [295]:
print(y_train.shape)

(43477,)


In [296]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [297]:
regressor.score(X_train, y_train)

0.27204152825427685

In [298]:
regressor.predict(X_test)

array([345644.8455818 , 253464.34554134, 828675.34577058, ...,
       532181.02780531, 397368.6618036 , 257544.94303165])

In [299]:
regressor.score(X_test, y_test)

-0.4771349267485534

In [None]:
data.groupby('TypeOfSale')[['TypeOfSale','MonthlyCharges']].agg('count')

In [None]:
data.groupby('PostalCode')[['PostalCode','MonthlyCharges']].agg('count')

In [None]:
sale_data = data[data.TypeOfSale == "residential_sale"]

In [None]:
data_test = sale_data[["LivingArea", "Price", "PEB"]]

In [None]:
data_test.groupby('PEB')['PEB'].agg('count')

In [None]:
data_test['Price'].sort_values()

In [None]:
data_test = data_test.interpolate(method='linear')

In [None]:
data_test['PEB'].unique()