In [492]:
import pandas as pd
import numpy as np
import statistics

import seaborn as sns
from matplotlib import pyplot as plt

from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import metrics

In [493]:
df = pd.read_csv('Property_structured_data.csv')

In [494]:
initial_df = df.shape

In [495]:
initial_df_columns = df.columns.to_list()

## Clean Database

### Price (float)

In [496]:
z_scores_price = stats.zscore(df['price'])
abs_z_scores_price = np.abs(z_scores_price) 
filtered_entries_price = (abs_z_scores_price < 3)
df = df[filtered_entries_price]

In [497]:
df.loc[df["price"] >= 800000, "type_of_property"] = "OTHER"

In [498]:
df= df.loc[df['price'] != -1]
df.shape

(58318, 23)

### Surface (float)

In [499]:
df = df.loc[df['surface'] <= 800]

In [500]:
df = df.loc[df['surface'] >= 35]

In [501]:
df = df.loc[df['surface'] != -1]
df.shape

(47820, 23)

### Sub Type of property (filter and drop)

In [502]:
df['subtype_of_property'].value_counts()

HOUSE                   20431
APARTMENT               15619
VILLA                    2700
APARTMENT_BLOCK          1318
DUPLEX                   1217
GROUND_FLOOR             1131
PENTHOUSE                 967
MIXED_USE_BUILDING        925
EXCEPTIONAL_PROPERTY      511
SERVICE_FLAT              471
FLAT_STUDIO               450
MANSION                   437
TOWN_HOUSE                375
COUNTRY_COTTAGE           292
BUNGALOW                  239
LOFT                      225
FARMHOUSE                 147
TRIPLEX                   114
CHALET                    101
OTHER_PROPERTY             65
MANOR_HOUSE                61
CASTLE                     15
KOT                         8
PAVILION                    1
Name: subtype_of_property, dtype: int64

In [503]:
df= df.loc[df['subtype_of_property'] != 'APARTMENT_BLOCK']
df.shape

(46502, 23)

In [504]:
df= df.loc[df['subtype_of_property'] != 'MIXED_USE_BUILDING']
df.shape

(45577, 23)

### Number of bedrooms (int)

In [505]:
df = df.loc[df['number_of_bedrooms'] <20]


In [506]:
df['number_of_bedrooms'] = df['number_of_bedrooms'].replace(-1,0)

In [507]:
df['number_of_bedrooms'].value_counts()

3     15098
2     14663
4      6745
1      4666
5      2450
6       873
0       538
7       261
8       129
9        75
10       31
11       19
12        7
13        5
14        4
17        3
16        2
15        2
18        1
Name: number_of_bedrooms, dtype: int64

### Zip Code (category)

#### XX

In [508]:
df['zip_code_xx'] = df['postal_code'].apply(lambda x : 'be_zip_'+str(x)[:2])

In [509]:
df['zip_code_xx'].value_counts()

be_zip_10    3312
be_zip_11    1497
be_zip_83    1427
be_zip_20    1248
be_zip_90    1245
             ... 
be_zip_65     159
be_zip_79     131
be_zip_76     109
be_zip_47      84
be_zip_64      51
Name: zip_code_xx, Length: 80, dtype: int64

#### X

In [510]:
df['zip_code_x'] = df['postal_code'].apply(lambda x : 'be_zip_'+str(x)[:1])

In [511]:
df['zip_code_x'].value_counts()

be_zip_1    9529
be_zip_2    7588
be_zip_8    6725
be_zip_9    6238
be_zip_3    3986
be_zip_4    3683
be_zip_7    3066
be_zip_6    3037
be_zip_5    1720
Name: zip_code_x, dtype: int64

### Land surface (float)

In [512]:
#df.loc[(df['land_surface'] == -1) & (df['garden_surface'] > 0 )]

In [513]:
df['land_surface'] = [land_surface if land_surface != -1 else garden_surface if garden_surface > 0 else land_surface for garden_surface,land_surface in zip(df['garden_surface'],df['land_surface'])]

In [514]:
df['land_surface'] = df['land_surface'].replace(-1,0)

In [515]:
df['land_surface'].value_counts()

0       21018
100       206
200       188
300       158
160       152
        ...  
1332        1
3040        1
1894        1
1674        1
4233        1
Name: land_surface, Length: 3305, dtype: int64

### Garden (0,1)

In [516]:
df['garden'] = df['garden'].replace(-1,0)

In [517]:
df['garden'].value_counts()

0    29837
1    15735
Name: garden, dtype: int64

### Garden surface (float)

In [518]:
df['garden_surface'] = df['garden_surface'].replace(-1,0)
df['garden_surface'] = df['garden_surface'].replace(1,0)

In [519]:
df['garden_surface'].value_counts()

0        35949
100        299
200        265
50         194
300        192
         ...  
8782         1
2267         1
586          1
10997        1
7200         1
Name: garden_surface, Length: 1349, dtype: int64

### Fully equiped kitchen  (0,1)

In [520]:
df["fully_equipped_kitchen"] = df["fully_equipped_kitchen"].map({"-1.0": 0.25, "1.0": 1,"-1": 0.25, "1": 1, "INSTALLED": 0.5, "SEMI_EQUIPPED": 0.75, "NOT_INSTALLED": 0, "USA_INSTALLED": 0.5, "USA_SEMI_EQUIPPED": 0.75, "USA_UNINSTALLED": 0})

In [521]:
#df["fully_equipped_kitchen"] = df["fully_equipped_kitchen"].map({"-1.0": 0, "1.0": 1,"-1": 0, "1": 1, "INSTALLED": 0, "SEMI_EQUIPPED": 1, "NOT_INSTALLED": 0, "USA_INSTALLED": 0, "USA_SEMI_EQUIPPED": 1, "USA_UNINSTALLED": 0})

In [522]:
df["fully_equipped_kitchen"].value_counts()

0.50    16744
0.25    14836
1.00     7877
0.75     4302
0.00     1813
Name: fully_equipped_kitchen, dtype: int64

### Swiming pool  (0,1)

In [523]:
df['swimming_pool'] = df['swimming_pool'].replace(-1,0)

In [524]:
df['swimming_pool'].value_counts()

0    44641
1      931
Name: swimming_pool, dtype: int64

### Furnished (0,1)

In [525]:
df['furnished'] = df['furnished'].replace(-1,0)

In [526]:
df['furnished'].value_counts()

0    44489
1     1083
Name: furnished, dtype: int64

### Open fire (0,1)

In [527]:
df['open_fire'] = df['open_fire'].replace(-1,0)

In [528]:
df['open_fire'].value_counts()

0    43505
1     2067
Name: open_fire, dtype: int64

### Terrace (0,1)

In [529]:
df['terrace'] = df['terrace'].replace(-1,0)

In [530]:
df['terrace'].value_counts()

1    29162
0    16410
Name: terrace, dtype: int64

### Terrace surface (float)

we have two terraces > 1000 m2 !

In [531]:
df = df.loc[df['terrace_surface'] < 500]

In [532]:
df['terrace_surface'] = df['terrace_surface'].replace(-1,0)

In [533]:
df['terrace_surface'].value_counts()

0      26370
20      1191
10      1082
15       939
12       866
       ...  
163        1
137        1
190        1
293        1
290        1
Name: terrace_surface, Length: 183, dtype: int64

### Facades (int)

In [534]:
df = df.loc[df["number_of_facades"] < 9]

In [535]:
df["number_of_facades"] = np.where((df["number_of_facades"] == -1) & (df["type_of_property"] == "APARTMENT"), 1, df["number_of_facades"])
df["number_of_facades"] = np.where((df["number_of_facades"] == -1) & (df["type_of_property"] == "HOUSE"), 2, df["number_of_facades"])

In [536]:
df = df.loc[df["number_of_facades"] != -1]

In [537]:
df['number_of_facades'].value_counts()

2    20089
4     9229
3     8114
1     7598
6        6
5        6
8        3
Name: number_of_facades, dtype: int64

### State of the building (category)

Tranform this part to category

In [538]:
"""
df["state_of_the_building"] = df["state_of_the_building"].map({
    "NO_INFO": "TO_RENOVATE",#0.25, 
    "TO_BE_DONE_UP": "JUST_RENOVATED",#0.75, 
    "TO_RENOVATE": "TO_RENOVATE",#0.25, 
    "TO_RESTORE": "TO_REBUILD",#0.25, 
    "JUST_RENOVATED": "JUST_RENOVATED",#0.75, 
    "GOOD": "GOOD",#0.5, 
    "AS_NEW": "NEW"#1.0
})
"""

'\ndf["state_of_the_building"] = df["state_of_the_building"].map({\n    "NO_INFO": "TO_RENOVATE",#0.25, \n    "TO_BE_DONE_UP": "JUST_RENOVATED",#0.75, \n    "TO_RENOVATE": "TO_RENOVATE",#0.25, \n    "TO_RESTORE": "TO_REBUILD",#0.25, \n    "JUST_RENOVATED": "JUST_RENOVATED",#0.75, \n    "GOOD": "GOOD",#0.5, \n    "AS_NEW": "NEW"#1.0\n})\n'

In [539]:
df["state_of_the_building"] = df["state_of_the_building"].map({
    "NO_INFO": 0.25, 
    "TO_BE_DONE_UP": 0.75, 
    "TO_RENOVATE": 0.25, 
    "TO_RESTORE": 0.25, 
    "JUST_RENOVATED": 0.75, 
    "GOOD": 0.5, 
    "AS_NEW": 1.0
})

In [540]:
df["state_of_the_building"].value_counts()

0.25    16219
0.50    12399
1.00    10441
0.75     5986
Name: state_of_the_building, dtype: int64

### Type of property (category)

Separate into HOUSE, APARTMENTS and OTHERS

In [541]:
others = ["CHALET", "MANOR_HOUSE", "OTHER_PROPERTY", "CASTLE", "PAVILION"]

In [542]:
df.loc[df["price"] >= 800000, "type_of_property"] = "OTHER"

In [543]:
df['type_of_property'] = np.where((df['number_of_bedrooms'] > 20), "OTHER", df['type_of_property'])

In [544]:
df.loc[df["subtype_of_property"].isin(others), "type_of_property"] = "OTHER"

In [545]:
df['type_of_property'].value_counts()

HOUSE        23501
APARTMENT    19429
OTHER         2115
Name: type_of_property, dtype: int64

In [546]:
df_other = df.loc[df["type_of_property"] == "OTHER"]

In [547]:
df= df.loc[df["type_of_property"] != "OTHER"]

In [548]:
df.describe()

Unnamed: 0,id,postal_code,price,number_of_bedrooms,surface,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_surface,garden,garden_surface,land_surface,number_of_facades,swimming_pool,state_of_the_building
count,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0,42930.0
mean,9973246.0,5056.555858,326038.838714,2.736921,147.532541,0.502615,0.022246,0.039646,0.635127,9.524482,0.340997,110.448241,460.7907,2.378989,0.013557,0.554251
std,263276.9,3030.481285,150093.338928,1.224795,79.47042,0.278007,0.147483,0.195128,0.4814,18.933273,0.47405,1105.3801,5974.487,0.981126,0.115644,0.29061
min,1882546.0,1000.0,2500.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.25
25%,9942106.0,2180.0,220000.0,2.0,93.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.25
50%,10058090.0,4420.0,295000.0,3.0,130.0,0.5,0.0,0.0,1.0,0.0,0.0,0.0,61.0,2.0,0.0,0.5
75%,10114480.0,8380.0,399000.0,3.0,180.0,0.75,0.0,0.0,1.0,13.0,1.0,0.0,418.0,3.0,0.0,0.75
max,10151150.0,9992.0,799950.0,17.0,800.0,1.0,1.0,1.0,1.0,420.0,1.0,100000.0,1100000.0,6.0,1.0,1.0


### Price/m2

In [549]:
df['price_m2'] = df['price']/df['surface']

In [550]:
df['price_m2'].value_counts()

2500.000000    258
2000.000000    165
3000.000000    160
2750.000000    106
1500.000000    106
              ... 
2160.194175      1
1447.058824      1
2878.205128      1
2852.459016      1
1705.038760      1
Name: price_m2, Length: 18631, dtype: int64

# Crazy zipcode !!!

Values

In [551]:
len(df['zip_code_xx'].value_counts())

80

In [552]:
df_zip_list = ['price_m2','zip_code_xx']

In [553]:
df_zips = df[df_zip_list]

In [554]:
xxx_zip = df_zips.groupby('zip_code_xx')

In [555]:
xxx_zip_list = []
for key, values in xxx_zip:
    xxx_zip_list.append(key)

In [556]:
df_zips_mean = round(df_zips.groupby('zip_code_xx').mean(),2)

In [557]:
df_zips_mean_values = df_zips_mean.values

In [558]:
zip_mean = []
for x in df_zips_mean_values:
    for i in x:
        zip_mean.append(i)

In [559]:
len(zip_mean)

80

In [560]:
global_mean = statistics.mean(zip_mean)

In [561]:
xxx = []
for y,i in enumerate(zip_mean):
    xxx.append(round(i/global_mean,2))

In [562]:
xxx

[1.45,
 1.58,
 1.56,
 1.22,
 1.11,
 1.17,
 1.23,
 1.16,
 1.16,
 1.41,
 1.44,
 1.11,
 1.07,
 1.05,
 0.97,
 1.17,
 1.2,
 1.16,
 1.23,
 1.5,
 1.11,
 1.04,
 1.01,
 0.83,
 1.07,
 0.95,
 0.85,
 0.89,
 0.94,
 0.88,
 0.8,
 0.81,
 0.82,
 0.76,
 0.72,
 0.89,
 0.93,
 0.8,
 0.89,
 0.92,
 0.94,
 0.82,
 0.72,
 0.63,
 0.6,
 0.7,
 0.74,
 0.62,
 0.63,
 0.86,
 0.91,
 0.79,
 0.79,
 0.75,
 0.65,
 0.55,
 0.81,
 0.62,
 0.75,
 0.86,
 0.63,
 1.26,
 1.22,
 1.98,
 1.35,
 1.0,
 1.52,
 1.1,
 0.92,
 0.9,
 1.38,
 1.06,
 1.05,
 0.98,
 0.94,
 0.91,
 0.88,
 1.05,
 1.2,
 1.1]

Send this to the data frame as a column

In [563]:
dic_zip_value = dict()
for i,x in enumerate(xxx_zip_list):
    dic_zip_value[x] = xxx[i]

In [564]:
dic_zip_value

{'be_zip_10': 1.45,
 'be_zip_11': 1.58,
 'be_zip_12': 1.56,
 'be_zip_13': 1.22,
 'be_zip_14': 1.11,
 'be_zip_15': 1.17,
 'be_zip_16': 1.23,
 'be_zip_17': 1.16,
 'be_zip_18': 1.16,
 'be_zip_19': 1.41,
 'be_zip_20': 1.44,
 'be_zip_21': 1.11,
 'be_zip_22': 1.07,
 'be_zip_23': 1.05,
 'be_zip_24': 0.97,
 'be_zip_25': 1.17,
 'be_zip_26': 1.2,
 'be_zip_28': 1.16,
 'be_zip_29': 1.23,
 'be_zip_30': 1.5,
 'be_zip_31': 1.11,
 'be_zip_32': 1.04,
 'be_zip_33': 1.01,
 'be_zip_34': 0.83,
 'be_zip_35': 1.07,
 'be_zip_36': 0.95,
 'be_zip_37': 0.85,
 'be_zip_38': 0.89,
 'be_zip_39': 0.94,
 'be_zip_40': 0.88,
 'be_zip_41': 0.8,
 'be_zip_42': 0.81,
 'be_zip_43': 0.82,
 'be_zip_44': 0.76,
 'be_zip_45': 0.72,
 'be_zip_46': 0.89,
 'be_zip_47': 0.93,
 'be_zip_48': 0.8,
 'be_zip_49': 0.89,
 'be_zip_50': 0.92,
 'be_zip_51': 0.94,
 'be_zip_53': 0.82,
 'be_zip_55': 0.72,
 'be_zip_56': 0.63,
 'be_zip_60': 0.6,
 'be_zip_61': 0.7,
 'be_zip_62': 0.74,
 'be_zip_64': 0.62,
 'be_zip_65': 0.63,
 'be_zip_66': 0.86,
 'be_z

In [565]:
df['zip_code_ponderation'] = df['zip_code_xx']

In [566]:
df

Unnamed: 0,id,locality,postal_code,region,province,type_of_property,subtype_of_property,type_of_sale,price,number_of_bedrooms,...,garden,garden_surface,land_surface,number_of_facades,swimming_pool,state_of_the_building,zip_code_xx,zip_code_x,price_m2,zip_code_ponderation
0,10131114,HEURE,5377,Wallonie,Namur,HOUSE,HOUSE,residential_sale,100000,2,...,0,0,198,3,0,0.25,be_zip_53,be_zip_5,666.666667,be_zip_53
1,10150865,Dinant,5500,Wallonie,Namur,APARTMENT,APARTMENT,residential_sale,219000,2,...,0,0,0,3,0,0.50,be_zip_55,be_zip_5,2190.000000,be_zip_55
3,10022778,Gembloux,5030,Wallonie,Namur,APARTMENT,APARTMENT,residential_sale,285000,2,...,0,0,0,2,0,1.00,be_zip_50,be_zip_5,3518.518519,be_zip_50
4,9989192,Sambreville,5060,Wallonie,Namur,APARTMENT,APARTMENT,residential_sale,284000,2,...,0,0,0,3,0,1.00,be_zip_50,be_zip_5,1893.333333,be_zip_50
5,9951165,Profondeville,5170,Wallonie,Namur,HOUSE,HOUSE,residential_sale,179000,2,...,1,0,1013,4,0,0.25,be_zip_51,be_zip_5,1193.333333,be_zip_51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62422,10121854,Lokeren,9160,Flanders,East Flanders,HOUSE,HOUSE,residential_sale,439900,4,...,1,0,969,4,0,0.75,be_zip_91,be_zip_9,1705.038760,be_zip_91
62424,10117346,Gent Sint-Amandsberg,9040,Flanders,East Flanders,HOUSE,HOUSE,residential_sale,425000,4,...,1,12,80,2,0,1.00,be_zip_90,be_zip_9,3148.148148,be_zip_90
62427,10103172,Denderleeuw,9470,Flanders,East Flanders,HOUSE,HOUSE,residential_sale,235000,1,...,1,55,170,2,0,1.00,be_zip_94,be_zip_9,1382.352941,be_zip_94
62428,9617011,Aalst,9300,Flanders,East Flanders,APARTMENT,APARTMENT,residential_sale,316200,2,...,0,0,0,1,0,1.00,be_zip_93,be_zip_9,3100.000000,be_zip_93


In [567]:
df['zip_code_ponderation'] = df['zip_code_ponderation'].map(dic_zip_value)

In [568]:
df

Unnamed: 0,id,locality,postal_code,region,province,type_of_property,subtype_of_property,type_of_sale,price,number_of_bedrooms,...,garden,garden_surface,land_surface,number_of_facades,swimming_pool,state_of_the_building,zip_code_xx,zip_code_x,price_m2,zip_code_ponderation
0,10131114,HEURE,5377,Wallonie,Namur,HOUSE,HOUSE,residential_sale,100000,2,...,0,0,198,3,0,0.25,be_zip_53,be_zip_5,666.666667,0.82
1,10150865,Dinant,5500,Wallonie,Namur,APARTMENT,APARTMENT,residential_sale,219000,2,...,0,0,0,3,0,0.50,be_zip_55,be_zip_5,2190.000000,0.72
3,10022778,Gembloux,5030,Wallonie,Namur,APARTMENT,APARTMENT,residential_sale,285000,2,...,0,0,0,2,0,1.00,be_zip_50,be_zip_5,3518.518519,0.92
4,9989192,Sambreville,5060,Wallonie,Namur,APARTMENT,APARTMENT,residential_sale,284000,2,...,0,0,0,3,0,1.00,be_zip_50,be_zip_5,1893.333333,0.92
5,9951165,Profondeville,5170,Wallonie,Namur,HOUSE,HOUSE,residential_sale,179000,2,...,1,0,1013,4,0,0.25,be_zip_51,be_zip_5,1193.333333,0.94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62422,10121854,Lokeren,9160,Flanders,East Flanders,HOUSE,HOUSE,residential_sale,439900,4,...,1,0,969,4,0,0.75,be_zip_91,be_zip_9,1705.038760,1.06
62424,10117346,Gent Sint-Amandsberg,9040,Flanders,East Flanders,HOUSE,HOUSE,residential_sale,425000,4,...,1,12,80,2,0,1.00,be_zip_90,be_zip_9,3148.148148,1.38
62427,10103172,Denderleeuw,9470,Flanders,East Flanders,HOUSE,HOUSE,residential_sale,235000,1,...,1,55,170,2,0,1.00,be_zip_94,be_zip_9,1382.352941,0.94
62428,9617011,Aalst,9300,Flanders,East Flanders,APARTMENT,APARTMENT,residential_sale,316200,2,...,0,0,0,1,0,1.00,be_zip_93,be_zip_9,3100.000000,0.98


### API list

In [569]:
df= df.loc[df["type_of_property"] == "HOUSE"]

In [570]:
api_list = ['surface','number_of_bedrooms','postal_code','land_surface','garden','garden_surface','fully_equipped_kitchen','swimming_pool','furnished','open_fire','terrace','terrace_surface','number_of_facades','state_of_the_building','zip_code_ponderation','price']

In [571]:
df = df[api_list]

In [572]:
claned_df = df.shape

In [573]:
df

Unnamed: 0,surface,number_of_bedrooms,postal_code,land_surface,garden,garden_surface,fully_equipped_kitchen,swimming_pool,furnished,open_fire,terrace,terrace_surface,number_of_facades,state_of_the_building,zip_code_ponderation,price
0,150,2,5377,198,0,0,0.00,0,0,0,1,0,3,0.25,0.82,100000
5,150,2,5170,1013,1,0,0.00,0,0,0,0,0,4,0.25,0.94,179000
6,510,5,5651,4137,0,0,0.25,1,0,0,0,0,3,0.25,0.63,275000
8,386,3,5543,938,1,757,0.25,0,0,0,1,130,2,0.25,0.72,249000
10,245,4,5060,632,1,632,0.50,0,0,0,1,95,2,0.50,0.92,260000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62420,595,3,9000,332,1,41,0.50,0,0,0,1,41,2,0.50,1.38,725000
62422,258,4,9160,969,1,0,0.25,0,0,0,1,0,4,0.75,1.06,439900
62424,135,4,9040,80,1,12,0.50,0,0,0,1,7,2,1.00,1.38,425000
62427,170,1,9470,170,1,55,0.50,0,0,0,1,30,2,1.00,0.94,235000


## Check database

In [574]:
print(f'initial_df {initial_df} claned_df {claned_df}')

initial_df (62430, 23) claned_df (23501, 16)


In [575]:
df.describe()

Unnamed: 0,surface,number_of_bedrooms,postal_code,land_surface,garden,garden_surface,fully_equipped_kitchen,swimming_pool,furnished,open_fire,terrace,terrace_surface,number_of_facades,state_of_the_building,zip_code_ponderation,price
count,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0,23501.0
mean,186.317221,3.347262,5378.0517,809.3251,0.505851,169.344666,0.483618,0.020042,0.011191,0.061104,0.566742,9.275988,2.782307,0.523573,1.013091,346599.499255
std,81.7648,1.173955,2828.775426,7958.722,0.499976,792.836031,0.268502,0.140146,0.105196,0.239526,0.495536,20.80776,0.84542,0.27131,0.259503,159645.741694
min,35.0,0.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.55,2500.0
25%,135.0,3.0,2820.0,155.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,2.0,0.25,0.82,229900.0
50%,170.0,3.0,5032.0,360.0,1.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0,3.0,0.5,1.0,325000.0
75%,219.0,4.0,8310.0,792.0,1.0,78.0,0.5,0.0,0.0,0.0,1.0,13.0,4.0,0.75,1.16,439000.0
max,800.0,17.0,9992.0,1100000.0,1.0,55000.0,1.0,1.0,1.0,1.0,1.0,420.0,6.0,1.0,1.98,799950.0


In [576]:
for index,atribute in enumerate(df.columns.to_list()):
    print(atribute)
    print(df[df.columns.to_list()[index]].value_counts())
    print('\n')

surface
150    700
160    562
200    554
180    547
140    518
      ... 
623      1
49       1
633      1
453      1
595      1
Name: surface, Length: 554, dtype: int64


number_of_bedrooms
3     10647
4      5696
2      3687
5      1927
6       625
1       450
7       170
0       130
8        84
9        46
10       14
11       11
12        5
16        2
13        2
15        2
14        2
17        1
Name: number_of_bedrooms, dtype: int64


postal_code
9000    272
7700    199
9300    186
4000    166
9500    151
       ... 
6662      1
5372      1
3384      1
4161      1
6836      1
Name: postal_code, Length: 1035, dtype: int64


land_surface
0       2203
100      160
200      145
160      143
300      138
        ... 
3590       1
1861       1
2197       1
3430       1
5414       1
Name: land_surface, Length: 2808, dtype: int64


garden
1    11888
0    11613
Name: garden, dtype: int64


garden_surface
0       15964
100       246
200       219
300       167
500       152
        ... 

In [577]:
list_drop = [
    "postal_code",
    "garden",
    "garden_surface",
    "furnished",
    "open_fire",
    "terrace",
    "terrace_surface"]

df = df.drop(columns=list_drop, axis=1)

In [578]:
df

Unnamed: 0,surface,number_of_bedrooms,land_surface,fully_equipped_kitchen,swimming_pool,number_of_facades,state_of_the_building,zip_code_ponderation,price
0,150,2,198,0.00,0,3,0.25,0.82,100000
5,150,2,1013,0.00,0,4,0.25,0.94,179000
6,510,5,4137,0.25,1,3,0.25,0.63,275000
8,386,3,938,0.25,0,2,0.25,0.72,249000
10,245,4,632,0.50,0,2,0.50,0.92,260000
...,...,...,...,...,...,...,...,...,...
62420,595,3,332,0.50,0,2,0.50,1.38,725000
62422,258,4,969,0.25,0,4,0.75,1.06,439900
62424,135,4,80,0.50,0,2,1.00,1.38,425000
62427,170,1,170,0.50,0,2,1.00,0.94,235000


In [579]:
df.corr()["price"]

surface                   0.533437
number_of_bedrooms        0.384784
land_surface              0.055549
fully_equipped_kitchen    0.209290
swimming_pool             0.160394
number_of_facades         0.253947
state_of_the_building     0.173194
zip_code_ponderation      0.446084
price                     1.000000
Name: price, dtype: float64

# Split Data

In [580]:
y = df.iloc[:,-1].values  #price
x = df.iloc[:,:-1].values  #rest

In [581]:
x = x.astype(int)

In [582]:
y = y.astype(int)

# Linear regresion

### First split

In [583]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,shuffle=False)

In [584]:
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2, shuffle=False)

In [585]:
print("x_train:",x_train.shape)
print("x_dev:",x_dev.shape)
print("x_test:",x_test.shape)
print("y_train:",y_train.shape)
print("y_dev:",y_dev.shape)
print("y_test:",y_test.shape)

x_train: (15040, 8)
x_dev: (3760, 8)
x_test: (4701, 8)
y_train: (15040,)
y_dev: (3760,)
y_test: (4701,)


In [586]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler().fit(x_train)
# x_train = scaler.transform(x_train)

### Train the model

In [587]:
MLR = LinearRegression()

In [588]:
MLR.fit(x_train,y_train)

In [589]:
print(f'constant : {MLR.intercept_}')
print(f'constant : {MLR.coef_}')

constant : -51882.98617614037
constant : [7.19172934e+02 1.80009673e+04 3.80157532e-01 6.70890684e+04
 6.79427040e+04 3.97330668e+04 4.96918886e+04 1.45053100e+05]


In [590]:
y_predict = MLR.predict(x_test)

In [591]:
df_mlr =  pd.DataFrame(data=y_test, columns =['y_test'])
df_mlr['y_predict'] = y_predict
df_mlr
# df_mlr['delta'] = round((df_mlr['y_train'] - df_mlr['predict'])/df_mlr['y_train'],2)

Unnamed: 0,y_test,y_predict
0,362452,317453.000765
1,695000,680582.998591
2,360000,365298.717129
3,275000,322754.919063
4,369000,405982.977797
...,...,...
4696,725000,654673.256894
4697,439900,510021.239550
4698,425000,391450.763630
4699,235000,217600.028934


In [592]:
df_mlr.describe()

Unnamed: 0,y_test,y_predict
count,4701.0,4701.0
mean,385009.671559,373917.204803
std,145695.321429,110138.181628
min,75000.0,84442.262659
25%,275000.0,303552.764414
50%,365000.0,374159.672761
75%,469000.0,439750.562946
max,799850.0,869899.694252


### Predict a single value

In [593]:
test_objt = x_dev[1]  #test object

In [594]:
test_objt

array([114,   3, 260,   0,   0,   2,   0,   1])

In [595]:
y_dev[1] #desired output

370000

In [596]:
y_predict_single_objt = MLR.predict(test_objt.reshape(1,-1))
round(float(y_predict_single_objt),2)

308723.7

In [597]:
(round(float(y_predict_single_objt),2) - y_dev[1])/y_dev[1]

-0.1656116216216216

### Model evaluation

In [598]:
r_sqr = metrics.r2_score(y_test,y_predict)
print('R-square Error:',r_sqr)

R-square Error: 0.4013185518426796


In [599]:
n = 29632 #number of rows
p = 27 #number of columns
adj_r_sqr = 1-(1-r_sqr)*(n-1)/(n-p-1)
print('Adjusted R-square Error:',adj_r_sqr)

Adjusted R-square Error: 0.4007725310650736


In [600]:
import math
print(f"Square Root of R-Square {math.sqrt(r_sqr)}")
print(f"Square Root of Adjusted R-Square {math.sqrt(adj_r_sqr)}")

Square Root of R-Square 0.6334970811635043
Square Root of Adjusted R-Square 0.6330659768658189
