In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from scipy.stats import zscore

In [2]:
# Load the raw property listings; the cleaning cells below filter and recode this frame.
df = pd.read_csv(filepath_or_buffer='Property_structured_data.csv')

In [3]:
# Remember the (rows, columns) size of the data before any cleaning,
# so it can be compared with the cleaned size later on.
initial_row_count, initial_column_count = df.shape
initial_df = (initial_row_count, initial_column_count)

In [4]:
# Snapshot of the column labels before any column is dropped.
initial_df_columns = df.columns.tolist()

## Clean Database

### Price

In [5]:
# Keep only rows whose price differs from -1 (presumably the missing-value marker),
# then display the remaining (rows, columns) shape.
has_price = df['price'].ne(-1)
df = df[has_price]
df.shape

(59429, 23)

### Surface

In [6]:
# Keep only rows whose surface differs from -1 (presumably the missing-value marker),
# then display the remaining (rows, columns) shape.
has_surface = df['surface'].ne(-1)
df = df[has_surface]
df.shape

(49412, 23)

### Sub Type of property

In [7]:
# Count the remaining listings per property subtype, most frequent first.
df['subtype_of_property'].value_counts()

HOUSE                   20679
APARTMENT               15864
VILLA                    2954
APARTMENT_BLOCK          1414
DUPLEX                   1247
GROUND_FLOOR             1146
PENTHOUSE                1009
MIXED_USE_BUILDING        993
FLAT_STUDIO               693
EXCEPTIONAL_PROPERTY      636
MANSION                   494
SERVICE_FLAT              483
TOWN_HOUSE                376
COUNTRY_COTTAGE           301
BUNGALOW                  243
LOFT                      231
FARMHOUSE                 155
TRIPLEX                   119
CHALET                    107
MANOR_HOUSE                76
OTHER_PROPERTY             72
CASTLE                     61
KOT                        58
PAVILION                    1
Name: subtype_of_property, dtype: int64

In [8]:
# Remove every listing whose subtype is APARTMENT_BLOCK and show the new shape.
not_apartment_block = df['subtype_of_property'].ne('APARTMENT_BLOCK')
df = df[not_apartment_block]
df.shape

(47998, 23)

In [9]:
# Remove every listing whose subtype is MIXED_USE_BUILDING and show the new shape.
# The explicit .copy() makes `df` an independent frame rather than a filtered
# slice, so the column assignments in the following cells
# (df['col'] = ...) do not raise SettingWithCopyWarning.
df = df.loc[df['subtype_of_property'] != 'MIXED_USE_BUILDING'].copy()
df.shape

(47005, 23)

### Number of bedrooms

In [10]:
# Recode the -1 marker as 0 bedrooms.
# NOTE(review): the value counts displayed right after this cell still contain
# implausible entries (e.g. 204 and 2020 bedrooms) — consider filtering them
# before fitting a model.
df['number_of_bedrooms'] = df['number_of_bedrooms'].replace({-1: 0})

In [11]:
# Distribution of bedroom counts after the -1 -> 0 recoding.
df['number_of_bedrooms'].value_counts()

3       15278
2       14724
4        6945
1        4875
5        2662
6        1032
0         755
7         323
8         168
9          89
10         52
12         21
11         21
14         13
13          7
16          5
15          5
20          4
17          4
18          3
25          3
19          2
28          2
22          2
23          1
26          1
66          1
70          1
24          1
45          1
30          1
204         1
31          1
2020        1
Name: number_of_bedrooms, dtype: int64

### Land surface

#### Land surface of apartments = garden 

For apartments, the garden area and the land surface are treated as the same quantity.

#### Land surface = 0

In [12]:
# Recode the -1 marker as a land surface of 0.
df['land_surface'] = df['land_surface'].replace(to_replace=-1, value=0)

In [13]:
# Distribution of land-surface values after the -1 -> 0 recoding.
df['land_surface'].value_counts()

0         23206
100         167
200         157
300         147
160         145
          ...  
2761          1
6832          1
2006          1
2507          1
175000        1
Name: land_surface, Length: 3518, dtype: int64

### Garden

In [14]:
# Recode the -1 marker as 0 (no garden), leaving a 0/1 flag.
df['garden'] = df['garden'].replace({-1: 0})

In [15]:
# Check that the garden flag now only takes the values 0 and 1.
df['garden'].value_counts()

0    30840
1    16165
Name: garden, dtype: int64

### Garden surface

In [16]:
# Garden surfaces of -1 and of 1 are both recoded as 0.
# NOTE(review): 1 is presumably a placeholder rather than a real 1 m² garden — confirm.
df['garden_surface'] = df['garden_surface'].replace([-1, 1], 0)

In [17]:
# Distribution of garden-surface values after recoding -1 and 1 to 0.
df['garden_surface'].value_counts()

0       37111
100       304
200       270
50        198
300       196
        ...  
292         1
1115        1
568         1
1043        1
7200        1
Name: garden_surface, Length: 1413, dtype: int64

### Fully equipped kitchen

In [18]:
# Lookup table turning the kitchen labels (stored as strings) into a numeric score.
# Series.map leaves NaN for any label that is not a key of this table.
kitchen_scores = {
    "-1.0": 0.25,
    "1.0": 1,
    "-1": 0.25,
    "1": 1,
    "INSTALLED": 0.5,
    "SEMI_EQUIPPED": 0.75,
    "NOT_INSTALLED": 0,
    "USA_INSTALLED": 0.5,
    "USA_SEMI_EQUIPPED": 0.75,
    "USA_UNINSTALLED": 0,
}
df["fully_equipped_kitchen"] = df["fully_equipped_kitchen"].map(kitchen_scores)

In [19]:
# Distribution of the numeric kitchen scores produced by the mapping.
df["fully_equipped_kitchen"].value_counts()

0.50    17109
0.25    15316
1.00     8351
0.75     4372
0.00     1857
Name: fully_equipped_kitchen, dtype: int64

### Swimming pool

In [20]:
# Recode the -1 marker as 0 (no swimming pool), leaving a 0/1 flag.
df['swimming_pool'] = df['swimming_pool'].replace(to_replace=-1, value=0)

In [21]:
# Check that the swimming-pool flag now only takes the values 0 and 1.
df['swimming_pool'].value_counts()

0    45865
1     1140
Name: swimming_pool, dtype: int64

### Furnished

In [22]:
# Recode the -1 marker as 0 (not furnished), leaving a 0/1 flag.
df['furnished'] = df['furnished'].replace({-1: 0})

In [23]:
# Check that the furnished flag now only takes the values 0 and 1.
df['furnished'].value_counts()

0    45756
1     1249
Name: furnished, dtype: int64

### Open fire

In [24]:
# Recode the -1 marker as 0 (no open fire), leaving a 0/1 flag.
df['open_fire'] = df['open_fire'].replace(to_replace=-1, value=0)

In [25]:
# Check that the open-fire flag now only takes the values 0 and 1.
df['open_fire'].value_counts()

0    44779
1     2226
Name: open_fire, dtype: int64

### Terrace

In [26]:
# Recode the -1 marker as 0 (no terrace), leaving a 0/1 flag.
df['terrace'] = df['terrace'].replace({-1: 0})

In [27]:
# Check that the terrace flag now only takes the values 0 and 1.
df['terrace'].value_counts()

1    30074
0    16931
Name: terrace, dtype: int64

### Terrace surface

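Two listings report a terrace larger than 1000 m², which is implausible; we therefore keep only rows with a terrace surface below 500 m².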

In [28]:
# Keep only listings whose terrace surface is below 500; rows still holding the
# -1 marker satisfy this test and are kept.
# .copy() detaches the result from the filtered slice so that the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df.loc[df['terrace_surface'] < 500].copy()

In [29]:
# Recode the -1 marker as a terrace surface of 0.
df['terrace_surface'] = df['terrace_surface'].replace(to_replace=-1, value=0)

In [30]:
# Distribution of terrace-surface values after filtering and the -1 -> 0 recoding.
df['terrace_surface'].value_counts()

0      27371
20      1197
10      1088
15       948
12       870
       ...  
137        1
190        1
210        1
370        1
290        1
Name: terrace_surface, Length: 195, dtype: int64

### Facades

In [31]:
# Keep only listings reporting fewer than 9 facades; rows still holding the -1
# marker satisfy this test and are imputed in the next cell.
# .copy() detaches the result from the filtered slice so that the column
# assignments that follow do not raise SettingWithCopyWarning.
df = df.loc[df["number_of_facades"] < 9].copy()

In [32]:
# Fill in facade counts recorded as -1: apartments get 1 facade, houses get 2.
# Every other row keeps its current value. This cell relies on
# type_of_property still holding its original string labels.
facades_missing = df["number_of_facades"] == -1
is_apartment = df["type_of_property"] == "APARTMENT"
is_house = df["type_of_property"] == "HOUSE"
df["number_of_facades"] = np.select(
    [facades_missing & is_apartment, facades_missing & is_house],
    [1, 2],
    default=df["number_of_facades"].to_numpy(),
)

In [33]:
# Distribution of facade counts after filtering and imputing the -1 values.
df['number_of_facades'].value_counts()

2    20799
4     9804
3     8238
1     8126
6        6
5        6
8        4
Name: number_of_facades, dtype: int64

### State of the building

In [34]:
# Lookup table turning the building-state labels into a numeric score.
# Series.map leaves NaN for any label that is not a key of this table.
building_state_scores = {
    "NO_INFO": 0.25,
    "TO_BE_DONE_UP": 0.75,
    "TO_RENOVATE": 0.25,
    "TO_RESTORE": 0.25,
    "JUST_RENOVATED": 0.75,
    "GOOD": 0.5,
    "AS_NEW": 1.0,
}
df["state_of_the_building"] = df["state_of_the_building"].map(building_state_scores)

In [35]:
# Distribution of the numeric building-state scores produced by the mapping.
df["state_of_the_building"].value_counts()

0.25    16805
0.50    12785
1.00    11216
0.75     6177
Name: state_of_the_building, dtype: int64

### Type of property

In [36]:
# Encode the property type as an integer code: APARTMENT -> 1, HOUSE -> 2.
# Series.map leaves NaN for any other label.
property_type_codes = {'APARTMENT': 1, 'HOUSE': 2}
df['type_of_property'] = df['type_of_property'].map(property_type_codes)

In [37]:
# Number of listings per encoded property type (1 = APARTMENT, 2 = HOUSE).
df['type_of_property'].value_counts()

2    26138
1    20845
Name: type_of_property, dtype: int64

### API list

In [38]:
# Columns kept for the rest of the notebook, in the order they will appear.
api_list = [
    'price',
    'surface',
    'type_of_property',
    'number_of_bedrooms',
    'postal_code',
    'land_surface',
    'garden',
    'garden_surface',
    'fully_equipped_kitchen',
    'swimming_pool',
    'furnished',
    'open_fire',
    'terrace',
    'terrace_surface',
    'number_of_facades',
    'state_of_the_building',
]

In [39]:
# Restrict the frame to the columns named in api_list, in that order.
df = df.loc[:, api_list]

In [40]:
# Record the (rows, columns) size of the cleaned data for the before/after report.
cleaned_row_count, cleaned_column_count = df.shape
claned_df = (cleaned_row_count, cleaned_column_count)

## Check database

In [41]:
# Print the shape of the data before and after cleaning, side by side.
print(f'initial_df {initial_df} claned_df {claned_df}')

initial_df (62430, 23) claned_df (46983, 16)


In [42]:
# Summary statistics of every retained column; useful for spotting remaining
# outliers (compare each max with its 75% quantile).
df.describe()

Unnamed: 0,price,surface,type_of_property,number_of_bedrooms,postal_code,land_surface,garden,garden_surface,fully_equipped_kitchen,swimming_pool,furnished,open_fire,terrace,terrace_surface,number_of_facades,state_of_the_building
count,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0,46983.0
mean,411866.8,168.096907,1.556329,2.897878,4989.489773,855.1362,0.343805,185.388013,0.510887,0.024222,0.026584,0.047336,0.639678,10.437158,2.421131,0.56281
std,432598.9,220.547766,0.496822,9.474797,3045.797793,9203.217,0.474982,2441.990116,0.283467,0.153738,0.160866,0.212359,0.480099,22.146879,1.005937,0.293786
min,2500.0,1.0,1.0,0.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25
25%,225000.0,94.0,1.0,2.0,2140.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,2.0,0.25
50%,300000.0,135.0,2.0,3.0,4250.0,50.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0,2.0,0.5
75%,445000.0,194.0,2.0,3.0,8370.0,471.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,14.0,3.0,0.75
max,15000000.0,22617.0,2.0,2020.0,9992.0,1100000.0,1.0,150000.0,1.0,1.0,1.0,1.0,1.0,420.0,8.0,1.0


In [43]:
# For every column, print its name followed by its ten most frequent values.
for column_name in df.columns.to_list():
    top_values = df[column_name].value_counts().head(10)
    print(column_name)
    print(top_values)
    print('\n')

price
299000    762
249000    672
295000    666
199000    623
275000    599
395000    558
399000    502
225000    498
349000    495
325000    490
Name: price, dtype: int64


surface
100    890
150    884
90     835
120    804
110    717
140    694
80     679
160    670
200    665
130    649
Name: surface, dtype: int64


type_of_property
2    26138
1    20845
Name: type_of_property, dtype: int64


number_of_bedrooms
3    15271
2    14722
4     6939
1     4873
5     2662
6     1029
0      754
7      323
8      168
9       89
Name: number_of_bedrooms, dtype: int64


postal_code
1180    749
8300    706
1000    693
9000    686
8400    583
1050    569
8370    517
2000    493
1070    420
1030    419
Name: postal_code, dtype: int64


land_surface
0      23198
100      167
200      157
300      147
160      145
250      140
150      140
120      135
170      131
180      129
Name: land_surface, dtype: int64


garden
0    30830
1    16153
Name: garden, dtype: int64


garden_surface
0      37101


In [44]:
# Write the cleaned frame to disk without the index column.
df.to_csv(path_or_buf="regresion_data.csv", index=False)

# Zipcode

In [45]:
# Number of listings per full postal code, most frequent first.
df['postal_code'].value_counts()

1180    749
8300    706
1000    693
9000    686
8400    583
       ... 
5564      1
8956      1
8957      1
8950      1
6922      1
Name: postal_code, Length: 1048, dtype: int64

In [46]:
# Tally listings by the first two characters of their postal code.
# Keys are inserted in the order in which each prefix is first met.
zip_counts = {}
for postal_code in df['postal_code']:
    prefix = str(postal_code)[:2]
    if prefix in zip_counts:
        zip_counts[prefix] += 1
    else:
        zip_counts[prefix] = 1

In [47]:
# Number of distinct two-character postal-code prefixes.
len(zip_counts)

80

In [48]:
# Display the prefix -> listing-count mapping.
zip_counts

{'53': 265,
 '55': 402,
 '50': 543,
 '51': 239,
 '56': 302,
 '69': 312,
 '66': 170,
 '67': 407,
 '68': 270,
 '10': 3494,
 '11': 1661,
 '12': 360,
 '71': 551,
 '61': 347,
 '70': 669,
 '60': 843,
 '65': 160,
 '77': 500,
 '73': 512,
 '62': 511,
 '78': 348,
 '76': 111,
 '79': 131,
 '75': 287,
 '64': 53,
 '45': 321,
 '41': 457,
 '40': 1074,
 '44': 411,
 '46': 374,
 '47': 88,
 '49': 237,
 '48': 442,
 '42': 162,
 '43': 171,
 '13': 947,
 '14': 1153,
 '21': 999,
 '28': 1009,
 '22': 628,
 '26': 961,
 '24': 553,
 '25': 702,
 '29': 899,
 '23': 707,
 '20': 1336,
 '85': 994,
 '84': 1275,
 '88': 709,
 '83': 1628,
 '86': 872,
 '87': 665,
 '89': 412,
 '80': 337,
 '82': 187,
 '39': 447,
 '35': 820,
 '37': 286,
 '38': 381,
 '36': 423,
 '16': 490,
 '34': 163,
 '31': 232,
 '33': 301,
 '30': 721,
 '17': 744,
 '15': 320,
 '19': 321,
 '18': 541,
 '32': 321,
 '98': 625,
 '95': 403,
 '93': 570,
 '91': 935,
 '90': 1272,
 '99': 522,
 '97': 244,
 '96': 545,
 '92': 795,
 '94': 403}

In [49]:
# Keep the prefixes that occur in more than 3 listings, preserving the order of zip_counts.
zip_cleaned = [zipcode for zipcode, count in zip_counts.items() if count > 3]

In [50]:
# Number of prefixes that passed the "more than 3 listings" filter.
len(zip_cleaned)

80

In [51]:
# Display the retained postal-code prefixes.
zip_cleaned

['53',
 '55',
 '50',
 '51',
 '56',
 '69',
 '66',
 '67',
 '68',
 '10',
 '11',
 '12',
 '71',
 '61',
 '70',
 '60',
 '65',
 '77',
 '73',
 '62',
 '78',
 '76',
 '79',
 '75',
 '64',
 '45',
 '41',
 '40',
 '44',
 '46',
 '47',
 '49',
 '48',
 '42',
 '43',
 '13',
 '14',
 '21',
 '28',
 '22',
 '26',
 '24',
 '25',
 '29',
 '23',
 '20',
 '85',
 '84',
 '88',
 '83',
 '86',
 '87',
 '89',
 '80',
 '82',
 '39',
 '35',
 '37',
 '38',
 '36',
 '16',
 '34',
 '31',
 '33',
 '30',
 '17',
 '15',
 '19',
 '18',
 '32',
 '98',
 '95',
 '93',
 '91',
 '90',
 '99',
 '97',
 '96',
 '92',
 '94']