# C. Descriptive : sample description 

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [90]:
to_keep = pd.read_csv('DEF_IMMO.csv', sep=',', na_values='None')


In [20]:
to_keep.columns

Index(['Unnamed: 0', 'type_of_property', 'subtype_of_property', 'locality',
       'province', 'région', 'price', 'number_of_facades', 'house_area',
       'house_area_cat', 'number_of_rooms', 'number_of_rooms_cat',
       'surface_of_the_land', 'surface_of_the_land_cat',
       'fully_equipped_kitchen', 'garden', 'garden_area', 'terrace',
       'terrace_area', 'open_fire', 'swimming_pool', 'state_of_the_building',
       'construction_year', 'construction_year_cat'],
      dtype='object')

In [21]:
to_keep.head(3)

Unnamed: 0.1,Unnamed: 0,type_of_property,subtype_of_property,locality,province,région,price,number_of_facades,house_area,house_area_cat,...,fully_equipped_kitchen,garden,garden_area,terrace,terrace_area,open_fire,swimming_pool,state_of_the_building,construction_year,construction_year_cat
0,0,house,house,1050,Bruxelles-Capitale,Région Bruxelles-capitale,340000,2.0,203.0,]180-240 m²],...,1,0,,1,,0,0,to be done up,1901.0,1900-1919
1,1,house,villa,1880,Brabant flamand,Région flamande,525000,4.0,250.0,]240m²- ],...,1,1,430.0,1,40.0,0,0,as new,1992.0,1980-1999
2,2,house,exceptional property,4900,Liège,Région wallonne,550000,4.0,475.0,]240m²- ],...,1,1,1400.0,1,,0,0,good,1853.0,older than 1900


## Missing data 

In [22]:
to_keep.isnull().sum()

Unnamed: 0                     0
type_of_property               0
subtype_of_property            0
locality                       0
province                       0
région                         0
price                          0
number_of_facades          10644
house_area                   127
house_area_cat               127
number_of_rooms                0
number_of_rooms_cat            0
surface_of_the_land        19987
surface_of_the_land_cat    19987
fully_equipped_kitchen         0
garden                         0
garden_area                34488
terrace                        0
terrace_area               26057
open_fire                      0
swimming_pool                  0
state_of_the_building      10256
construction_year          16216
construction_year_cat      16216
dtype: int64

## 1. Type & subtype of properties

In [23]:
to_keep.groupby(['type_of_property']).subtype_of_property.count()

type_of_property
apartment    19852
house        22500
Name: subtype_of_property, dtype: int64

In [24]:
to_keep.groupby(['type_of_property', 'subtype_of_property']).subtype_of_property.count()

type_of_property  subtype_of_property 
apartment         apartment               15329
                  duplex                   1278
                  flat studio               724
                  ground floor             1098
                  kot                        67
                  loft                      199
                  penthouse                 830
                  service flat              255
                  triplex                    72
house             apartment block           794
                  bungalow                  226
                  castle                      7
                  chalet                    112
                  country cottage           236
                  exceptional property      381
                  farmhouse                  98
                  house                   16651
                  manor house                68
                  mansion                   394
                  mixed use building        741
 

## 2. Provinces & régions 

In [25]:
to_keep.groupby(['province']).province.count()

province
Anvers                 5510
Brabant flamand        3888
Brabant wallon         1806
Bruxelles-Capitale     4260
Flandre-Occidentale    7670
Flandre-Orientale      5234
Hainaut                4348
Limbourg               2625
Liège                  4179
Luxembourg             1182
Namur                  1650
Name: province, dtype: int64

In [26]:
to_keep.groupby(['région']).région.count()

région
Région Bruxelles-capitale     4260
Région flamande              24927
Région wallonne              13165
Name: région, dtype: int64

## 3. number_of_facades

In [27]:
to_keep.groupby(['number_of_facades']).province.count()

number_of_facades
1.0      460
2.0    15374
3.0     7616
4.0     8258
Name: province, dtype: int64

In [37]:
# % of missing values 
mask_appartment = to_keep['type_of_property'] == 'apartment'
mask_house = to_keep['type_of_property'] == 'house'
print(to_keep.groupby(['type_of_property']).subtype_of_property.count())
print('Check for Nan values number of facades in houses')
print(to_keep[mask_house].number_of_facades.isna().value_counts())
print('Check for Nan values number of facades in appartments')
print(to_keep[mask_appartment].number_of_facades.isna().value_counts())

type_of_property
apartment    19852
house        22500
Name: subtype_of_property, dtype: int64
Check for Nan values number of facades in houses
False    19626
True      2874
Name: number_of_facades, dtype: int64
Check for Nan values number of facades in appartments
False    12082
True      7770
Name: number_of_facades, dtype: int64


In [40]:
print('{}% missing values in  "number_of_facades" for houses'.format(round((2874/22500)*100, 1)))
print('{}% missing values in "number_of_facades" for appartments'.format(round((7770/19626)*100, 1)))

12.8% missing values in  "number_of_facades" for houses
39.6% missing values in "number_of_facades" for appartments


## 4. house_area_cat

In [28]:
to_keep.groupby(['house_area_cat']).province.count()

house_area_cat
[0-60 m²]        2699
]120-180 m²]    11653
]180-240 m²]     5803
]240m²- ]        5245
]60-120 m²]     16825
Name: province, dtype: int64

In [79]:
# % of missing values for house_area
to_keep['house_area'].isna().value_counts()

False    42225
True       127
Name: house_area, dtype: int64

In [91]:
print('{}% missing house_area values'.format(round((127/42351)*100, 1)))

0.3% missing house_area values


## 5. number_of_rooms_cat

In [29]:
to_keep.groupby(['number_of_rooms_cat']).province.count()

number_of_rooms_cat
0 chambre(s)      817
1 chambre(s)     4313
2 chambre(s)    14063
3 chambre(s)    13564
4 chambre(s)     5936
5 et plus        3659
Name: province, dtype: int64

## 6. surface_of_the_land

In [82]:
print(to_keep.groupby(['type_of_property']).subtype_of_property.count())
print(to_keep['surface_of_the_land'].isna().value_counts())

type_of_property
apartment    19852
house        22500
Name: subtype_of_property, dtype: int64
False    22365
True     19987
Name: surface_of_the_land, dtype: int64


In [84]:
# % of missing values for surface_of_the_land
print('{}% missing surface_of_the_land values for houses'.format(round(((19987-19852)/22500)*100, 1)))

0.6% missing surface_of_the_land values for houses


## 7. garden

In [30]:
to_keep.groupby(['garden']).province.count()

garden
0    28990
1    13362
Name: province, dtype: int64

In [81]:
# % of missing values for garden_area 
print(to_keep['garden_area'].isna().value_counts())

True     34488
False     7864
Name: garden_area, dtype: int64


In [87]:
# subtract the no garden number before making the percentage 
print('{}% missing garden_area values'.format(round(((34488-28990)/13362)*100, 1)))

41.1% missing garden_area values


## 8. terrace

In [31]:
to_keep.groupby(['terrace']).province.count()

terrace
0    16497
1    25855
Name: province, dtype: int64

In [56]:
# % of missing values for terrace_area 
to_keep['terrace_area'].isna().value_counts()

True     26057
False    16295
Name: terrace_area, dtype: int64

In [58]:
# subtract the no terrace number before making the percentage 
print('{}% missing terrace_area values'.format(round(((26057-16497)/25855)*100, 1)))

37.0% missing terrace_area values


## 9. open_fire

In [32]:
to_keep.groupby(['open_fire']).province.count()

open_fire
0    40153
1     2199
Name: province, dtype: int64

## 10. state_of_the_building

In [33]:
to_keep.groupby(['state_of_the_building']).province.count()

state_of_the_building
as new            12489
good              11601
just renovated     2271
to be done up      2987
to renovate        2591
to restore          157
Name: province, dtype: int64

In [60]:
# % of missing values for state_of_the_building
to_keep['state_of_the_building'].isna().value_counts()

False    32096
True     10256
Name: state_of_the_building, dtype: int64

In [92]:
print('{}% missing state_of_the_building values'.format(round((10256/42351)*100, 1)))

24.2% missing state_of_the_building values


## 11. construction_year_cat

In [34]:
to_keep.groupby(['construction_year_cat']).province.count()

construction_year_cat
1900-1919            801
1920-1939           1719
1940-1959           2173
1960-1979           5673
1980-1999           3248
2000-              11813
older than 1900      709
Name: province, dtype: int64

In [62]:
# % of missing values for construction_year
to_keep['construction_year'].isna().value_counts()

False    26136
True     16216
Name: construction_year, dtype: int64

In [93]:
print('{}% missing constructuion_year values'.format(round((16216/42351)*100, 1)))

38.3% missing constructuion_year values
