In [61]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point

In [62]:
file_path =  r'C:\Users\Rik\Desktop\immo-eliza-team6-analysis\analysis\clean_data.csv'

df = pd.read_csv(file_path, sep = ',')

In [63]:
df.head(30)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5852 entries, 0 to 5851
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  5852 non-null   int64  
 1   locality_name       5852 non-null   object 
 2   Postal_code         5852 non-null   int64  
 3   Price               5852 non-null   int64  
 4   Subtype             5852 non-null   object 
 5   Number_of_bedrooms  5852 non-null   int64  
 6   Living_area         5852 non-null   int64  
 7   street              5852 non-null   object 
 8   number              5831 non-null   object 
 9   latitude            5847 non-null   float64
 10  longitude           5847 non-null   float64
 11  Open_fire           5852 non-null   int64  
 12  Swimming_Pool       5852 non-null   int64  
 13  hasTerrace          5852 non-null   int64  
 14  terraceSurface      5852 non-null   int64  
 15  gardenSurface       5852 non-null   int64  
 16  Kitche

In [64]:
# transform 'int64' to 'Int64' to handle NaN

int_columns = df.select_dtypes(include=['int64'])

for col in int_columns.columns:
    df[col] = df[col].astype('Int64')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5852 entries, 0 to 5851
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  5852 non-null   Int64  
 1   locality_name       5852 non-null   object 
 2   Postal_code         5852 non-null   Int64  
 3   Price               5852 non-null   Int64  
 4   Subtype             5852 non-null   object 
 5   Number_of_bedrooms  5852 non-null   Int64  
 6   Living_area         5852 non-null   Int64  
 7   street              5852 non-null   object 
 8   number              5831 non-null   object 
 9   latitude            5847 non-null   float64
 10  longitude           5847 non-null   float64
 11  Open_fire           5852 non-null   Int64  
 12  Swimming_Pool       5852 non-null   Int64  
 13  hasTerrace          5852 non-null   Int64  
 14  terraceSurface      5852 non-null   Int64  
 15  gardenSurface       5852 non-null   Int64  
 16  Kitche

In [65]:
# transform dtypes 'object' to 'category'
int_columns = df.select_dtypes(include=['object'])

for col in int_columns.columns:
    df[col] = df[col].astype('category')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5852 entries, 0 to 5851
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   id                  5852 non-null   Int64   
 1   locality_name       5852 non-null   category
 2   Postal_code         5852 non-null   Int64   
 3   Price               5852 non-null   Int64   
 4   Subtype             5852 non-null   category
 5   Number_of_bedrooms  5852 non-null   Int64   
 6   Living_area         5852 non-null   Int64   
 7   street              5852 non-null   category
 8   number              5831 non-null   category
 9   latitude            5847 non-null   float64 
 10  longitude           5847 non-null   float64 
 11  Open_fire           5852 non-null   Int64   
 12  Swimming_Pool       5852 non-null   Int64   
 13  hasTerrace          5852 non-null   Int64   
 14  terraceSurface      5852 non-null   Int64   
 15  gardenSurface       5852 non-null   In

In [66]:
#Getting an idea of the values in columns with dtype 'category'
pd.set_option('display.max_rows', 30)

for col in df.select_dtypes(include=['category']).columns:
    print(f"Counts for {col}:")
    print(df[col].value_counts())
    unique_count = df[col].nunique()
    print("\n")
    print(f"Number of unique entries in {col}: {unique_count}\n")

Counts for locality_name:
locality_name
Gent              181
Antwerpen         109
Aalst              79
Ronse              78
Geraardsbergen     65
                 ... 
Wilsele             1
Winksele            1
Wodecq              1
Woesten             1
Wommersom           1
Name: count, Length: 1172, dtype: int64


Number of unique entries in locality_name: 1172

Counts for Subtype:
Subtype
House              5101
Villa               411
Town house           99
Bungalow             79
Mansion              72
Country cottage      52
Farmhouse            23
Chalet               13
Manor house           2
Name: count, dtype: int64


Number of unique entries in Subtype: 9

Counts for street:
street
Stationsstraat         36
Molenstraat            35
Kerkstraat             25
Hoogstraat             18
Veldstraat             17
                       ..
Boterhoek               1
Zwartkloosterstraat     1
Zwevegemsestraat        1
Botermelkstraat         1
Sterrebos               1
Nam

In [67]:
#POSTAL_CODE

#Converting the postal codes to dtype 'str'

df['Postal_code'] = df['Postal_code'].astype(str)

In [68]:
pd.set_option('display.max_rows', None)
df['Postal_code'].value_counts()

Postal_code
9000    143
9600     78
9300     69
9500     67
9700     62
8800     61
9900     49
2800     48
9620     47
8000     46
9050     44
8400     43
9550     43
8870     43
3300     43
9230     42
9800     41
1600     38
9940     38
4000     37
8310     37
9340     36
9420     35
9200     35
1790     34
9255     34
2660     34
6700     33
9400     33
8560     32
2830     32
8200     32
9280     32
8500     32
4100     31
9160     31
9820     31
9100     31
4400     30
9810     29
9660     28
2500     28
9240     27
8501     26
2880     26
8820     26
2940     24
9520     24
2580     24
1840     23
9860     23
6940     23
8020     23
8680     23
9890     23
9040     23
8930     22
8790     22
9220     22
2018     22
8830     21
9630     21
2140     21
9140     21
8600     21
3800     21
2610     20
1500     20
1800     20
4020     20
2060     20
9090     20
9070     20
9990     20
6800     19
9032     19
8470     19
8530     19
2100     19
9570     18
8770     18
9080     18
4040

In [69]:
#dropping a dutch postal code
df = df[df['Postal_code'] != '4524 JM']


In [70]:
df['Postal_code'].value_counts()

Postal_code
9000    143
9600     78
9300     69
9500     67
9700     62
8800     61
9900     49
2800     48
9620     47
8000     46
9050     44
8400     43
9550     43
8870     43
3300     43
9230     42
9800     41
1600     38
9940     38
4000     37
8310     37
9340     36
9420     35
9200     35
1790     34
9255     34
2660     34
6700     33
9400     33
8560     32
2830     32
8200     32
9280     32
8500     32
4100     31
9160     31
9820     31
9100     31
4400     30
9810     29
9660     28
2500     28
9240     27
8501     26
2880     26
8820     26
2940     24
9520     24
2580     24
1840     23
9860     23
6940     23
8020     23
8680     23
9890     23
9040     23
8930     22
8790     22
9220     22
2018     22
8830     21
9630     21
2140     21
9140     21
8600     21
3800     21
2610     20
1500     20
1800     20
4020     20
2060     20
9090     20
9070     20
9990     20
6800     19
9032     19
8470     19
8530     19
2100     19
9570     18
8770     18
9080     18
4040

In [71]:
#Aggregate Postal code by price
Postal_aggregates = df.groupby('Postal_code').agg({'Price': ['mean', 'median']}).reset_index()


In [72]:
#creating the column province

def get_province(postal_code):
    if postal_code.startswith('1'):
        return 'Brussels' if int(postal_code) < 1300 else 'Brabant_Wallon'
    elif postal_code.startswith('2'):
        return 'Antwerp'
    elif postal_code.startswith('4'):
        return 'Liège'
    elif postal_code.startswith('5'):
        return 'Namur'
    elif postal_code.startswith('6'):
        return 'Luxembourg'
    elif postal_code.startswith('7'):
        return 'Hainaut'
    elif postal_code.startswith('8'):
        return 'West Flanders'
    elif postal_code.startswith('9'):
        return 'East Flanders'
    elif postal_code.startswith('3'):
        return 'Flemish Brabant' if int(postal_code) < 3500 else 'Limburg'
    else:
        return None 

df['Province'] = df['Postal_code'].apply(get_province)

df['Province'] = df['Province'].astype(str)
df['Postal_code'] = df['Postal_code'].astype(str)

In [73]:
#creating the main cities:
cities_data = {'City': ['Brussels', 'Antwerp', 'Ghent', 'Bruges', 'Liège','Namur', 'Leuven', 'Mons', 'Aalst', 'Sint-Niklaas'],
               'Latitude': [50.8503, 51.2211, 51.0543, 51.2093, 50.6050, 50.4674, 50.8798, 50.4542, 50.9403, 51.1449],
               'Longitude': [4.3517, 4.4120, 3.7174, 3.2240, 5.5797, 4.8712, 4.7033, 3.9514, 4.0364, 4.1525],
               'Radius': [5 for x in range(10)]}

cities_df = pd.DataFrame(cities_data)

cities_df.head(10)


Unnamed: 0,City,Latitude,Longitude,Radius
0,Brussels,50.8503,4.3517,5
1,Antwerp,51.2211,4.412,5
2,Ghent,51.0543,3.7174,5
3,Bruges,51.2093,3.224,5
4,Liège,50.605,5.5797,5
5,Namur,50.4674,4.8712,5
6,Leuven,50.8798,4.7033,5
7,Mons,50.4542,3.9514,5
8,Aalst,50.9403,4.0364,5
9,Sint-Niklaas,51.1449,4.1525,5


In [74]:
#Make a geodataframe
cities_gdf = gpd.GeoDataFrame(cities_df,geometry=gpd.points_from_xy(cities_df.Longitude, cities_df.Latitude))

In [78]:
#Creating the buffer/radius zone

cities_gdf['geometry'] = cities_gdf.geometry.buffer(cities_gdf['Radius'] / 111)

In [79]:
print(cities_gdf)

           City  Latitude  Longitude  Radius  \
0      Brussels   50.8503     4.3517       5   
1       Antwerp   51.2211     4.4120       5   
2         Ghent   51.0543     3.7174       5   
3        Bruges   51.2093     3.2240       5   
4         Liège   50.6050     5.5797       5   
5         Namur   50.4674     4.8712       5   
6        Leuven   50.8798     4.7033       5   
7          Mons   50.4542     3.9514       5   
8         Aalst   50.9403     4.0364       5   
9  Sint-Niklaas   51.1449     4.1525       5   

                                            geometry  
0  POLYGON ((4.39675 50.8503, 4.39653 50.84588, 4...  
1  POLYGON ((4.45705 51.2211, 4.45683 51.21668, 4...  
2  POLYGON ((3.76245 51.0543, 3.76223 51.04988, 3...  
3  POLYGON ((3.26905 51.2093, 3.26883 51.20488, 3...  
4  POLYGON ((5.62475 50.605, 5.62453 50.60058, 5....  
5  POLYGON ((4.91625 50.4674, 4.91603 50.46298, 4...  
6  POLYGON ((4.74835 50.8798, 4.74813 50.87538, 4...  
7  POLYGON ((3.99645 50.4542, 3

In [80]:
cities_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   City       10 non-null     object  
 1   Latitude   10 non-null     float64 
 2   Longitude  10 non-null     float64 
 3   Radius     10 non-null     int64   
 4   geometry   10 non-null     geometry
dtypes: float64(2), geometry(1), int64(1), object(1)
memory usage: 532.0+ bytes


In [85]:
#checking and slicing original data and creating a new datframe house_geo
house_geo= pd.DataFrame(df[['id', 'latitude', 'longitude']])

house_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5852 entries, 0 to 5851
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         5852 non-null   Int64  
 1   latitude   5847 non-null   float64
 2   longitude  5847 non-null   float64
dtypes: Int64(1), float64(2)
memory usage: 143.0 KB


In [75]:
#SUBTYPE

#Here we will use One-Hot Encoding (pd.get_dummies) because this is a nominal (unordered) column and the number of categories is small.
df_one_hot_subtype = pd.get_dummies(df, columns=['Subtype'], prefix='Subtype', drop_first=True)



In [76]:
df_one_hot_subtype.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5852 entries, 0 to 5851
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   id                       5852 non-null   Int64   
 1   locality_name            5852 non-null   category
 2   Postal_code              5852 non-null   object  
 3   Price                    5852 non-null   Int64   
 4   Number_of_bedrooms       5852 non-null   Int64   
 5   Living_area              5852 non-null   Int64   
 6   street                   5852 non-null   category
 7   number                   5831 non-null   category
 8   latitude                 5847 non-null   float64 
 9   longitude                5847 non-null   float64 
 10  Open_fire                5852 non-null   Int64   
 11  Swimming_Pool            5852 non-null   Int64   
 12  hasTerrace               5852 non-null   Int64   
 13  terraceSurface           5852 non-null   Int64   
 14  gardenSu

In [77]:
#KITCHEN_TYPE
