# ImmoEliza Data Analysis

## Data Cleaning

### Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine writes to filtered slices — confirm it is still wanted.
pd.options.mode.chained_assignment = None

### Loading the dataset into a variable called "data"

In [None]:
# Load the listings into the notebook-wide ``data`` frame.
data = pd.read_json("final_dataset.json")

# Widen the display limits so more columns are shown and long cell values
# (such as the listing URL) are not truncated when a frame is rendered.
pd.set_option("display.max_columns", 35)
pd.set_option("display.max_colwidth", 120)

data.shape

### Checking if there are any duplicates

In [None]:
# Keep the first row seen for every PropertyId and modify ``data`` in place;
# comparing the shape with the previous cell shows how many rows were removed.
data.drop_duplicates(subset="PropertyId", keep="first", inplace=True)
data.shape



### There seem to be none, so let's move on to the next step: checking for null values

In [None]:
# Number of missing values in each column.
data.isna().sum()

### Get rid of null values when possible

In [None]:
# Columns whose missing values are replaced by 0.
zero_fill_columns = [
    "BathroomCount",
    "Fireplace",
    "Furnished",
    "Garden",
    "GardenArea",
    "SwimmingPool",
    "Terrace",
    "ToiletCount",
]

# Fill on a sub-frame, then write the filled values back into ``data`` in place.
zero_filled = data[zero_fill_columns].fillna(0)
data.update(zero_filled)
data.shape

### Get rid of potential encoding errors

In [None]:
# Sale types that are removed from the dataset altogether.
annuity_sale_types = [
    "annuity_monthly_amount",
    "annuity_without_lump_sum",
    "annuity_lump_sum",
]

# One boolean mask flagging every row that breaks at least one plausibility
# rule. Comparisons involving NaN evaluate to False, so rows with missing
# values in the compared columns are kept.
implausible_rows = (
    (data["BathroomCount"] > data["BedroomCount"])
    | (data["ConstructionYear"] > 2033)
    | (data["GardenArea"] > data["SurfaceOfPlot"])
    | (data["PostalCode"] < 1000)
    | (data["NumberOfFacades"] > 4)
    | (data["Price"] > 15000000)
    | (data["ToiletCount"] > 58)
    | (data["ShowerCount"] > 58)
    | data["TypeOfSale"].isin(annuity_sale_types)
)

# Remove the flagged rows from ``data`` in place.
data.drop(index=data.index[implausible_rows], inplace=True)

data.shape

### Great, it looks like we removed more than 4000 erroneous rows!

### Separate sale data and rent data to allow a more detailed analysis

In [None]:
# Split the cleaned listings by sale type.
# Explicit copies are taken because later cells modify these frames (for
# example by replacing values); an independent copy makes that unambiguous
# instead of writing to a filtered slice of ``data``.
sale_data = data[data.TypeOfSale == "residential_sale"].copy()
rent_data = data[data.TypeOfSale == "residential_monthly_rent"].copy()
print(sale_data.shape)
print(rent_data.shape)

## Data Analysis

### See how many rows and columns we have

In [None]:
# (rows, columns) of the sale subset first, then the rent subset.
for subset in (sale_data, rent_data):
    print(subset.shape)

### See correlation between price and place

#### price/region

In [None]:
# Mean price per region, rounded to whole euros, for sales and for rentals.
# Sorted from most to least expensive, like the province and district
# tables: the chart's "Rent"/"Sale" buttons reorder the x axis using each
# table's row order, which only has an effect when the tables are ranked.
price_sale_region = (
    sale_data.groupby("Region", as_index=False)[["Price"]]
    .mean()
    .round()
    .sort_values("Price", ascending=False)
)
price_rent_region = (
    rent_data.groupby("Region", as_index=False)[["Price"]]
    .mean()
    .round()
    .sort_values("Price", ascending=False)
)

In [None]:
# Grouped bar chart of the average price per region: sale prices use the
# left y axis, monthly rents use a second y axis drawn on the right.
colors = px.colors.qualitative.Vivid
sale_color, rent_color = colors[0], colors[1]


def _price_axis(color, title, **extra):
    """Return y-axis settings whose grid and labels share one colour."""
    return dict(gridcolor=color, color=color, title=title, **extra)


sale_bars = go.Bar(
    x=price_sale_region['Region'],
    y=price_sale_region['Price'],
    name='Sale',
    offsetgroup=1,
    marker=dict(color=sale_color),
)
rent_bars = go.Bar(
    x=price_rent_region['Region'],
    y=price_rent_region['Price'],
    name='Rent',
    yaxis='y2',
    offsetgroup=2,
    marker=dict(color=rent_color),
)

fig = go.Figure(data=[sale_bars, rent_bars])
fig.update_traces(hovertemplate='<b>%{y:,.0f} €</b>')

# Each button leaves the traces untouched and only swaps the x-axis category
# order for the row order of the matching table.
buttons = [
    dict(
        label=label,
        method="update",
        args=[{}, dict(xaxis=dict(categoryarray=table['Region']))],
    )
    for label, table in (("Rent", price_rent_region), ("Sale", price_sale_region))
]

fig.update_layout(
    title='Average price per region',
    width=800,
    updatemenus=[dict(type='buttons', buttons=buttons, bgcolor=colors[-1])],
    legend=dict(x=0.5, y=1.1, orientation='h', xanchor='center'),
    yaxis=_price_axis(sale_color, 'Sale price in €'),
    yaxis2=_price_axis(rent_color, 'Monthly rent in €', overlaying='y', side='right'),
    xaxis=dict(categoryorder='array'),
    bargap=0.1,
    bargroupgap=0.05,
    hovermode='x unified',
)
fig.show()

#### price/province

In [None]:
# Mean price per province (rounded to whole euros), most expensive first,
# computed separately for sales and for rentals.
price_sale_province = (
    sale_data.groupby("Province", as_index=False)[["Price"]]
    .mean()
    .round()
    .sort_values(by="Price", ascending=False)
)
price_rent_province = (
    rent_data.groupby("Province", as_index=False)[["Price"]]
    .mean()
    .round()
    .sort_values(by="Price", ascending=False)
)

In [None]:
# Grouped bar chart of the average price per province: sale prices use the
# left y axis, monthly rents use a second y axis drawn on the right.
colors = px.colors.qualitative.Vivid
sale_color, rent_color = colors[0], colors[1]


def _price_axis(color, title, **extra):
    """Return y-axis settings whose grid and labels share one colour."""
    return dict(gridcolor=color, color=color, title=title, **extra)


sale_bars = go.Bar(
    x=price_sale_province['Province'],
    y=price_sale_province['Price'],
    name='Sale',
    offsetgroup=1,
    marker=dict(color=sale_color),
)
rent_bars = go.Bar(
    x=price_rent_province['Province'],
    y=price_rent_province['Price'],
    name='Rent',
    yaxis='y2',
    offsetgroup=2,
    marker=dict(color=rent_color),
)

fig = go.Figure(data=[sale_bars, rent_bars])
fig.update_traces(hovertemplate='<b>%{y:,.0f} €</b>')

# Each menu entry leaves the traces untouched and only swaps the x-axis
# category order for the row order of the matching table.
buttons = [
    dict(
        label=label,
        method="update",
        args=[{}, dict(xaxis=dict(categoryarray=table['Province']))],
    )
    for label, table in (("Sale", price_sale_province), ("Rent", price_rent_province))
]

fig.update_layout(
    title='Average price per province',
    width=800,
    updatemenus=[dict(buttons=buttons)],
    legend=dict(x=0.5, y=1.1, orientation='h', xanchor='center'),
    yaxis=_price_axis(sale_color, 'Sale price in €'),
    yaxis2=_price_axis(rent_color, 'Monthly rent in €', overlaying='y', side='right'),
    xaxis=dict(categoryorder='array'),
    bargap=0.1,
    bargroupgap=0.05,
    hovermode='x unified',
)
fig.show()

#### Price/District

In [None]:
# Mean price per district (rounded to whole euros), most expensive first,
# computed separately for sales and for rentals.
price_sale_district = (
    sale_data.groupby("District", as_index=False)[["Price"]]
    .mean()
    .round()
    .sort_values(by="Price", ascending=False)
)
price_rent_district = (
    rent_data.groupby("District", as_index=False)[["Price"]]
    .mean()
    .round()
    .sort_values(by="Price", ascending=False)
)

In [None]:
# Grouped bar chart of the average price per district: sale prices use the
# left y axis, monthly rents use a second y axis drawn on the right.
colors = px.colors.qualitative.Vivid
sale_color, rent_color = colors[0], colors[1]


def _price_axis(color, title, **extra):
    """Return y-axis settings whose grid and labels share one colour."""
    return dict(gridcolor=color, color=color, title=title, **extra)


sale_bars = go.Bar(
    x=price_sale_district['District'],
    y=price_sale_district['Price'],
    name='Sale',
    offsetgroup=1,
    marker=dict(color=sale_color),
)
rent_bars = go.Bar(
    x=price_rent_district['District'],
    y=price_rent_district['Price'],
    name='Rent',
    yaxis='y2',
    offsetgroup=2,
    marker=dict(color=rent_color),
)

fig = go.Figure(data=[sale_bars, rent_bars])
fig.update_traces(hovertemplate='<b>%{y:,.0f} €</b>')

# Each menu entry leaves the traces untouched and only swaps the x-axis
# category order for the row order of the matching table.
buttons = [
    dict(
        label=label,
        method="update",
        args=[{}, dict(xaxis=dict(categoryarray=table['District']))],
    )
    for label, table in (("Sale", price_sale_district), ("Rent", price_rent_district))
]

fig.update_layout(
    title='Average price per district',
    width=800,
    updatemenus=[dict(buttons=buttons)],
    legend=dict(x=0.5, y=1.15, orientation='h', xanchor='center'),
    yaxis=_price_axis(sale_color, 'Sale price in €'),
    yaxis2=_price_axis(rent_color, 'Monthly rent in €', overlaying='y', side='right'),
    xaxis=dict(categoryorder='array'),
    bargap=0.1,
    bargroupgap=0.05,
    hovermode='x unified',
)
fig.show()

### Price per m2

#### Per Region

In [None]:
# Price per square metre by region = total price / total living area.
# groupby().sum() skips NaN column by column, so a listing with a price but
# no living area would add to the numerator only and inflate the ratio.
# Restrict both totals to listings that have a price and a positive area
# (NaN > 0 is False, so missing areas are excluded by the same test).
region_valid = sale_data[(sale_data["LivingArea"] > 0) & sale_data["Price"].notna()]
region_price = region_valid.groupby("Region")[["Price"]].sum()
region_livingarea = region_valid.groupby("Region")[["LivingArea"]].sum()
region_m2_price = pd.merge(region_price, region_livingarea, on="Region")
region_m2_price["€/m2"] = region_m2_price["Price"] / region_m2_price["LivingArea"]
# "Region" is the index after the grouped sums; turn it back into a column for plotting.
region_m2_price = region_m2_price.reset_index()
region_m2_price.plot.bar(x="Region", y="€/m2")

#### Per Province

In [None]:
# Price per square metre by province = total price / total living area.
# groupby().sum() skips NaN column by column, so a listing with a price but
# no living area would add to the numerator only and inflate the ratio.
# Restrict both totals to listings that have a price and a positive area
# (NaN > 0 is False, so missing areas are excluded by the same test).
province_valid = sale_data[(sale_data["LivingArea"] > 0) & sale_data["Price"].notna()]
province_price = province_valid.groupby("Province", as_index=False)[["Price"]].sum()
province_livingarea = province_valid.groupby("Province", as_index=False)[["LivingArea"]].sum()
province_m2_price = pd.merge(province_price, province_livingarea, on="Province")
province_m2_price["€/m2"] = province_m2_price["Price"] / province_m2_price["LivingArea"]
province_m2_price.plot.bar(x="Province", y="€/m2")

#### Per District

In [None]:
# Price per square metre by district = total price / total living area.
# groupby().sum() skips NaN column by column, so a listing with a price but
# no living area would add to the numerator only and inflate the ratio.
# Restrict both totals to listings that have a price and a positive area
# (NaN > 0 is False, so missing areas are excluded by the same test).
district_valid = sale_data[(sale_data["LivingArea"] > 0) & sale_data["Price"].notna()]
district_price = district_valid.groupby("District", as_index=False)[["Price"]].sum()
district_livingarea = district_valid.groupby("District", as_index=False)[["LivingArea"]].sum()
district_m2_price = pd.merge(district_price, district_livingarea, on="District")
district_m2_price["€/m2"] = district_m2_price["Price"] / district_m2_price["LivingArea"]
district_m2_price.plot.bar(x="District", y="€/m2")

In [None]:
# Show the first sale listing to inspect the available columns.
sale_data.head(n=1)

In [None]:
# How often each StateOfBuilding value occurs among the sale listings.
sale_data["StateOfBuilding"].value_counts()

In [None]:
# Map each StateOfBuilding label to an integer score from 1 to 6 so it can be
# averaged (presumably ordered from worst to best condition — confirm).
stateofbuilding_dict = {
    "StateOfBuilding": {
        "TO_BE_DONE_UP": 1,
        "TO_RESTORE": 2,
        "TO_RENOVATE": 3,
        "GOOD": 4,
        "JUST_RENOVATED": 5,
        "AS_NEW": 6,
    }
}
# Rebind instead of replacing in place: ``sale_data`` was obtained by
# filtering ``data``, and an in-place write on such a subset is exactly what
# pandas' chained-assignment warning is about.
sale_data = sale_data.replace(stateofbuilding_dict)

# Mean score per PEB class, without empty groups, best score first.
# The sorted frame has to be kept: sort_values() returns a new frame, so
# calling it without using the result leaves the plot unsorted.
peb_sob = (
    sale_data.groupby("PEB", as_index=False)[["StateOfBuilding"]]
    .mean()
    .dropna()
    .sort_values("StateOfBuilding", ascending=False)
)
peb_sob.plot.bar(x="PEB", y="StateOfBuilding")

In [None]:
import plotly.figure_factory as ff
import chart_studio.plotly as py
import plotly.tools as tls
import pandas as pd

In [None]:
# Plain Python lists of the per-district price totals and the district names.
values = district_price.Price.to_list()
district = district_price.District.tolist()

In [None]:
# ff.create_choropleth(
    
#     district = district,
#     values = values,
#     colorscale = "Viridis",
#     round_legend_values = True,
#     simplify_county = 0,
#     simplify_state = 0,
#     county_outline = {'color': 'rgb(15, 15, 55)', 'width': 0.5},
#     legend_title = 'Mean Price per District',
#     title = 'Belgium'
# )

In [None]:
# Price per square metre by district = total price / total living area
# (this cell recomputes the same tables as the earlier "Per District" cell).
# groupby().sum() skips NaN column by column, so a listing with a price but
# no living area would add to the numerator only and inflate the ratio.
# Restrict both totals to listings that have a price and a positive area
# (NaN > 0 is False, so missing areas are excluded by the same test).
district_valid = sale_data[(sale_data["LivingArea"] > 0) & sale_data["Price"].notna()]
district_price = district_valid.groupby("District", as_index=False)[["Price"]].sum()
district_livingarea = district_valid.groupby("District", as_index=False)[["LivingArea"]].sum()
district_m2_price = pd.merge(district_price, district_livingarea, on="District")
district_m2_price["€/m2"] = district_m2_price["Price"] / district_m2_price["LivingArea"]
district_m2_price.plot.bar(x="District", y="€/m2")

In [None]:
import pandas as pd
from dython.nominal import associations

In [None]:
from dython.nominal import identify_nominal_columns
# Ask dython which columns of the sale data it treats as nominal
# (categorical), then display the result.
categorical_features = identify_nominal_columns(sale_data)
categorical_features

In [None]:
# Every column except the first one.
# NOTE(review): the name implies the first column is the listing URL — confirm
# the column order, since the selection is positional.
columns_after_first = sale_data.columns[1:]
sale_data_wo_url = sale_data.loc[:, columns_after_first]

sale_data_wo_url

In [None]:
# assoc_results = associations(sale_data_wo_url, nominal_columns='auto', numerical_columns=None,
#                              mark_columns=False, nom_nom_assoc='cramer', num_num_assoc='pearson',
#                              ax=None, figsize=None, annot=True, fmt='.2f', cmap=None, sv_color='silver',
#                              cbar=True, vmax=1.0, vmin=None, plot=True, compute_only=False, clustering=False, title=None, filename=None)

# # Display the associations
# plt.figure(figsize=(30, 30))

# plt.show()


In [None]:
# complete_correlation= associations(sale_data_wo_url, filename= 'complete_correlation.png', figsize=(30,30))
# df_complete_corr=complete_correlation['corr']
# df_complete_corr.dropna(axis=1, how='all').dropna(axis=0, how='all').style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# How often each FloodingZone value occurs among the sale listings.
sale_data.FloodingZone.value_counts()


In [None]:
# Print the column dtypes and non-null counts of the sale listings.
sale_data.info()

In [None]:
# Working copy of the sale data without the columns left out of the
# correlation analysis.
excluded_columns = ['MonthlyCharges', 'Country', 'Url', 'TypeOfSale']
test = sale_data.drop(columns=excluded_columns)


In [None]:
# NOTE(review): this cell used to call ``test.dropna()`` without keeping the
# result. dropna() returns a new frame, so no rows were ever removed. The
# no-op call is dropped here, which leaves behaviour unchanged; if rows with
# missing values are meant to be excluded, use ``test = test.dropna()``
# instead — confirm the intent first, as that may discard most rows.
# Print the column dtypes and non-null counts of the working frame.
test.info()

In [None]:
# Preview the first five rows of the working frame.
test.head(n=5)


In [None]:
# Split the working frame by TypeOfProperty code.
# NOTE(review): the variable names imply 1 = house and 2 = apartment — confirm
# against the dataset's encoding.
is_house = test.TypeOfProperty == 1
is_appt = test.TypeOfProperty == 2
test_corr_house = test[is_house]
test_corr_appt = test[is_appt]



In [None]:
# TypeOfProperty is constant within each subset, so it is removed from both;
# the apartment subset additionally loses SurfaceOfPlot.
test_corr_house = test_corr_house.drop(columns=['TypeOfProperty'])
test_corr_appt = test_corr_appt.drop(columns=['TypeOfProperty', 'SurfaceOfPlot'])


In [None]:
# Print the column dtypes and non-null counts of the house subset.
test_corr_house.info()

In [None]:
# House
# House subset: run dython's ``associations`` on every column and pass an
# output file name for the figure (see the dython docs for the exact output).
complete_correlation = associations(
    test_corr_house,
    filename='complete_correlation_house.png',
    figsize=(30, 30),
)
df_complete_corr = complete_correlation['corr']

# Drop columns and then rows that are entirely NaN, and display the
# remaining matrix with one colour gradient across the whole table.
non_empty_corr = df_complete_corr.dropna(axis=1, how='all').dropna(axis=0, how='all')
non_empty_corr.style.background_gradient(cmap='coolwarm', axis=None)


In [None]:
# Appt
# Apartment subset: run dython's ``associations`` on every column and pass an
# output file name for the figure (see the dython docs for the exact output).
complete_correlation = associations(
    test_corr_appt,
    filename='complete_correlation_appt.png',
    figsize=(30, 30),
)
df_complete_corr = complete_correlation['corr']

# Drop columns and then rows that are entirely NaN, and display the
# remaining matrix with one colour gradient across the whole table.
non_empty_corr = df_complete_corr.dropna(axis=1, how='all').dropna(axis=0, how='all')
non_empty_corr.style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# How often each Kitchen value occurs among the sale listings.
sale_data.Kitchen.value_counts()

In [None]:
# Price per square metre by district for the house subset
# = total price / total living area.
# groupby().sum() skips NaN column by column, so a listing with a price but
# no living area would add to the numerator only and inflate the ratio.
# Restrict both totals to listings that have a price and a positive area
# (NaN > 0 is False, so missing areas are excluded by the same test).
house_valid = test_corr_house[
    (test_corr_house["LivingArea"] > 0) & test_corr_house["Price"].notna()
]
district_price_test = house_valid.groupby("District", as_index=False)[["Price"]].sum()
district_livingarea_test = house_valid.groupby("District", as_index=False)[["LivingArea"]].sum()
district_m2_price_test = pd.merge(district_price_test, district_livingarea_test, on="District")
district_m2_price_test["€/m2"] = district_m2_price_test["Price"] / district_m2_price_test["LivingArea"]
district_m2_price_test.plot.bar(x="District", y="€/m2")

In [None]:
# Preview the first ten districts of the house price-per-m² table.
district_m2_price_test.head(n=10)

In [None]:
import json

import pandas as pd
import plotly.express as px

# Parse the boundary file up front. px.choropleth documents ``geojson`` as a
# GeoJSON-formatted dict, and plotly treats a plain string as a URL to fetch
# in the browser, which a local relative path generally is not. Opening the
# file here also fails loudly if the file is missing.
with open("data_geo/belgium-with-regions_.geojson", encoding="utf-8") as geojson_file:
    belgium_geojson = json.load(geojson_file)

# Build the choropleth map: each row is matched to the GeoJSON feature whose
# ``properties.name`` equals the row's District, coloured by €/m2.
fig = px.choropleth(data_frame=district_m2_price_test,
                    geojson=belgium_geojson,
                    locations="District",
                    featureidkey="properties.name",
                    color="€/m2",
                    hover_name="District",
                    hover_data=["€/m2"],
                    title="Square meter price in Belgium disctrict (€/m²)",
                    color_continuous_scale="Viridis")

# Zoom to the plotted locations and hide the base map.
fig.update_geos(fitbounds="locations", visible=False)

# Display the map.
fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Build the choropleth map
# NOTE(review): ``geojson`` is set to the directory path "data_geo/" rather
# than a GeoJSON file or parsed GeoJSON data, so no boundaries can be loaded
# from it as written — confirm which file was intended.
fig = px.choropleth(data_frame=district_m2_price_test,
                    geojson="data_geo/",
                    locations="District",
                    featureidkey="properties.name",
                    color="€/m2",
                    hover_name="District",
                    hover_data=["€/m2"],
                    title="Prix au mètre carré par district en Belgique",
                    color_continuous_scale="Viridis")

# Update the map geometry: zoom to the plotted locations and hide the base map
fig.update_geos(fitbounds="locations", visible=False)

# Display the map
fig.show()
