In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from transformers import LogTransformer, SquareRootTransformer, ZScoreTrimmer, UpperBoundTrimmer, LowerBoundTrimmer, FillNaColumnTransformer, KNNColumnImputer, OneHotColumnEncoder, OrdinalColumnEncoder, ColumnDropper, StandardColumnScaler, ClusterSimilarity


In [None]:
# Load the raw dataset from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer="data/belgian_property_prices.csv")

In [None]:
# Columns handed to the pipeline's ColumnDropper step.
features_to_drop = [
    "external_reference",
    "latest_land_use_designation",
    "website",
    "reference_number_of_the_epc_report",
    "street",
    "housenumber",
    "city",
    "postal",
    "state",
]

In [None]:
# Numerical columns; this list is later passed to the KNN imputer and the
# standard scaler, and drives the histogram grid and the correlation heatmap.
# Order matters for the plot layout, so keep it stable.
numerical_features = [
    "bathrooms", "bedroom_1_surface", "bedroom_2_surface", "bedroom_3_surface",
    "bedrooms", "co2_emission", "cadastral_income", "construction_year",
    "covered_parking_spaces", "garden_surface", "kitchen_surface", "living_area",
    "living_room_surface", "outdoor_parking_spaces", "primary_energy_consumption",
    "street_frontage_width", "surface_of_the_plot", "toilets",
    "width_of_the_lot_on_the_street", "yearly_theoretical_total_energy_consumption",
    "lat", "lng",
]

In [None]:
# Categorical columns; the whole list is passed as `columns=` to the second
# FillNaColumnTransformer step of the pipeline (fill_value=0).
# Each column must appear exactly once: "planning_permission_obtained" used to
# be listed twice, which hands a duplicated column label to the transformer.
categorical_features = [
    "as_built_plan",
    "basement",
    "connection_to_sewer_network",
    "dining_room",
    "double_glazing",
    "flood_zone_type",
    "furnished",
    "gas_water__electricity",
    "office",
    "planning_permission_obtained",
    "possible_priority_purchase_right",
    "proceedings_for_breach_of_planning_regulations",
    "subdivision_permit",
    "surroundings_type",
    "tv_cable",
    "tenement_building",
    "available_as_of",
    "building_condition",
    "number_of_frontages",
    "energy_class",
    "heating_type",
]

In [None]:
# Bin the price into five bands (labels 0-4) purely so the train/test split
# can be stratified on it; the helper column is stored on `df`.
df["price_cat"] = pd.cut(
    df["price"],
    bins=[0, 250000, 500000, 750000, 1000000, np.inf],
    labels=list(range(5)),
)
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["price_cat"],
)

# The band column was only needed for stratification: strip it from both
# subsets (in place, so `train_set` / `test_set` keep referring to the same objects).
for set_ in (train_set, test_set):
    set_.drop(columns="price_cat", inplace=True)

In [None]:
# Preprocessing pipeline. Steps run in the order listed; all transformer
# classes are imported from the project-local `transformers` module.
# NOTE(review): the trimming bounds below are presumably expressed on the
# already log / square-root transformed scale for columns that appear in the
# earlier transform steps — confirm against the transformer implementations.
pipeline = Pipeline(steps=[
    ("column_dropping", ColumnDropper(columns=features_to_drop)),
    (
        "log_transforms",
        LogTransformer(columns=[
            "co2_emission",
            "cadastral_income",
            "garden_surface",
            "kitchen_surface",
            "living_area",
            "living_room_surface",
            "street_frontage_width",
            "surface_of_the_plot",
            "width_of_the_lot_on_the_street",
            "price",
        ]),
    ),
    (
        "square_root_transforms",
        SquareRootTransformer(columns=[
            "bathrooms",
            "bedroom_1_surface",
            "primary_energy_consumption",
            "yearly_theoretical_total_energy_consumption",
        ]),
    ),
    (
        "zscore_trimming",
        ZScoreTrimmer(columns=[
            "living_room_surface",
            "primary_energy_consumption",
            "yearly_theoretical_total_energy_consumption",
        ]),
    ),
    (
        "upperbound_trimming",
        UpperBoundTrimmer(column_boundaries={
            "bathrooms": 3,
            "bedroom_1_surface": 8,
            "bedroom_2_surface": 40,
            "bedroom_3_surface": 40,
            "bedrooms": 10,
            "co2_emission": 10,
            "covered_parking_spaces": 10,
            "garden_surface": 11,
            "kitchen_surface": 4,
            "outdoor_parking_spaces": 20,
            "street_frontage_width": 4,
            "toilets": 10,
            "width_of_the_lot_on_the_street": 5,
            "number_of_frontages": 4,
        }),
    ),
    (
        "lowerbound_trimming",
        LowerBoundTrimmer(column_boundaries={
            "co2_emission": 0,
            "cadastral_income": 4,
            "garden_surface": 0,
            "kitchen_surface": 1,
            "living_area": 3,
            "street_frontage_width": 0,
            "surface_of_the_plot": 0,
            "width_of_the_lot_on_the_street": 0,
            "number_of_frontages": 1,
            "price": 11,
        }),
    ),
    # Text-valued categoricals get an explicit 'Missing' level first ...
    # NOTE(review): "kitchen_type" is filled and one-hot encoded here but is
    # not part of `categorical_features` — confirm that is intended.
    (
        'filling na',
        FillNaColumnTransformer(fill_value='Missing', columns=[
            "surroundings_type",
            "available_as_of",
            "building_condition",
            "kitchen_type",
            "energy_class",
            "heating_type",
        ]),
    ),
    # ... then whatever is still missing in any categorical column becomes 0.
    ('filling na2', FillNaColumnTransformer(fill_value=0, columns=categorical_features)),
    (
        "one_hot_encoding",
        OneHotColumnEncoder(columns=[
            "surroundings_type",
            "available_as_of",
            "building_condition",
            "number_of_frontages",
            "kitchen_type",
            "heating_type",
        ]),
    ),
    ("ordinal_encoding", OrdinalColumnEncoder(columns=["energy_class"])),
    ("KNN_imputing", KNNColumnImputer(n_neighbors=5, columns=numerical_features)),
    ("standard_scaling", StandardColumnScaler(columns=numerical_features)),
])

In [None]:

# Fit every pipeline step on the training split and transform it in one pass.
prepared_train_set = pipeline.fit_transform(train_set)
# NOTE(review): `data` is a second name bound to the very same object (not a
# copy); it looks like a leftover — drop it once nothing else relies on it.
data = prepared_train_set

In [None]:
# Number of rows in the prepared training frame.
prepared_train_set.shape[0]

In [None]:
# Summary statistics of the prepared training frame (displayed as the cell output).
prepared_train_set.describe()

In [None]:
# Column dtypes and non-null counts of the prepared training frame.
prepared_train_set.info()

In [None]:
# Histogram grid: one panel per numerical feature, six panels per row.
n_cols = 6
subplot_size = (int(np.ceil(len(numerical_features) / n_cols)), n_cols)
fig, axs = plt.subplots(subplot_size[0], subplot_size[1], figsize=(30, 13), gridspec_kw={'hspace': 0.5})

# zip stops at the shorter sequence, so surplus axes are simply not drawn on.
for ax, feature in zip(axs.flat, numerical_features):
    ax.hist(prepared_train_set[feature].values, bins=50)
    # Pass the name itself: a one-element list would be rendered as "['name']".
    ax.set_title(feature)

# Hide the panels left over when the feature count is not a multiple of n_cols.
for ax in axs.flat[len(numerical_features):]:
    ax.set_visible(False)

In [None]:
import seaborn as sns
%matplotlib inline
correlation_features = numerical_features + ["price"]
print(correlation_features)
corr = prepared_train_set[correlation_features].corr()
sns.heatmap(corr)

In [None]:
# Replace the index with a fresh 0..n-1 range (old index discarded), in place,
# then re-inspect the frame's columns and non-null counts.
prepared_train_set.reset_index(drop=True, inplace=True)
prepared_train_set.info()