In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

from features.generic_transformer import MyStandardScaler
from features.genereric_build_features import OneHotEncodeColumns, PostalCodePimp
from utils import save_model_as_pickle

df = pd.read_csv('/home/gg/PycharmProjects/immo-prediction/data/raw/data.csv', low_memory=False)
df.head()

sub_types_to_keep = [
    'VILLA', 'HOUSE', 'APARTMENT',
]
columns_to_keep = ['Bathroom Count', 'Bedroom Count', 'Habitable Surface', 'Land Surface', 'Price', 'Subtype',
                   'Latitude', 'Longitude', 'State of Building', 'EPC', 'Kitchen Type']

# Fix some data in the dataframe
df.loc[df['Subtype'] == 'APARTMENT', 'Land Surface'] = 0
df = df.dropna(subset=['Bathroom Count', 'Bedroom Count', 'Habitable Surface', 'Latitude', 'Longitude', 'Subtype'])
# df = df[:50000)
df = df.reset_index(drop=True)
df = df[df['Subtype'].isin(sub_types_to_keep)]
df = df[columns_to_keep]
df.reset_index(drop=True, inplace=True)
epc_map = {
    "A": 7,
    "B": 6,
    "C": 5,
    "D": 4,
    "E": 3,
    "F": 2,
    "G": 1,
}
# the count of each value in EPC column
df['EPC'] = df['EPC'].apply(lambda x: 'A' if 'A' in x else x)
df['EPC'] = df['EPC'].apply(lambda x: 'F' if 'F' in x else x)
df['EPC'] = df['EPC'].map(epc_map)
kitchen_map = {
    "INSTALLED": 1,
    "HYPER_EQUIPPED": 3,
    "SEMI_EQUIPPED": 2,
    "USA_HYPER_EQUIPPED": 3,
    "NOT_INSTALLED": 0,
    "USA_INSTALLED": 1,
    "USA_SEMI_EQUIPPED": 2,
    "USA_UNINSTALLED": 0,
}
df['Kitchen Type'] = df['Kitchen Type'].map(kitchen_map)
state_map = {
    "AS_NEW": 5,
    "JUST_RENOVATED": 4,
    "GOOD": 3,
    "TO_BE_DONE_UP": 2,
    "TO_RENOVATE": 1,
    "TO_RESTORE": 0,
}
df['State of Building'] = df['State of Building'].map(state_map)
df.reset_index(drop=True, inplace=True)

# display the percentage of missing values for each column
missing_percent = df.isnull().sum() / df.shape[0]
missing_percent

TypeError: argument of type 'float' is not iterable

In [5]:
train_data

Unnamed: 0,Bathroom Count,Bedroom Count,Habitable Surface,Land Surface,Facades,Subtype_APARTMENT,Subtype_DUPLEX,Subtype_FLAT_STUDIO,Subtype_GROUND_FLOOR,Subtype_HOUSE,Subtype_MANSION,Subtype_PENTHOUSE,Subtype_TOWN_HOUSE,Subtype_VILLA
78201,0.0,0.102564,0.016580,0.003297,0.065217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17226,0.0,0.076923,0.013940,0.005189,0.065217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
54896,0.0,0.051282,0.011194,0.000000,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50091,0.0,0.051282,0.006336,0.000000,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23465,0.0,0.076923,0.023973,0.003067,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56712,,0.076923,0.013729,0.000000,0.043478,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
59733,,0.076923,0.019537,0.003098,0.043478,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
68324,0.0,0.025641,0.003907,0.000000,0.021739,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1078,0.0,0.102564,0.015841,0.005823,0.065217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
missing_percent = train_data.isnull().sum() / X_train.shape[0]
missing_percent

Bathroom Count          0.123317
Bedroom Count           0.000000
Habitable Surface       0.000000
Land Surface            0.086662
Facades                 0.320885
Subtype_APARTMENT       0.000000
Subtype_DUPLEX          0.000000
Subtype_FLAT_STUDIO     0.000000
Subtype_GROUND_FLOOR    0.000000
Subtype_HOUSE           0.000000
Subtype_MANSION         0.000000
Subtype_PENTHOUSE       0.000000
Subtype_TOWN_HOUSE      0.000000
Subtype_VILLA           0.000000
dtype: float64