In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from features.genereric_build_features import OneHotEncodeColumns
from features.generic_transformer import MyKnnImputer, MyMinMaxScaler, ColumnSelector, RowSelectorByCategory, \
    DataFrameStandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# --- Configuration ----------------------------------------------------------
# NOTE(review): this is a machine-specific absolute path. Point it at your own
# copy of the raw data (ideally via a project-relative data directory) before
# re-running the notebook on another machine.
RAW_DATA_CSV = '/home/gg/PycharmProjects/immo-prediction/data/raw/data.csv'
TARGET_COLUMN = 'Price'
SPLIT_SEED = 41        # fixed seed so the train/test split is reproducible
TEST_FRACTION = 0.2    # share of rows held out for testing

# --- Load raw data ----------------------------------------------------------
df = pd.read_csv(RAW_DATA_CSV, low_memory=False)

# Rows whose Subtype is APARTMENT get a Land Surface of 0, overwriting whatever
# the raw file holds for them (presumably because apartments have no land of
# their own -- confirm with the data owner).
df.loc[df['Subtype'] == 'APARTMENT', 'Land Surface'] = 0

# --- Split features / target and train / test --------------------------------
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SPLIT_SEED, test_size=TEST_FRACTION,
)

# Untransformed training features and target side by side in one frame.
train_data = pd.concat([X_train, y_train], axis=1)

# --- Feature pipeline ---------------------------------------------------------
keep_columns = [
    'Bathroom Count', 'Bedroom Count', 'Habitable Surface', 'Land Surface', 'Facades', 'Subtype',
]
sub_types_to_keep = [
    'VILLA', 'HOUSE', 'APARTMENT', 'MANSION', 'PENTHOUSE', 'TOWN_HOUSE', 'GROUND_FLOOR', 'FLAT_STUDIO', 'DUPLEX',
]

pipeline = Pipeline([
    ("Column keeper", ColumnSelector(keep_columns=keep_columns)),
    ("Keep categories", RowSelectorByCategory(column='Subtype', categories_to_keep=sub_types_to_keep)),
    ("One hot encode subtypes", OneHotEncodeColumns(['Subtype'])),
    ("Standardize data", DataFrameStandardScaler()),
])

# Fit on the training split only, then reuse the fitted steps on the test split.
# NOTE(review): if RowSelectorByCategory removes rows, y_train / y_test are not
# filtered to match here -- confirm X and y are still aligned before fitting a
# model on them.
# NOTE(review): the pipeline contains no imputation step, so missing values in
# the kept columns appear to pass through unchanged -- verify before modelling.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [3]:
# Share of missing values per column of the transformed training set.
# Despite the name, the values are fractions in [0, 1], not percentages.
missing_counts = X_train.isna().sum()
missing_percent = missing_counts.div(len(X_train))
missing_percent

Bathroom Count          0.149291
Bedroom Count           0.000000
Habitable Surface       0.095814
Land Surface            0.090568
Facades                 0.358817
Subtype_APARTMENT       0.000000
Subtype_DUPLEX          0.000000
Subtype_FLAT_STUDIO     0.000000
Subtype_GROUND_FLOOR    0.000000
Subtype_HOUSE           0.000000
Subtype_MANSION         0.000000
Subtype_PENTHOUSE       0.000000
Subtype_TOWN_HOUSE      0.000000
Subtype_VILLA           0.000000
dtype: float64