In [43]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

from features.generic_transformer import MyStandardScaler
from features.genereric_build_features import OneHotEncodeColumns, PostalCodePimp
from utils import save_model_as_pickle

df = pd.read_csv('/home/gg/PycharmProjects/immo-prediction/data/raw/data.csv', low_memory=False)
df.head()
print(df.shape)

sub_types_to_keep = [
    'VILLA', 'HOUSE', 'APARTMENT',
]
columns_to_keep = ['Bathroom Count', 'Bedroom Count', 'Habitable Surface', 'Land Surface', 'Price', 'Subtype',
                   'Latitude', 'Longitude', 'State of Building', 'EPC', 'Kitchen Type']

# Fix some data in the dataframe
df.loc[df['Subtype'] == 'APARTMENT', 'Land Surface'] = 0
df = df.dropna(subset=['Bathroom Count', 'Bedroom Count', 'Habitable Surface', 'Subtype', 'Latitude', 'Longitude',])
print(df.shape)
# df = df[:50000)
df = df.reset_index(drop=True)
df = df[df['Subtype'].isin(sub_types_to_keep)]
print(df.shape)
df = df[columns_to_keep]
df.reset_index(drop=True, inplace=True)
epc_map = {
    "A": 7,
    "B": 6,
    "C": 5,
    "D": 4,
    "E": 3,
    "F": 2,
    "G": 1,
}


def replace_value(x):
    for k, v in epc_map.items():
        if str(k) in str(x):
            return v
    return -1

# the count of each value in EPC column
df['EPC'] = df['EPC'].apply(replace_value)
kitchen_map = {
    "INSTALLED": 1,
    "HYPER_EQUIPPED": 3,
    "SEMI_EQUIPPED": 2,
    "USA_HYPER_EQUIPPED": 3,
    "NOT_INSTALLED": 0,
    "USA_INSTALLED": 1,
    "USA_SEMI_EQUIPPED": 2,
    "USA_UNINSTALLED": 0,
}
df['Kitchen Type'] = df['Kitchen Type'].map(lambda x: kitchen_map.get(x, -1))
state_map = {
    "AS_NEW": 5,
    "JUST_RENOVATED": 4,
    "GOOD": 3,
    "TO_BE_DONE_UP": 2,
    "TO_RENOVATE": 1,
    "TO_RESTORE": 0,
}
df['State of Building'] = df['State of Building'].map(lambda x: state_map.get(x, -1))
df.reset_index(drop=True, inplace=True)

X = df.drop(columns=['Price'])
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=41, test_size=0.2)

# training data re-add the price column
training_data = pd.concat([X_train, y_train], axis=1)
training_data.reset_index(drop=True, inplace=True)

# testing data re-add the price column with np.nan
testing_data = X_test.copy()
testing_data['Price'] = np.nan
testing_data.reset_index(drop=True, inplace=True)

ordinal_enc = OrdinalEncoder()
training_data['Subtype'] = ordinal_enc.fit_transform(training_data[['Subtype']]) * 100
testing_data['Subtype'] = ordinal_enc.transform(testing_data[['Subtype']]) * 100

min_max_scaler = MinMaxScaler()
training_data[['Longitude', 'Latitude', 'State of Building', 'EPC', 'Kitchen Type']] = min_max_scaler.fit_transform(
    training_data[['Longitude', 'Latitude', 'State of Building', 'EPC', 'Kitchen Type']])
testing_data[['Longitude', 'Latitude', 'State of Building', 'EPC', 'Kitchen Type']] = min_max_scaler.transform(
    testing_data[['Longitude', 'Latitude', 'State of Building', 'EPC', 'Kitchen Type']])

training_data['Building state price'] = training_data['Price'] / training_data['Habitable Surface']
testing_data['Building state price'] = np.nan

knn_bs_columns = ['Subtype', 'State of Building', 'EPC', 'Kitchen Type', 'Building state price']
knn_bs = KNNImputer(n_neighbors=50)
knn_bs.fit(training_data[knn_bs_columns])
result_bs = knn_bs.transform(testing_data[knn_bs_columns])
result_bs = pd.DataFrame(result_bs, columns=knn_bs_columns)
testing_data['Building state price'] = result_bs['Building state price']
columns_to_keep.append('Building state price')


training_data['Locality Typed Price'] = training_data['Price'] / training_data['Habitable Surface']
# in testing data create a new column with np.nan named 'Locality Typed Price'
testing_data['Locality Typed Price'] = np.nan
print(training_data)
print(testing_data)

# # use KNN to find the closest 5 neighbours
# knn_columns = ['Longitude', 'Latitude', 'Subtype', 'Locality Typed Price']
# knn = KNNImputer(n_neighbors=25)
# knn.fit(training_data[knn_columns])
# result = knn.transform(testing_data[knn_columns])
# # create a new dataframe with the result
# result = pd.DataFrame(result, columns=knn_columns)
# # concatenate the result with the testing data
# testing_data['Locality Typed Price'] = result['Locality Typed Price']
# print(testing_data)
# columns_to_keep.append('Locality Typed Price')



# training_data.drop(columns=['Subtype'], inplace=True)
# testing_data.drop(columns=['Subtype'], inplace=True)

# one hot encode the subtype
# OneHotEncodeColumns = OneHotEncodeColumns(['Subtype'])
# training_data = OneHotEncodeColumns.fit_transform(training_data)
# testing_data = OneHotEncodeColumns.transform(testing_data)


# standard_scaler = MyStandardScaler(columns_to_scale=['Habitable Surface', 'Land Surface', 'Locality Typed Price', 'Building state price'])
# 
# # training data
# training_columns = training_data.columns
# X_training = standard_scaler.fit_transform(training_data)
# training_data = pd.DataFrame(X_training, columns=training_columns)
# 
# # testing data
# testing_columns = testing_data.columns
# testing_data = standard_scaler.transform(testing_data)
# testing_data = pd.DataFrame(testing_data, columns=testing_columns) # error

# get the X_train and y_train
X_train = training_data.drop(columns=['Price'])
y_train = training_data['Price']

# get the X_test and y_test
X_test = testing_data.drop(columns=['Price'])

reg_model = LinearRegression()

reg_model.fit(X_train, y_train)

print(reg_model.score(X_test, y_test))

# 5. Predict the target values for the testing data
y_pred = reg_model.predict(X_test)

# 6. Calculate the MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate the R-squared value
r_squared = r2_score(y_test, y_pred)
print(f'R-squared value: {r_squared:.2%}')


(88859, 45)
(55251, 45)
(45627, 45)
       Bathroom Count  Bedroom Count  Habitable Surface  Land Surface  \
0                 1.0            4.0              180.0         360.0   
1                 1.0            4.0              201.0         290.0   
2                 1.0            5.0              196.0         357.0   
3                 1.0            3.0              103.0           0.0   
4                 1.0            2.0               71.0           0.0   
...               ...            ...                ...           ...   
36496             1.0            4.0              152.0          80.0   
36497             1.0            3.0              172.0         382.0   
36498             1.0            2.0               95.0           0.0   
36499             1.0            3.0              308.0           0.0   
36500             1.0            2.0              104.0           0.0   

       Subtype  Latitude  Longitude  State of Building    EPC  Kitchen Type  \
0       

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [42]:
from matplotlib import pyplot as plt
import seaborn as sns
# print correlation between columns_to_keep for the training data


#sns.heatmap(df[columns_to_keep].corr(), annot=True)
#plt.show()

Bathroom Count       0
Bedroom Count        0
Habitable Surface    0
Land Surface         0
Price                0
Subtype              0
Latitude             0
Longitude            0
State of Building    0
EPC                  0
Kitchen Type         0
dtype: int64