In [40]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.neural_network import MLPRegressor
import pandas as pd
from statsmodels.formula.api import ols
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from statsmodels.stats.api import anova_lm

In [18]:
df = pd.read_csv("../data/curated/outlier_removed_data.csv")

In [19]:
df = df[['postcode', 'property_type', 'num_beds', 'num_baths', 'num_parking', 'school_duration', 'school_distance', 'park_duration', 'park_distance', 'shop_duration', 'shop_distance', 'weekly_rent']]

In [42]:
t = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(), ['postcode', 'property_type', 'num_beds', 'num_baths', 'num_parking']),
    ('scale', StandardScaler(), ['school_duration', 'school_distance', 'park_duration', 'park_distance', 'shop_duration', 'shop_distance'])
], remainder='passthrough')  # Default is to drop untransformed columns

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:-1], df.iloc[:,-1], test_size=0.33, random_state=42)
X_train_transformed = t.fit_transform(X_train)
X_test_transformed = t.fit_transform(X_test)


In [87]:
nn_model = MLPRegressor(max_iter=1000, activation='logistic',
                        n_iter_no_change=15, random_state=4).fit(X_train_transformed, y_train)




In [89]:
print(nn_model.score(X_train_transformed, y_train))
mean_absolute_error(nn_model.predict(
    X_train_transformed), y_train)

0.8058969916642279


50.22528547829401

In [90]:
lm = ols(
    "weekly_rent ~  C(postcode)+ C(property_type)+ C(num_beds)+ C(num_baths)+ C(num_parking) + school_duration+ school_distance + park_duration+ park_distance+ shop_duration+ shop_distance", data=df).fit()
anova_lm(lm)


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(postcode),412.0,138478300.0,336112.3,21.686364,0.0
C(property_type),13.0,66660900.0,5127761.0,330.849271,0.0
C(num_beds),10.0,87539320.0,8753932.0,564.814154,0.0
C(num_baths),5.0,24964660.0,4992931.0,322.149875,0.0
C(num_parking),9.0,2335868.0,259540.9,16.745888,9.701631e-28
school_duration,1.0,90085.96,90085.96,5.812454,0.01592831
school_distance,1.0,14054.83,14054.83,0.906834,0.3409754
park_duration,1.0,120.3782,120.3782,0.007767,0.9297746
park_distance,1.0,27.42025,27.42025,0.001769,0.9664502
shop_duration,1.0,6.268732,6.268732,0.000404,0.9839549


In [91]:
mean_absolute_error(lm.predict(X_test).fillna(0), y_test)

140.0890323992437