In [42]:
import pandas as pd
import glob
import os
import numpy as np

path = r'../data/curated/merged_dataset/' # use your path
all_files = glob.glob(os.path.join(path , "*.csv"))

li = []

for filename in sorted(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

merged_df = pd.concat(li, axis=0, ignore_index=True)
merged_df.drop(['address', 'latitude', 'longitude', 'postcode', 'sa2_2016', 'gdp(USD Millioins)'], axis=1, inplace=True)
merged_df['income_per_person'] = np.log(merged_df['income_per_person'])
merged_df['crime_cases'] = np.log(merged_df['crime_cases'])
merged_df['weekly_rent'] = np.log(merged_df['weekly_rent'])
df = merged_df[merged_df.duplicated(subset=["sa2_2021", "nbed", "nbath", "ncar", "residence_type"], keep=False)]
df = merged_df.dropna()



In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

TARGET_COLS = ['weekly_rent']
COLS = list(df.columns)
df = df[COLS]
df = pd.get_dummies(df, columns=['residence_type', 'sa2_2021', 'nbed', 'nbath', 'ncar'])

train, test = train_test_split(df, train_size=0.8, random_state=0)

X_train, y_train = train.drop(TARGET_COLS, axis=1), train[TARGET_COLS]
X_test, y_test = test.drop(TARGET_COLS, axis=1), test[TARGET_COLS]

print(f'{len(X_train)} training instances, {len(X_test)} test instances')

137614 training instances, 34404 test instances


In [44]:
df

Unnamed: 0,year,min_distance_to_cbd,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,min_distance_to_hosp,min_distance_to_poli,min_distance_to_shop,weekly_rent,...,ncar_0,ncar_1,ncar_2,ncar_3,ncar_4,ncar_5,ncar_6,ncar_7,ncar_8,ncar_9
0,2013,227.97163,23.16035,7.35747,16.96507,35.56825,21.35025,22.04660,9.35209,5.703782,...,1,0,0,0,0,0,0,0,0,0
1,2013,223.66084,5.71742,6.50536,6.76794,7.54355,7.42972,6.28177,9.35209,5.370638,...,1,0,0,0,0,0,0,0,0,0
2,2013,243.25680,5.11222,0.20027,36.72106,50.85341,36.63541,0.08478,9.35209,5.164786,...,1,0,0,0,0,0,0,0,0,0
3,2013,140.35827,78.32509,10.66523,11.91899,11.26906,177.44731,84.47341,9.35209,5.857933,...,1,0,0,0,0,0,0,0,0,0
4,2013,13.86135,0.93250,1.32931,3.49174,2.20800,177.44731,84.47341,3.96501,5.616771,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,2022,293.28053,0.56012,1.21809,114.77016,90.08591,140.56888,74.35608,13.64920,5.579730,...,0,1,0,0,0,0,0,0,0,0
172031,2022,258.29111,3.49087,5.08707,3.60570,8.37185,2.60312,74.35608,13.64920,6.214608,...,0,0,1,0,0,0,0,0,0,0
172032,2022,9.47077,2.45011,1.33931,1.62322,3.63291,140.56888,74.35608,1.97636,6.620073,...,0,1,0,0,0,0,0,0,0,0
172033,2022,1.84933,0.65199,1.10438,1.27940,1.87840,140.56888,74.35608,13.64920,6.013715,...,0,1,0,0,0,0,0,0,0,0


In [45]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Normalization
# from keras.wrappers.scikit_learn import KerasRegressor # Deprecated
from sklearn.ensemble import AdaBoostRegressor
import scikeras
from scikeras.wrappers import KerasRegressor

In [46]:
def simple_model():
    """Base neural network model for AdaBoosting"""
    norm_layer = Normalization()
    norm_layer.adapt(X_train)
    model = keras.Sequential(
        [   
            norm_layer,                   # our normalisation layer recieves the input
            Dense(80, activation='relu'),  # the hidden layer gets the normalised result
            Dense(80, activation='relu'),  # (in case you want to try an extra hidden layer)
            Dense(1, activation='relu')
        ]
    )
    model.compile(
        optimizer='adam',  # Adam optimises using gradient descent, is generally fast and a good choice in many cases
        loss='MSE' # 'sparse_categorical_crossentropy'#'MSE'  # Mean Squared Error makes sense for this problem, 
                    # though we could use Mean Absolute Error, or many other choices.
                    # Classification outputs would use a different loss (eg. BinaryCrossentropy)
    )
    return model


# history = model.fit(
#     x=X_train,
#     y=y_train,
#     batch_size=16,
#     validation_split=0.25,
#     epochs=10
# )

In [47]:
ann_estimator = KerasRegressor(model=simple_model, epochs=10, batch_size=10, verbose=0)
boosted_ann = AdaBoostRegressor(base_estimator= ann_estimator)
boosted_ann.fit(X_train, y_train.values.ravel())# scale your training data 

In [None]:
# predictions = boosted_ann.predict(X_test.iloc[:])
# errors = np.array(predictions - y_test.iloc[:])
# squared_errors = errors**2
# mean_squared_error = squared_errors.mean()
# 
# print(f'MSE: {mean_squared_error}')
# tot_sum_squares = (np.array(y_test - y_test.mean())**2).sum()
# r2 = 1 - (squared_errors.sum() / tot_sum_squares)
# print(f'Model R^2: {r2:.4f}')

In [None]:
predictions = boosted_ann.predict(X_test.iloc[:])
errors = predictions - np.array(y_test.iloc[:])
squared_errors = errors**2
mean_squared_error = squared_errors.mean()

print(f'MSE: {mean_squared_error}')
tot_sum_squares = (np.array(y_test - y_test.mean())**2).sum()
r2 = 1 - (squared_errors.sum() / tot_sum_squares)
print(f'Model R^2: {r2:.4f}')

MSE: 3287598.945886227
Model R^2: -758781499232.5344
