# Neural Network Modelling
- considering log model
- use R-squared to measure the model performance (for continous response variable)

In [61]:
import pandas as pd
import glob
import os
import numpy as np

path = r'../data/curated/merged_dataset/' # use your path
all_files = glob.glob(os.path.join(path , "*.csv"))

li = []

for filename in sorted(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

merged_df = pd.concat(li, axis=0, ignore_index=True)
# log candidates: income_per_person, population_density, crime_cases, weekly_rent
merged_df['income_per_person'] = np.log(merged_df['income_per_person'])
merged_df['crime_cases'] = np.log(merged_df['crime_cases'])
merged_df['weekly_rent'] = np.log(merged_df['weekly_rent'])
merged_df.drop(['address', 'latitude', 'longitude', 'postcode', 'sa2_2016'], axis=1, inplace=True)


In [62]:
# Example of solo SA2 instance
df = merged_df.dropna()
df.loc[df['sa2_2021']==206061516]

Unnamed: 0,year,sa2_2021,residence_type,nbed,nbath,ncar,min_distance_to_cbd,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,min_distance_to_hosp,min_distance_to_poli,min_distance_to_shop,weekly_rent,gdp(USD Millioins),saving_rate(% of GDP),income_per_person,population_density,crime_cases
132662,2022,206061516,Apartment,3.0,1.0,1,6.72465,0.96823,1.43268,114.77016,0.60096,140.56888,74.35608,13.6492,6.39693,3305754,12.839,11.511549,7698.0,6.660575


In [63]:
#df.iloc[132647:]
# drop the SA2 only has one instance
# lost 3 instances
df = df[df.duplicated(subset=["sa2_2021", "nbed", "nbath", "ncar", "residence_type"], keep=False)]
df = df.drop(columns=['gdp(USD Millioins)'])
df = df.dropna()
df

Unnamed: 0,year,sa2_2021,residence_type,nbed,nbath,ncar,min_distance_to_cbd,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,min_distance_to_hosp,min_distance_to_poli,min_distance_to_shop,weekly_rent,saving_rate(% of GDP),income_per_person,population_density,crime_cases
0,2013,204011057,House,2.0,1.0,0,227.97163,23.16035,7.35747,16.96507,35.56825,21.35025,22.04660,9.35209,5.703782,6.861393,10.588692,2.172408,4.454347
1,2013,205051101,House,2.0,1.0,0,223.66084,5.71742,6.50536,6.76794,7.54355,7.42972,6.28177,9.35209,5.370638,6.861393,10.762630,5.425503,3.583519
2,2013,204011057,House,2.0,1.0,0,243.25680,5.11222,0.20027,36.72106,50.85341,36.63541,0.08478,9.35209,5.164786,6.861393,10.588692,2.172408,4.454347
3,2013,202011022,House,4.0,2.0,0,140.35827,78.32509,10.66523,11.91899,11.26906,177.44731,84.47341,9.35209,5.857933,6.861393,10.681809,473.765281,7.160846
4,2013,208041195,Apartment,1.0,1.0,0,13.86135,0.93250,1.32931,3.49174,2.20800,177.44731,84.47341,3.96501,5.616771,6.861393,11.363304,2834.210526,7.561642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172030,2022,205021086,House,3.0,1.0,1,293.28053,0.56012,1.21809,114.77016,90.08591,140.56888,74.35608,13.64920,5.579730,12.839000,10.903481,402.000000,5.638355
172031,2022,217041479,House,3.0,2.0,2,258.29111,3.49087,5.08707,3.60570,8.37185,2.60312,74.35608,13.64920,6.214608,12.839000,11.015813,689.000000,8.022569
172032,2022,208021177,House,2.0,2.0,1,9.47077,2.45011,1.33931,1.62322,3.63291,140.56888,74.35608,1.97636,6.620073,12.839000,11.500412,3656.000000,6.632002
172033,2022,206041506,Apartment,1.0,1.0,1,1.84933,0.65199,1.10438,1.27940,1.87840,140.56888,74.35608,13.64920,6.013715,12.839000,11.174728,5791.000000,7.488853


In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Split traning and test instances

In [65]:
TARGET_COLS = ['weekly_rent']
COLS = list(df.columns)
df = df[COLS]
df = pd.get_dummies(df, columns=['residence_type', 'sa2_2021', 'nbed', 'nbath', 'ncar'])

train, test = train_test_split(df, train_size=0.8, random_state=0)

X_train, y_train = train.drop(TARGET_COLS, axis=1), train[TARGET_COLS]
X_test, y_test = test.drop(TARGET_COLS, axis=1), test[TARGET_COLS]

print(f'{len(X_train)} training instances, {len(X_test)} test instances')

135561 training instances, 33891 test instances


In [66]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.layers import Dense, Normalization

In [67]:
# Setup a normalization layer and adapt it to the training set so that it knows
# what mean and sd to use when normalising
norm_layer = Normalization()
norm_layer.adapt(X_train)

In [77]:
model = keras.Sequential(
    [   
        norm_layer,                   # our normalisation layer recieves the input
        Dense(600, activation='relu'),  # the hidden layer gets the normalised result
        Dense(1000, activation='relu'),  # (in case you want to try an extra hidden layer)
        Dense(1000, activation='relu'),  # (in case you want to try an extra hidden layer)
        Dense(1000, activation='relu'),  # (in case you want to try an extra hidden layer)
        Dense(1000, activation='relu'),  # (in case you want to try an extra hidden layer)
        Dense(600, activation='relu'),  # (in case you want to try an extra hidden layer)
        Dense(600, activation='relu'),  # (in case you want to try an extra hidden layer)
        Dense(300, activation='relu'),  # (in case you want to try an extra hidden layer)
        Dense(100, activation='relu'),   # (in case you want to try an extra hidden layer)
        Dense(10, activation='relu'),   # (in case you want to try an extra hidden layer)
        Dense(1, activation='relu')
    ]
)

In [69]:
model.compile(
    optimizer='adam',  # Adam optimises using gradient descent, is generally fast and a good choice in many cases
    loss='MSE' # 'sparse_categorical_crossentropy'#'MSE'  # Mean Squared Error makes sense for this problem, 
                # though we could use Mean Absolute Error, or many other choices.
                # Classification outputs would use a different loss (eg. BinaryCrossentropy)
)

In [70]:
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    validation_split=0.25,
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [71]:
comparison = y_test.iloc[:100].copy()
comparison.loc[:, 'prediction'] = model.predict(X_test.head(100))
comparison



Unnamed: 0,weekly_rent,prediction
42254,5.736572,5.757770
82902,5.473670,5.507173
112153,6.152733,5.984366
70112,5.978886,5.905777
71786,6.040255,5.896652
...,...,...
151557,6.214608,6.212883
167959,6.052089,6.011229
53247,5.991465,5.854909
91680,6.109248,6.045076


In [72]:
model.evaluate(
    x=X_test.iloc[:100],
    y=y_test.iloc[:100],
    batch_size=16,
)



0.07896533608436584

In [73]:

predictions = model.predict(X_test.iloc[:])
errors = np.array(predictions - y_test.iloc[:])
squared_errors = errors**2
mean_squared_error = squared_errors.mean()

print(f'MSE: {mean_squared_error}')

MSE: 0.04656407931625362


In [74]:
tot_sum_squares = (np.array(y_test - y_test.mean())**2).sum()
r2 = 1 - (squared_errors.sum() / tot_sum_squares)
print(f'Model R^2: {r2:.4f}')

Model R^2: 0.6752


In [75]:
from sklearn.metrics import r2_score
r2_score(y_test, predictions)

0.6752064704217302

In [76]:
import numpy as np
np.exp(5.767483) - np.exp(5.736572)
np.exp(0.04783333211954551) # => +-4%

1.0489958068463299