In [1]:
import pandas as pd
import numpy as np
from Data_Processing import DataProcessing
from sklearn.model_selection import train_test_split
import joblib

import tensorflow as tf
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.metrics import mean_squared_error

In [2]:
pop = pd.read_csv('../Data/population.csv')
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')

In [3]:
X, y, test = DataProcessing(train, test, pop)

In [4]:
X.shape

(478741, 78)

## Neural Network

In [48]:
mc = ModelCheckpoint(f'../Models/Neural_Network.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=7,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True)

model = keras.Sequential([
    #keras.layers.InputLayer(78),
    keras.layers.Dense(78, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(50, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(25, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    #keras.layers.Dense(10, activation='selu', kernel_initializer='lecun_normal'),
    #keras.layers.BatchNormalization(),
    keras.layers.Dense(3, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1, activation='selu', kernel_initializer='lecun_normal')
])

In [67]:
model = keras.Sequential([
    keras.layers.Dense(78, activation='relu'),
    keras.layers.Dense(50, activation='relu'),
    #keras.layers.Dense(25, activation='relu'),
    #keras.layers.Dense(5, activation='relu'),
    keras.layers.Dense(1, activation='relu')
])

In [68]:
optimizer = keras.optimizers.Adam(learning_rate=0.0001)

model.compile(optimizer=optimizer,
             loss='mean_squared_error',
             metrics=[keras.metrics.RootMeanSquaredError()])

In [69]:
history = model.fit(
    X,
    y,
    batch_size=2000,
    epochs=5000,
    validation_split=0.2,
    callbacks=[mc, early_stopping],
    shuffle=True,
    use_multiprocessing=True
    )

Epoch 1/5000
Epoch 1: val_loss did not improve from 0.65557
Epoch 2/5000
Epoch 2: val_loss did not improve from 0.65557
Epoch 3/5000
Epoch 3: val_loss did not improve from 0.65557
Epoch 4/5000
Epoch 4: val_loss did not improve from 0.65557
Epoch 5/5000
Epoch 5: val_loss did not improve from 0.65557
Epoch 6/5000
Epoch 6: val_loss did not improve from 0.65557
Epoch 7/5000
Epoch 7: val_loss did not improve from 0.65557
Epoch 8/5000
Epoch 8: val_loss did not improve from 0.65557
Epoch 9/5000
Epoch 9: val_loss did not improve from 0.65557
Epoch 10/5000
Epoch 10: val_loss did not improve from 0.65557
Epoch 11/5000
Epoch 11: val_loss did not improve from 0.65557
Epoch 12/5000
Epoch 12: val_loss did not improve from 0.65557
Epoch 13/5000
Epoch 13: val_loss did not improve from 0.65557
Epoch 14/5000
Epoch 14: val_loss did not improve from 0.65557
Epoch 15/5000
Epoch 15: val_loss did not improve from 0.65557
Epoch 16/5000
Epoch 16: val_loss did not improve from 0.65557
Epoch 17/5000
Epoch 17: va

# Validation

In [70]:
model = load_model('../Models/Neural_Network.h5')

y_true = y
y_pred = model.predict(X)

from sklearn.metrics import r2_score

r2_score(y_true, y_pred)

0.012644682649172734

In [71]:
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')

In [72]:
train_zips = train_df['postcode']

In [73]:
df = pd.DataFrame({'postcode': train_zips, 'y_true': y_true.flatten(), 'y_pred': y_pred.flatten()}, columns=['postcode', 'y_true', 'y_pred'])
df = df.groupby('postcode').mean()

In [74]:
df

Unnamed: 0_level_0,y_true,y_pred
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
AB10 1AU,1.500000,1.415062
AB10 1BD,1.200000,1.425135
AB10 1BW,2.000000,1.410933
AB10 1HT,1.000000,1.497033
AB10 1NJ,1.428571,1.445176
...,...,...
ZE2 9QJ,1.500000,1.445190
ZE2 9QS,1.600000,1.484921
ZE2 9RE,1.500000,1.359639
ZE2 9SB,1.400000,1.451238


# Submission Set

In [75]:
y_pred = model.predict(test)

test_zips = test_df['postcode']
test_df = pd.DataFrame({'postcode': test_zips, 'Accident_risk_index': y_pred.flatten()}, columns=['postcode', 'Accident_risk_index'])

submission = test_df.groupby('postcode').mean().reset_index()

submission.to_csv('../Submissions/First_Try.csv', index=False)
submission

Unnamed: 0,postcode,Accident_risk_index
0,AB10 1AU,1.395012
1,AB10 1PG,1.365194
2,AB10 1TT,1.546399
3,AB10 1YP,1.417821
4,AB10 6LQ,1.474229
...,...,...
49767,ZE2 9LZ,1.477347
49768,ZE2 9RE,1.386834
49769,ZE2 9RJ,1.355922
49770,ZE2 9SB,1.487988
