## Purpose

The purpose of this NN model builder is to take what was learned through visualizations 
and correlations of the features and targets, and trim the unnecessary components to the NN models

## Revision

This is a notebook revision of [Neural_Network_Iterated.ipynb](./Neural_Network_Iterated.ipynb)

## Reading the Data

This portion remains unchanged to previous scripts. Skip forward to the [NN Model Building](#neural-network-model-building) for the first changes.

In [1]:
# Import our dependencies.
from sqlalchemy import create_engine
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

model_name = "mse_loss_model1"


In [2]:
# Pass in password once if there is no 'credentials.env' file in the PostgreSQL folder. 
# If there is a credentials.env file, it will replace the PG_PASS variable with the variable there
# If you use this variable, please remember to replace your password with '<pw>' again before saving and pushing your commit.
PG_PASS = '<pw>'

# Use environment variable for password. Only triggers if 'credentials.env' exists in the PostgreSQL folder.
if os.path.exists("../../PosgreSQL/credentials.env"):
    try:
        from dotenv import load_dotenv
        load_dotenv("../../PosgreSQL/credentials.env")

        # Check that the environment variable exists
        if os.getenv("PG_PASS") != None:
            PG_PASS = os.getenv("PG_PASS")
        else:
            print("Error retrieving PG_PASS variable from credentials.env")
            print("Check that it exists, or check the spelling of the os.getenv('PG_PASS)")
            print("PG_PASS = the default value entered at the top of this cell.")

    except ImportError:
        print("python-dotenv not installed. Consider installing using 'pip install python-dotenv' to use environment variables for passwords.")
    
    except Exception as err:
        print(f"ERROR: {err}")

postgresInfo = f'postgresql://postgres:{PG_PASS}@localhost/Weather_Crime'

In [3]:
# Connect to PostgreSQL
conn_string = postgresInfo
db = create_engine(conn_string)
conn = db.connect()

# Create Dataframe out of 'joined_data' table in PostgreSQL
joined_df = pd.read_sql_query('SELECT * FROM joined_data;', db)
# Closing the connection
conn.close()

joined_df

Unnamed: 0,date,max_temperature,min_temperature,max_relative_humidity,avg_relative_humidity,avg_pressure_sea,max_wind_speed,precipitation,rain,snow,...,occ_year,occ_month,occ_day,occ_dow,occ_hour,premises_type,hood_140,neighbourhood_140,long_wgs84,lat_wgs84
0,2015-01-01,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,...,2015,January,1,Thursday,23,House,115,Mount Dennis (115),-79.504668,43.693238
1,2015-01-01,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,...,2015,January,1,Thursday,3,Commercial,77,Waterfront Communities-The Island (77),-79.392855,43.647315
2,2015-01-01,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,...,2015,January,1,Thursday,2,Commercial,77,Waterfront Communities-The Island (77),-79.387700,43.649776
3,2015-01-01,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,...,2015,January,1,Thursday,0,Commercial,1,West Humber-Clairville (1),-79.595562,43.686751
4,2015-01-01,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,...,2015,January,1,Thursday,0,Outside,77,Waterfront Communities-The Island (77),-79.400096,43.645835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166507,2018-10-12,10.9,5.5,83,64.5,100.97,37,0.8,0.8,0.0,...,2018,October,12,Friday,0,,75,Church-Yonge Corridor (75),-79.376699,43.656958
166508,2018-10-18,8.4,-1.4,85,66.0,102.72,29,0.0,0.0,0.0,...,2018,October,18,Thursday,0,,120,Clairlea-Birchmount (120),-79.286292,43.699070
166509,2018-10-20,11.9,2.6,83,64.0,100.60,41,0.2,0.2,0.0,...,2018,October,20,Saturday,0,,124,Kennedy Park (124),-79.264551,43.732765
166510,2018-11-16,2.2,0.1,95,89.0,100.97,37,5.2,0.0,4.2,...,2018,November,16,Friday,0,,73,Moss Park (73),-79.369005,43.654614


# Data Preprocessing

In [4]:
# Print column names to see which columns to use
joined_df.columns

Index(['date', 'max_temperature', 'min_temperature', 'max_relative_humidity',
       'avg_relative_humidity', 'avg_pressure_sea', 'max_wind_speed',
       'precipitation', 'rain', 'snow', 'snow_on_ground', 'daylight',
       'avg_cloud_cover_8', 'event_unique_id', 'crime', 'occ_year',
       'occ_month', 'occ_day', 'occ_dow', 'occ_hour', 'premises_type',
       'hood_140', 'neighbourhood_140', 'long_wgs84', 'lat_wgs84'],
      dtype='object')

In [5]:
joined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166512 entries, 0 to 166511
Data columns (total 25 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   date                   166512 non-null  object 
 1   max_temperature        166512 non-null  float64
 2   min_temperature        166512 non-null  float64
 3   max_relative_humidity  166512 non-null  int64  
 4   avg_relative_humidity  166512 non-null  float64
 5   avg_pressure_sea       166512 non-null  float64
 6   max_wind_speed         166512 non-null  int64  
 7   precipitation          166512 non-null  float64
 8   rain                   166512 non-null  float64
 9   snow                   166512 non-null  float64
 10  snow_on_ground         166512 non-null  float64
 11  daylight               166512 non-null  float64
 12  avg_cloud_cover_8      166512 non-null  float64
 13  event_unique_id        166512 non-null  object 
 14  crime                  166512 non-nu

In [6]:
# Create new dataframe by counting occurences of certain crimes and grouping by relevant date/weather features which have only 1 unique set of entries per date
daily_crime = joined_df.groupby(by=['date',"occ_dow", 'occ_month', 'max_temperature', 'min_temperature', 'max_relative_humidity',
       'avg_relative_humidity', 'avg_pressure_sea', 'max_wind_speed',
       'precipitation', 'rain', 'snow', 'snow_on_ground', 'daylight',
       'avg_cloud_cover_8'])["crime"].value_counts().unstack(fill_value=0).copy()
daily_crime

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,crime,assault,auto_theft,bicycle_theft,break_and_enter,homicide,robbery,shooting,theft_from_motor_vehicle,theft_over
date,occ_dow,occ_month,max_temperature,min_temperature,max_relative_humidity,avg_relative_humidity,avg_pressure_sea,max_wind_speed,precipitation,rain,snow,snow_on_ground,daylight,avg_cloud_cover_8,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2015-01-01,Thursday,January,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,0.0,9.00,4.5,158,7,5,21,0,8,0,34,5
2015-01-02,Friday,January,-1.0,-6.1,83,66.0,102.30,36,0.6,0.0,0.4,0.0,9.00,6.0,34,9,4,18,0,6,0,34,2
2015-01-03,Saturday,January,1.5,-5.8,97,82.5,101.96,29,10.8,8.0,2.4,0.0,9.00,5.0,37,10,0,4,0,8,0,21,3
2015-01-04,Sunday,January,4.3,-5.9,98,82.5,100.69,43,6.8,6.4,0.4,0.0,9.02,6.0,44,5,1,11,0,5,0,33,1
2015-01-05,Monday,January,-5.8,-14.4,83,69.5,102.38,45,0.8,0.0,0.8,1.0,9.05,3.5,39,6,1,24,0,5,1,31,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-27,Thursday,December,2.8,-7.4,87,74.0,102.52,30,5.4,5.4,0.0,0.0,8.93,4.0,38,7,2,21,0,7,0,30,1
2018-12-28,Friday,December,12.4,2.7,95,84.5,100.89,30,1.8,1.8,0.0,0.0,8.93,4.5,35,13,2,21,0,12,0,31,1
2018-12-29,Saturday,December,2.7,-7.0,80,68.0,101.86,34,0.0,0.0,0.0,0.0,8.95,4.5,37,10,3,26,0,7,0,21,4
2018-12-30,Sunday,December,0.7,-6.6,88,78.0,102.03,24,1.2,0.0,1.6,2.0,8.97,5.0,43,8,3,22,0,7,0,22,3


In [7]:
# Rename columns for data clarity
target_columns = [f'{column}_count' for column in daily_crime.columns]

daily_crime.columns = target_columns
daily_crime

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,assault_count,auto_theft_count,bicycle_theft_count,break_and_enter_count,homicide_count,robbery_count,shooting_count,theft_from_motor_vehicle_count,theft_over_count
date,occ_dow,occ_month,max_temperature,min_temperature,max_relative_humidity,avg_relative_humidity,avg_pressure_sea,max_wind_speed,precipitation,rain,snow,snow_on_ground,daylight,avg_cloud_cover_8,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2015-01-01,Thursday,January,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,0.0,9.00,4.5,158,7,5,21,0,8,0,34,5
2015-01-02,Friday,January,-1.0,-6.1,83,66.0,102.30,36,0.6,0.0,0.4,0.0,9.00,6.0,34,9,4,18,0,6,0,34,2
2015-01-03,Saturday,January,1.5,-5.8,97,82.5,101.96,29,10.8,8.0,2.4,0.0,9.00,5.0,37,10,0,4,0,8,0,21,3
2015-01-04,Sunday,January,4.3,-5.9,98,82.5,100.69,43,6.8,6.4,0.4,0.0,9.02,6.0,44,5,1,11,0,5,0,33,1
2015-01-05,Monday,January,-5.8,-14.4,83,69.5,102.38,45,0.8,0.0,0.8,1.0,9.05,3.5,39,6,1,24,0,5,1,31,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-27,Thursday,December,2.8,-7.4,87,74.0,102.52,30,5.4,5.4,0.0,0.0,8.93,4.0,38,7,2,21,0,7,0,30,1
2018-12-28,Friday,December,12.4,2.7,95,84.5,100.89,30,1.8,1.8,0.0,0.0,8.93,4.5,35,13,2,21,0,12,0,31,1
2018-12-29,Saturday,December,2.7,-7.0,80,68.0,101.86,34,0.0,0.0,0.0,0.0,8.95,4.5,37,10,3,26,0,7,0,21,4
2018-12-30,Sunday,December,0.7,-6.6,88,78.0,102.03,24,1.2,0.0,1.6,2.0,8.97,5.0,43,8,3,22,0,7,0,22,3


In [8]:
# Reset index to flatten the dataframe
daily_crime.reset_index(inplace=True)
daily_crime

Unnamed: 0,date,occ_dow,occ_month,max_temperature,min_temperature,max_relative_humidity,avg_relative_humidity,avg_pressure_sea,max_wind_speed,precipitation,...,avg_cloud_cover_8,assault_count,auto_theft_count,bicycle_theft_count,break_and_enter_count,homicide_count,robbery_count,shooting_count,theft_from_motor_vehicle_count,theft_over_count
0,2015-01-01,Thursday,January,-1.9,-8.1,69,60.0,101.27,36,0.0,...,4.5,158,7,5,21,0,8,0,34,5
1,2015-01-02,Friday,January,-1.0,-6.1,83,66.0,102.30,36,0.6,...,6.0,34,9,4,18,0,6,0,34,2
2,2015-01-03,Saturday,January,1.5,-5.8,97,82.5,101.96,29,10.8,...,5.0,37,10,0,4,0,8,0,21,3
3,2015-01-04,Sunday,January,4.3,-5.9,98,82.5,100.69,43,6.8,...,6.0,44,5,1,11,0,5,0,33,1
4,2015-01-05,Monday,January,-5.8,-14.4,83,69.5,102.38,45,0.8,...,3.5,39,6,1,24,0,5,1,31,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,2018-12-27,Thursday,December,2.8,-7.4,87,74.0,102.52,30,5.4,...,4.0,38,7,2,21,0,7,0,30,1
1457,2018-12-28,Friday,December,12.4,2.7,95,84.5,100.89,30,1.8,...,4.5,35,13,2,21,0,12,0,31,1
1458,2018-12-29,Saturday,December,2.7,-7.0,80,68.0,101.86,34,0.0,...,4.5,37,10,3,26,0,7,0,21,4
1459,2018-12-30,Sunday,December,0.7,-6.6,88,78.0,102.03,24,1.2,...,5.0,43,8,3,22,0,7,0,22,3


In [9]:
daily_crime.columns

Index(['date', 'occ_dow', 'occ_month', 'max_temperature', 'min_temperature',
       'max_relative_humidity', 'avg_relative_humidity', 'avg_pressure_sea',
       'max_wind_speed', 'precipitation', 'rain', 'snow', 'snow_on_ground',
       'daylight', 'avg_cloud_cover_8', 'assault_count', 'auto_theft_count',
       'bicycle_theft_count', 'break_and_enter_count', 'homicide_count',
       'robbery_count', 'shooting_count', 'theft_from_motor_vehicle_count',
       'theft_over_count'],
      dtype='object')

In [10]:
# save a copy of the unchanged features for the final CSV export
features_df = daily_crime.copy()

In [11]:
daily_crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   date                            1461 non-null   object 
 1   occ_dow                         1461 non-null   object 
 2   occ_month                       1461 non-null   object 
 3   max_temperature                 1461 non-null   float64
 4   min_temperature                 1461 non-null   float64
 5   max_relative_humidity           1461 non-null   int64  
 6   avg_relative_humidity           1461 non-null   float64
 7   avg_pressure_sea                1461 non-null   float64
 8   max_wind_speed                  1461 non-null   int64  
 9   precipitation                   1461 non-null   float64
 10  rain                            1461 non-null   float64
 11  snow                            1461 non-null   float64
 12  snow_on_ground                  14

In [12]:
# Convert date into Unix timestamp for NN model
from datetime import datetime
daily_crime['date'] = pd.to_datetime(daily_crime['date'])
daily_crime['unix_timestamp'] = daily_crime['date'].apply(lambda x: x.timestamp()).astype(int)
daily_crime

Unnamed: 0,date,occ_dow,occ_month,max_temperature,min_temperature,max_relative_humidity,avg_relative_humidity,avg_pressure_sea,max_wind_speed,precipitation,...,assault_count,auto_theft_count,bicycle_theft_count,break_and_enter_count,homicide_count,robbery_count,shooting_count,theft_from_motor_vehicle_count,theft_over_count,unix_timestamp
0,2015-01-01,Thursday,January,-1.9,-8.1,69,60.0,101.27,36,0.0,...,158,7,5,21,0,8,0,34,5,1420070400
1,2015-01-02,Friday,January,-1.0,-6.1,83,66.0,102.30,36,0.6,...,34,9,4,18,0,6,0,34,2,1420156800
2,2015-01-03,Saturday,January,1.5,-5.8,97,82.5,101.96,29,10.8,...,37,10,0,4,0,8,0,21,3,1420243200
3,2015-01-04,Sunday,January,4.3,-5.9,98,82.5,100.69,43,6.8,...,44,5,1,11,0,5,0,33,1,1420329600
4,2015-01-05,Monday,January,-5.8,-14.4,83,69.5,102.38,45,0.8,...,39,6,1,24,0,5,1,31,2,1420416000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,2018-12-27,Thursday,December,2.8,-7.4,87,74.0,102.52,30,5.4,...,38,7,2,21,0,7,0,30,1,1545868800
1457,2018-12-28,Friday,December,12.4,2.7,95,84.5,100.89,30,1.8,...,35,13,2,21,0,12,0,31,1,1545955200
1458,2018-12-29,Saturday,December,2.7,-7.0,80,68.0,101.86,34,0.0,...,37,10,3,26,0,7,0,21,4,1546041600
1459,2018-12-30,Sunday,December,0.7,-6.6,88,78.0,102.03,24,1.2,...,43,8,3,22,0,7,0,22,3,1546128000


In [13]:
# Remove date column
daily_crime.drop(columns=["date"], axis=1, inplace=True)
daily_crime

Unnamed: 0,occ_dow,occ_month,max_temperature,min_temperature,max_relative_humidity,avg_relative_humidity,avg_pressure_sea,max_wind_speed,precipitation,rain,...,assault_count,auto_theft_count,bicycle_theft_count,break_and_enter_count,homicide_count,robbery_count,shooting_count,theft_from_motor_vehicle_count,theft_over_count,unix_timestamp
0,Thursday,January,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,...,158,7,5,21,0,8,0,34,5,1420070400
1,Friday,January,-1.0,-6.1,83,66.0,102.30,36,0.6,0.0,...,34,9,4,18,0,6,0,34,2,1420156800
2,Saturday,January,1.5,-5.8,97,82.5,101.96,29,10.8,8.0,...,37,10,0,4,0,8,0,21,3,1420243200
3,Sunday,January,4.3,-5.9,98,82.5,100.69,43,6.8,6.4,...,44,5,1,11,0,5,0,33,1,1420329600
4,Monday,January,-5.8,-14.4,83,69.5,102.38,45,0.8,0.0,...,39,6,1,24,0,5,1,31,2,1420416000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,Thursday,December,2.8,-7.4,87,74.0,102.52,30,5.4,5.4,...,38,7,2,21,0,7,0,30,1,1545868800
1457,Friday,December,12.4,2.7,95,84.5,100.89,30,1.8,1.8,...,35,13,2,21,0,12,0,31,1,1545955200
1458,Saturday,December,2.7,-7.0,80,68.0,101.86,34,0.0,0.0,...,37,10,3,26,0,7,0,21,4,1546041600
1459,Sunday,December,0.7,-6.6,88,78.0,102.03,24,1.2,0.0,...,43,8,3,22,0,7,0,22,3,1546128000


In [14]:
# Encode the day of week column
enc = OneHotEncoder(sparse=False)

# Fit encoder and create dataframe
encode_df = pd.DataFrame(enc.fit_transform(daily_crime['occ_dow'].values.reshape(-1,1)))

# rename encoded columns
encode_df.columns = enc.get_feature_names_out(['occ_dow'])
encode_df

Unnamed: 0,occ_dow_Friday,occ_dow_Monday,occ_dow_Saturday,occ_dow_Sunday,occ_dow_Thursday,occ_dow_Tuesday,occ_dow_Wednesday
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
1456,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1457,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1458,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1459,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [15]:
# Merge the 2 DataFrames together and drop the occ_down column
df = daily_crime.merge(encode_df, left_index=True, right_index=True).drop("occ_dow", axis=1)
df

Unnamed: 0,occ_month,max_temperature,min_temperature,max_relative_humidity,avg_relative_humidity,avg_pressure_sea,max_wind_speed,precipitation,rain,snow,...,theft_from_motor_vehicle_count,theft_over_count,unix_timestamp,occ_dow_Friday,occ_dow_Monday,occ_dow_Saturday,occ_dow_Sunday,occ_dow_Thursday,occ_dow_Tuesday,occ_dow_Wednesday
0,January,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,...,34,5,1420070400,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,January,-1.0,-6.1,83,66.0,102.30,36,0.6,0.0,0.4,...,34,2,1420156800,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,January,1.5,-5.8,97,82.5,101.96,29,10.8,8.0,2.4,...,21,3,1420243200,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,January,4.3,-5.9,98,82.5,100.69,43,6.8,6.4,0.4,...,33,1,1420329600,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,January,-5.8,-14.4,83,69.5,102.38,45,0.8,0.0,0.8,...,31,2,1420416000,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,December,2.8,-7.4,87,74.0,102.52,30,5.4,5.4,0.0,...,30,1,1545868800,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1457,December,12.4,2.7,95,84.5,100.89,30,1.8,1.8,0.0,...,31,1,1545955200,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1458,December,2.7,-7.0,80,68.0,101.86,34,0.0,0.0,0.0,...,21,4,1546041600,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1459,December,0.7,-6.6,88,78.0,102.03,24,1.2,0.0,1.6,...,22,3,1546128000,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
# Encode the month column
enc = OneHotEncoder(sparse=False)

# Fit encoder and create dataframe
encode_df = pd.DataFrame(enc.fit_transform(daily_crime['occ_month'].values.reshape(-1,1)))

# rename encoded columns
encode_df.columns = enc.get_feature_names_out(['occ_month'])
encode_df

Unnamed: 0,occ_month_April,occ_month_August,occ_month_December,occ_month_February,occ_month_January,occ_month_July,occ_month_June,occ_month_March,occ_month_May,occ_month_November,occ_month_October,occ_month_September
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1456,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1457,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1458,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1459,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Merge the 2 DataFrames together and drop the occ_month column
df = df.merge(encode_df, left_index=True, right_index=True).drop("occ_month", axis=1)
df

Unnamed: 0,max_temperature,min_temperature,max_relative_humidity,avg_relative_humidity,avg_pressure_sea,max_wind_speed,precipitation,rain,snow,snow_on_ground,...,occ_month_December,occ_month_February,occ_month_January,occ_month_July,occ_month_June,occ_month_March,occ_month_May,occ_month_November,occ_month_October,occ_month_September
0,-1.9,-8.1,69,60.0,101.27,36,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.0,-6.1,83,66.0,102.30,36,0.6,0.0,0.4,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.5,-5.8,97,82.5,101.96,29,10.8,8.0,2.4,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.3,-5.9,98,82.5,100.69,43,6.8,6.4,0.4,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-5.8,-14.4,83,69.5,102.38,45,0.8,0.0,0.8,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,2.8,-7.4,87,74.0,102.52,30,5.4,5.4,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1457,12.4,2.7,95,84.5,100.89,30,1.8,1.8,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1458,2.7,-7.0,80,68.0,101.86,34,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1459,0.7,-6.6,88,78.0,102.03,24,1.2,0.0,1.6,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df.columns

Index(['max_temperature', 'min_temperature', 'max_relative_humidity',
       'avg_relative_humidity', 'avg_pressure_sea', 'max_wind_speed',
       'precipitation', 'rain', 'snow', 'snow_on_ground', 'daylight',
       'avg_cloud_cover_8', 'assault_count', 'auto_theft_count',
       'bicycle_theft_count', 'break_and_enter_count', 'homicide_count',
       'robbery_count', 'shooting_count', 'theft_from_motor_vehicle_count',
       'theft_over_count', 'unix_timestamp', 'occ_dow_Friday',
       'occ_dow_Monday', 'occ_dow_Saturday', 'occ_dow_Sunday',
       'occ_dow_Thursday', 'occ_dow_Tuesday', 'occ_dow_Wednesday',
       'occ_month_April', 'occ_month_August', 'occ_month_December',
       'occ_month_February', 'occ_month_January', 'occ_month_July',
       'occ_month_June', 'occ_month_March', 'occ_month_May',
       'occ_month_November', 'occ_month_October', 'occ_month_September'],
      dtype='object')

In [19]:
# Reorganize the dataframe to contain all features first and all intended features at the end

df = df[['max_temperature', 'min_temperature', 'max_relative_humidity',
       'avg_relative_humidity', 'avg_pressure_sea', 'max_wind_speed',
       'precipitation', 'rain', 'snow', 'snow_on_ground', 'daylight',
       'avg_cloud_cover_8', 'unix_timestamp',
       'occ_dow_Monday', 'occ_dow_Tuesday',  'occ_dow_Wednesday', 'occ_dow_Thursday', 
       'occ_dow_Friday', 'occ_dow_Saturday', 'occ_dow_Sunday',
       'occ_month_January', 'occ_month_February', 'occ_month_March',
       'occ_month_April', 'occ_month_May', 'occ_month_June',
       'occ_month_July', 'occ_month_August', 'occ_month_September',
       'occ_month_October', 'occ_month_November', 'occ_month_December',
       'assault_count', 'auto_theft_count','bicycle_theft_count', 
       'break_and_enter_count', 'homicide_count',
       'robbery_count', 'shooting_count', 'theft_from_motor_vehicle_count',
       'theft_over_count']]

#

# Neural Network Model Building

## Identify Features and Potential Targets

We will be removing targets:
- Homicides
- Shootings
- Theft Over

In [20]:
# Print column names to see which columns to use
df.columns

Index(['max_temperature', 'min_temperature', 'max_relative_humidity',
       'avg_relative_humidity', 'avg_pressure_sea', 'max_wind_speed',
       'precipitation', 'rain', 'snow', 'snow_on_ground', 'daylight',
       'avg_cloud_cover_8', 'unix_timestamp', 'occ_dow_Monday',
       'occ_dow_Tuesday', 'occ_dow_Wednesday', 'occ_dow_Thursday',
       'occ_dow_Friday', 'occ_dow_Saturday', 'occ_dow_Sunday',
       'occ_month_January', 'occ_month_February', 'occ_month_March',
       'occ_month_April', 'occ_month_May', 'occ_month_June', 'occ_month_July',
       'occ_month_August', 'occ_month_September', 'occ_month_October',
       'occ_month_November', 'occ_month_December', 'assault_count',
       'auto_theft_count', 'bicycle_theft_count', 'break_and_enter_count',
       'homicide_count', 'robbery_count', 'shooting_count',
       'theft_from_motor_vehicle_count', 'theft_over_count'],
      dtype='object')

In [21]:
# Define X and y column names
X_cols = ['max_temperature', 'min_temperature', 'max_relative_humidity',
       'avg_relative_humidity', 'avg_pressure_sea', 'max_wind_speed',
       'precipitation', 'rain', 'snow', 'snow_on_ground', 'daylight',
       'avg_cloud_cover_8', 'unix_timestamp', 'occ_dow_Monday',
       'occ_dow_Tuesday', 'occ_dow_Wednesday', 'occ_dow_Thursday',
       'occ_dow_Friday', 'occ_dow_Saturday', 'occ_dow_Sunday',
       'occ_month_January', 'occ_month_February', 'occ_month_March',
       'occ_month_April', 'occ_month_May', 'occ_month_June', 'occ_month_July',
       'occ_month_August', 'occ_month_September', 'occ_month_October',
       'occ_month_November', 'occ_month_December']

y_cols = ['assault_count', 'auto_theft_count', 'bicycle_theft_count', 
          'break_and_enter_count', 'robbery_count', 'theft_from_motor_vehicle_count']

# Define features. Target will be defined through iteration in a couple cells
X = df[X_cols]

In [22]:
neural_models_dict ={}

## Iterate through Target Columns to create NN Models

In [23]:
for y_col in y_cols:
    # Define the target using the y_col item
    y = df[y_col].values.reshape(-1,1)

    # Start populating the neural_models_dict dictionary
    neural_models_dict[y_col] = {}

    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Standardize data with Standard Scaler
    scaler = StandardScaler()

    # Fit the scaler
    X_scaler = scaler.fit(X_train)


    # Scale the features
    X_train_scaled = X_scaler.fit_transform(X_train)
    X_test_scaled = X_scaler.fit_transform(X_test)

    # Define the model - deep neural net
    number_input_features = len(X_train_scaled[0])
    hidden_nodes_layer1 = number_input_features
    hidden_nodes_layer2 = number_input_features

    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(
        tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
    )

    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

    # Compile the model
    nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

    # Train the model
    fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

    # Evaluate the model using the test data
    model_loss, model_mae = nn.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, MAE: {model_mae}")

    # Make predictions from y_test
    y_pred = nn.predict(X_test_scaled)

    # Store everything that would be needed for future use into the dictionary object
    neural_models_dict[y_col]["nn"] = nn
    neural_models_dict[y_col]["fit_model"] = fit_model
    neural_models_dict[y_col]["model_loss"] = model_loss
    neural_models_dict[y_col]["model_mae"] = model_mae
    neural_models_dict[y_col]["X_test_scaled"] = X_test_scaled

    # Save the X_scaler in case we need to transform future data for predictions
    neural_models_dict[y_col]["X_scaler"] = X_scaler

    # Store the necessary objects for prediction analysis
    neural_models_dict[y_col]["y_test"] = y_test
    neural_models_dict[y_col]["y_pred"] = y_pred


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [24]:
# Evaluate the models using the test data
for target, nn_content in neural_models_dict.items():
    test_data_info = nn_content['y_test']
    print("---------------------------------------")
    print(f"~~~~~~ '{target}' Test Data Info ~~~~~~")
    print(f"Min: {test_data_info.min()}")
    print(f"Max: {test_data_info.max()}")
    print(f"Mean: {test_data_info.mean()}")
    print(f"STD: {test_data_info.std()}")
    print(f"***** Model Performance *****")
    print(f"Loss: {nn_content['model_loss']}, MAE: {nn_content['model_mae']}")
    print()

---------------------------------------
~~~~~~ 'assault_count' Test Data Info ~~~~~~
Min: 21
Max: 133
Mean: 43.19945355191257
STD: 10.18308356826286
***** Model Performance *****
Loss: 113.2642593383789, MAE: 7.447976589202881

---------------------------------------
~~~~~~ 'auto_theft_count' Test Data Info ~~~~~~
Min: 2
Max: 25
Mean: 9.489071038251366
STD: 3.6340287162872777
***** Model Performance *****
Loss: 14.271126747131348, MAE: 3.051727056503296

---------------------------------------
~~~~~~ 'bicycle_theft_count' Test Data Info ~~~~~~
Min: 0
Max: 26
Mean: 8.382513661202186
STD: 6.0337049867639125
***** Model Performance *****
Loss: 15.103874206542969, MAE: 2.8448426723480225

---------------------------------------
~~~~~~ 'break_and_enter_count' Test Data Info ~~~~~~
Min: 5
Max: 35
Mean: 18.4672131147541
STD: 5.879738888057919
***** Model Performance *****
Loss: 31.350404739379883, MAE: 4.502387523651123

---------------------------------------
~~~~~~ 'robbery_count' Test Data

In [25]:
# Pickle the dictionary to save the state of everything contained.
import pickle

os.makedirs("saved_models/pickle", exist_ok=True)
with open(f'saved_models/pickle/{model_name}_neural_dict.pickle', 'wb') as file:
    pickle.dump(neural_models_dict, file)

In [26]:
# Create a prediction dataframe
predictions_df = pd.DataFrame()
model_comparisons = pd.DataFrame()

for target, nn_content in neural_models_dict.items():
    predictions_df[f'{target}_true_value'] = nn_content['y_test'].flatten()
    predictions_df[f'{target}_pred_value'] = nn_content['y_pred'].flatten()
    predictions_df[f'{target}_pred_value'] = predictions_df[f'{target}_pred_value'].astype(int)
    predictions_df[f'{target}_pred_delta'] = predictions_df[f'{target}_true_value'] - predictions_df[f'{target}_pred_value']

predictions_df

Unnamed: 0,assault_count_true_value,assault_count_pred_value,assault_count_pred_delta,auto_theft_count_true_value,auto_theft_count_pred_value,auto_theft_count_pred_delta,bicycle_theft_count_true_value,bicycle_theft_count_pred_value,bicycle_theft_count_pred_delta,break_and_enter_count_true_value,break_and_enter_count_pred_value,break_and_enter_count_pred_delta,robbery_count_true_value,robbery_count_pred_value,robbery_count_pred_delta,theft_from_motor_vehicle_count_true_value,theft_from_motor_vehicle_count_pred_value,theft_from_motor_vehicle_count_pred_delta
0,60,57,3,6,12,-6,16,13,3,16,22,-6,3,5,-2,20,28,-8
1,43,43,0,7,10,-3,1,1,0,19,20,-1,6,9,-3,20,26,-6
2,49,34,15,16,5,11,3,0,3,15,16,-1,9,5,4,20,20,0
3,47,50,-3,6,8,-2,14,17,-3,18,17,1,7,7,0,16,24,-8
4,38,45,-7,12,15,-3,3,4,-1,26,20,6,11,7,4,16,20,-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,77,47,30,7,9,-2,15,15,0,17,22,-5,9,8,1,31,18,13
362,60,45,15,10,11,-1,12,14,-2,20,19,1,8,9,-1,24,25,-1
363,42,41,1,6,7,-1,10,7,3,32,21,11,6,10,-4,26,21,5
364,36,37,-1,18,12,6,6,9,-3,16,20,-4,12,10,2,28,24,4


In [27]:
export_df = predictions_df.merge(daily_crime, right_index=True, left_index=True)

In [28]:
# Export predictions_df to CSV

os.makedirs('../../Data/model_predictions', exist_ok=True)
export_df.to_csv(f"../../Data/model_predictions/{model_name}_pred_vs_true.csv")

# Prediction Exploration

In [29]:
predictions_df.columns

Index(['assault_count_true_value', 'assault_count_pred_value',
       'assault_count_pred_delta', 'auto_theft_count_true_value',
       'auto_theft_count_pred_value', 'auto_theft_count_pred_delta',
       'bicycle_theft_count_true_value', 'bicycle_theft_count_pred_value',
       'bicycle_theft_count_pred_delta', 'break_and_enter_count_true_value',
       'break_and_enter_count_pred_value', 'break_and_enter_count_pred_delta',
       'robbery_count_true_value', 'robbery_count_pred_value',
       'robbery_count_pred_delta', 'theft_from_motor_vehicle_count_true_value',
       'theft_from_motor_vehicle_count_pred_value',
       'theft_from_motor_vehicle_count_pred_delta'],
      dtype='object')

In [30]:
predictions_df.describe()

Unnamed: 0,assault_count_true_value,assault_count_pred_value,assault_count_pred_delta,auto_theft_count_true_value,auto_theft_count_pred_value,auto_theft_count_pred_delta,bicycle_theft_count_true_value,bicycle_theft_count_pred_value,bicycle_theft_count_pred_delta,break_and_enter_count_true_value,break_and_enter_count_pred_value,break_and_enter_count_pred_delta,robbery_count_true_value,robbery_count_pred_value,robbery_count_pred_delta,theft_from_motor_vehicle_count_true_value,theft_from_motor_vehicle_count_pred_value,theft_from_motor_vehicle_count_pred_delta
count,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0
mean,43.199454,43.434426,-0.234973,9.489071,8.945355,0.543716,8.382514,8.314208,0.068306,18.467213,18.508197,-0.040984,7.655738,7.489071,0.166667,22.95082,22.483607,0.467213
std,10.197023,7.355163,10.634886,3.639003,2.601847,3.805116,6.041965,6.070108,3.899874,5.887788,3.550603,5.589713,3.038846,2.090721,3.571897,6.452591,4.350217,6.759085
min,21.0,23.0,-26.0,2.0,2.0,-9.0,0.0,0.0,-21.0,5.0,7.0,-15.0,0.0,2.0,-10.0,6.0,7.0,-18.0
25%,37.0,39.0,-6.0,7.0,7.0,-2.0,3.0,3.0,-2.0,14.0,16.0,-4.0,6.0,6.0,-2.0,18.25,20.0,-4.0
50%,43.0,43.0,-1.0,9.0,9.0,0.0,7.0,7.0,0.0,18.0,19.0,0.0,7.0,7.0,0.0,22.0,22.5,0.0
75%,48.75,48.0,5.0,11.0,11.0,3.0,13.0,13.0,2.0,22.0,21.0,3.0,9.0,9.0,2.0,27.0,25.0,5.0
max,133.0,65.0,99.0,25.0,15.0,11.0,26.0,28.0,15.0,35.0,27.0,17.0,23.0,13.0,15.0,47.0,36.0,21.0


In [34]:
predictions_df.sort_values(by="auto_theft_count_pred_delta", ascending=False)

Unnamed: 0,assault_count_true_value,assault_count_pred_value,assault_count_pred_delta,auto_theft_count_true_value,auto_theft_count_pred_value,auto_theft_count_pred_delta,bicycle_theft_count_true_value,bicycle_theft_count_pred_value,bicycle_theft_count_pred_delta,break_and_enter_count_true_value,break_and_enter_count_pred_value,break_and_enter_count_pred_delta,robbery_count_true_value,robbery_count_pred_value,robbery_count_pred_delta,theft_from_motor_vehicle_count_true_value,theft_from_motor_vehicle_count_pred_value,theft_from_motor_vehicle_count_pred_delta
2,49,34,15,16,5,11,3,0,3,15,16,-1,9,5,4,20,20,0
279,38,47,-9,25,14,11,3,6,-3,27,19,8,7,11,-4,24,24,0
353,31,37,-6,18,8,10,0,3,-3,25,23,2,5,12,-7,26,24,2
354,50,37,13,16,6,10,4,2,2,24,16,8,12,7,5,21,12,9
44,37,44,-7,15,5,10,9,8,1,23,21,2,12,7,5,22,20,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,57,53,4,4,11,-7,13,13,0,10,17,-7,9,8,1,22,21,1
191,45,37,8,6,13,-7,6,3,3,16,20,-4,7,6,1,25,21,4
133,59,61,-2,5,12,-7,12,14,-2,18,17,1,7,8,-1,28,27,1
95,41,46,-5,4,12,-8,18,23,-5,22,20,2,5,9,-4,24,24,0
