<a href="https://colab.research.google.com/github/MattIzon/16010269_DataAnalytics/blob/main/9_Create_DNN_Test_Generic_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [81]:
# Set-up
import pandas as pd
import numpy as np
from google.colab import files

pd.set_option('display.max_columns', None)


def one_hot(df, column):
    o_h = pd.get_dummies(df[column])
    new = pd.concat([df, o_h], axis=1)
    new = new.drop([column], axis=1)
    return new
    

def dlcsv(df,name):
  df.to_csv('{}.csv'.format(name), index=False)
  files.download('{}.csv'.format(name))

In [82]:
#combine weather and crime for each year

data= []

for year in range(2015,2020):
  weather = pd.read_csv('https://raw.githubusercontent.com/MattIzon/16010269_DataAnalytics/main/data/w_{}.csv'.format(year)).set_index('date')
  crime = pd.read_csv('https://raw.githubusercontent.com/MattIzon/16010269_DataAnalytics/main/data/c_district_{}.csv'.format(year)).set_index('date')
  crime.drop('Unnamed: 0', axis=1, inplace=True)
  data.append(crime.join(weather).reset_index())

In [83]:
#add year column
year = 2015
for df in data:
  df['year'] = year
  year += 1

In [27]:
# Remove outliers in Target (more than 2 standard deviations from mean)
adjusted = []

for year in range(5):
  df = data[year].copy()
  upper = df.crime_count.mean() + (2 * df.crime_count.std())
  lower = df.crime_count.mean() - (2 * df.crime_count.std())

  df.drop(df[df.crime_count > upper].index, inplace=True)
  df.drop(df[df.crime_count < lower].index, inplace=True)

  adjusted.append(df)

In [84]:
# Combine all years into single dataset
combined = pd.concat(data).reset_index()
# combined = pd.concat(adjusted).reset_index()

combined.drop('index', axis=1, inplace=True)

In [85]:
# one hot encode day_of_week
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

for index in range(len(days)):
  value = index + 1
  combined.loc[combined.day_of_week == value, 'day_of_week'] = days[index]

combined = one_hot(combined, 'day_of_week')

In [86]:
# one hot encode mo
days = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

for index in range(len(days)):
  value = index + 1
  combined.loc[combined.mo == value, 'mo'] = days[index]

combined = one_hot(combined, 'mo')

In [87]:
# one hot encode district
combined.district = df.district.apply(np.int64)
districts = list(combined.district.unique())
combined = one_hot(combined, 'district')
for district in districts:
  combined.rename({district: 'district_{}'.format(district)}, axis=1, inplace=True)

In [89]:
# Shuffle the dataset
rand_combined = combined.iloc[np.random.permutation(len(combined))]

In [92]:
# Save testing and generic datasets
test_size = int(len(rand_combined)*0.01)
test_set = rand_combined.iloc[:test_size]
dlcsv(test_set, 'DNN_test_set')

generic_set = rand_combined.iloc[test_size:]
dlcsv(generic_set, 'DNN_generic_set')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [93]:
generic_set

Unnamed: 0,date,crime_count,temp,dewp,slp,stp,visib,wdsp,mxpsd,gust,max,min,prcp,sndp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,tornado_funnel_cloud,year,Fri,Mon,Sat,Sun,Thu,Tue,Wed,Apr,Aug,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep,district_1.0,district_2.0,district_3.0,district_4.0,district_5.0,district_6.0,district_7.0,district_8.0,district_9.0,district_10.0,district_11.0,district_12.0,district_14.0,district_15.0,district_16.0,district_17.0,district_18.0,district_19.0,district_20.0,district_22.0,district_24.0,district_25.0,district_31.0
12026,2016-06-29,14,66.566667,49.033333,1019.60,996.933333,9.966667,5.766667,10.300000,17.100000,78.633333,54.966667,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
38898,2019-11-02,51,38.933333,32.033333,1020.40,996.733333,9.000000,9.666667,17.033333,27.733333,44.800000,31.666667,0.003333,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2019,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5981,2015-09-29,32,66.000000,59.633333,1014.70,992.466667,9.333333,8.733333,19.000000,26.666667,78.700000,59.366667,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2015,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2866,2015-05-11,31,57.633333,53.366667,1012.10,988.900000,7.300000,8.600000,19.700000,27.633333,69.466667,45.533333,0.086667,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2015,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23609,2017-12-07,17,25.200000,14.000000,1020.65,996.433333,9.333333,10.966667,15.300000,20.333333,33.100000,18.966667,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2017,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24404,2018-01-13,39,14.066667,1.400000,1034.15,1008.766667,9.066667,11.366667,20.666667,28.600000,23.100000,7.300000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2018,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23762,2017-12-14,16,25.266667,15.600000,1014.20,989.333333,9.833333,10.133333,22.400000,31.633333,34.766667,18.100000,0.000000,1.2,0.0,0.0,1.0,0.0,0.0,0.0,2017,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15162,2016-11-19,33,38.533333,25.866667,1017.00,993.533333,10.000000,17.900000,24.433333,33.700000,49.400000,34.366667,0.165000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2016,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15269,2016-11-23,33,39.600000,35.366667,1018.55,994.700000,5.600000,8.600000,13.000000,17.600000,43.466667,32.133333,0.133333,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2016,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
