<a href="https://colab.research.google.com/github/MeghanBibb/PredictingAccidentSeverityDNN/blob/main/Meghan_Bibb_Data_Mining_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The following code was used in the creation of [this paper](https://drive.google.com/file/d/1rOrnepeF9jt2GiKf3o6XsBayvTjz-POz/view?usp=sharing). 

## References

How to import data to colab:
https://towardsdatascience.com/importing-data-to-google-colab-the-clean-way-5ceef9e9e3c8

How to plot a pie chart in matplotlib
https://matplotlib.org/3.1.1/gallery/pie_and_polar_charts/pie_features.html

One hot encoding https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/ 

Cramer's V
https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9

Data frame manipulation
https://www.geeksforgeeks.org/python-creating-a-pandas-dataframe-column-based-on-a-given-condition/


**Academic Articles:**

https://www.sciencedirect.com/science/article/pii/S2452247318302164 - Cramer's V threshold explanation

https://dlp-kdd.github.io/dlp-kdd2019/assets/pdf/a11-wu.pdf - Counting Features - an improvement on One-hot-encoding for large scale categorical data https://dl.acm.org/doi/10.1145/3326937.3341260

Traffic Severity in South Africa:
https://dl.acm.org/doi/pdf/10.1145/3325112.3325211?casa_token=yByFlPG5nJsAAAAA:Zt4du5F36pGjlG_R8RWcCp8ZaCJ3s7ep5j3rUShhGilOzQ4Sp8QqopcFkC8EdIV7iA10RuhvpwgwIg 

Accident predictions using big data https://dl.acm.org/doi/pdf/10.1145/3386723.3387886 

The dataset itself: https://arxiv.org/abs/1906.05409

Accident Risk Prediction based on Heterogeneous Sparse Data: https://arxiv.org/abs/1909.09638 

https://arxiv.org/pdf/1909.10702.pdf - dimensionality reduction



# Set Up

In [None]:
!pip install dython


**This colab mounts to your google drive to read files from. The files needed are located at: https://drive.google.com/drive/folders/1Yr04lpJ09MgbeqmHPq88aVBX7jOTAd__?usp=sharing  Please make a copy of this folder and place it in your drive.**

In [None]:
# Mount Google Drive at /content/drive so the data files can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Please enter the path to the data
# Folder that holds the input files. File names are appended to this string
# directly (pathToData + 'file.csv'), so keep the trailing slash.
pathToData = "/content/drive/My Drive/Baylor/Data_Mining/Project/" #@param {type:"string"}

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# Data Exploration & Visualization

In [None]:
# Load the Texas accidents table used by every exploration cell below.
df = pd.read_csv(pathToData + 'TX_Accidents_June20.csv')

In [None]:
# Display the loaded frame as the cell output.
df

In [None]:
# List the matplotlib style names available in this environment
# (the plots below select 'seaborn-whitegrid').
plt.style.available

In [None]:
# Bar chart of how many accidents fall into each severity class.
severityCounts = df['Severity'].value_counts()

plt.figure(figsize=(4.66, 3))
plt.style.use('seaborn-whitegrid')
plt.bar(severityCounts.index, severityCounts.values, color='darkblue')
plt.ylabel('Quantity of Accidents', fontsize=18)
plt.xlabel('Severity Classification', fontsize=18)  # label spelling fixed
# Ticks at severities 1-4; position and font size set in a single call.
plt.xticks(np.arange(1, 5, step=1), fontsize=18)
plt.yticks(fontsize=18)
plt.show()

## POI Visualizations

In [None]:
# Count, for every point-of-interest (POI) flag column, the accidents where the
# flag is True, then draw the counts as a bar chart.
POINames = ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
            'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']

# One count per POI column, in POINames order. Kept as a float array, which is
# what the previous np.append chain produced.
arr = np.array([(df[poiName] == True).sum() for poiName in POINames], dtype=float)

# Integer bar positions 0..12, one per POI name.
POINamesLabels = np.arange(len(POINames))

fig, ax = plt.subplots()
ax.bar(POINamesLabels, arr, color='darkblue')

ax.set_ylabel('Quantity of Accidents', fontsize=18)
ax.set_xlabel('POI Locations', fontsize=18)
ax.set_xticks(POINamesLabels)
ax.set_xticklabels(POINames, fontsize=18)
plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
         rotation_mode="anchor")
# tick_params replaces the per-tick `tick.label.set_fontsize(18)` loop:
# `Tick.label` is deprecated in matplotlib, and the loop ran before the bars
# were drawn, so it only touched the ticks that existed at that moment.
ax.tick_params(axis='y', labelsize=18)

fig.set_size_inches(4.66, 3)
plt.style.use('seaborn-whitegrid')
plt.show()


## Weather Conditions


In [None]:
import seaborn as sns
# Accident counts per weather condition, split by severity, restricted to the
# 11 most frequent weather conditions (most frequent first).
sns.set_theme(style="darkgrid")
topWeatherBySeverity = df.Weather_Condition.value_counts().iloc[:11].index
ax = sns.countplot(data=df, x="Weather_Condition", hue="Severity", order=topWeatherBySeverity)
plt.xticks(rotation=90)
plt.ylabel('Number of Accidents')

In [None]:
import seaborn as sns
# Accident counts for the 11 most frequent weather conditions (no severity split).
sns.set_theme(style="darkgrid")
topWeather = df.Weather_Condition.value_counts().iloc[:11].index
ax = sns.countplot(data=df, x="Weather_Condition", order=topWeather)
plt.xticks(rotation=90)
plt.ylabel('Number of Accidents')


In [None]:
import seaborn as sns
# Accident counts for the six cities with the most accidents.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(4.66, 3))
busiestCities = df.City.value_counts().iloc[:6].index
ax = sns.countplot(data=df, x="City", order=busiestCities)
plt.xticks(rotation=20, fontsize=15)
plt.yticks(fontsize=16)
plt.ylabel('Number of Accidents', fontsize=16)
plt.xlabel('City', fontsize=16)

In [None]:
import seaborn as sns
# Accident counts per road side, split by severity, with the legend placed
# to the right of the axes.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(4.66, 3))
ax = sns.countplot(data=df, x="Side", hue="Severity")
sideLegend = ax.legend(loc='center right', bbox_to_anchor=(1.25, 0.5), ncol=1, fontsize=18, title="Severity")
plt.setp(sideLegend.get_texts(), fontsize='18')  # legend entries
plt.setp(sideLegend.get_title(), fontsize='18')  # legend title
plt.ylabel('Number of Accidents', fontsize=18)
plt.xlabel('Side', fontsize=18)
plt.yticks(fontsize=18)
plt.xticks(fontsize=18)

In [None]:
import seaborn as sns
# Accident counts for each Sunrise_Sunset value, split by severity.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(4.66, 3))
ax = sns.countplot(data=df, x="Sunrise_Sunset", hue="Severity")
dayNightLegend = ax.legend(loc='center right', bbox_to_anchor=(1.25, 0.5), ncol=1, fontsize=18, title="Severity")
plt.setp(dayNightLegend.get_texts(), fontsize='18')  # legend entries
plt.setp(dayNightLegend.get_title(), fontsize='18')  # legend title
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.ylabel('Quantity of Accidents', fontsize=18)
plt.xlabel('Sunrise_Sunset', fontsize=18)

In [None]:
import seaborn as sns
# Accident counts for the two most frequent reporting sources, split by severity.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(4.66,3))

ax = sns.countplot(x="Source", hue="Severity", data=df,  order=df.Source.value_counts().iloc[:2].index)

ax.legend(loc='center right', bbox_to_anchor=(1.25, 0.5), ncol=1, fontsize=18, title="Severity")

plt.setp(ax.get_legend().get_texts(), fontsize='18') # for legend text
plt.setp(ax.get_legend().get_title(), fontsize='18') # for legend title

# Tick label sizes are set here only. The former `tick.label.set_fontsize`
# loops used the deprecated `Tick.label` attribute and were redundant with
# these two calls.
plt.yticks(fontsize=18)
plt.xticks(fontsize=18)
plt.ylabel('Number of Accidents', fontsize=18)
plt.xlabel('Source', fontsize=18)

In [None]:
import seaborn as sns
# Scatter of temperature (x) against severity (y); points are coloured by severity.
sns.set_theme(style="darkgrid")
ax = sns.scatterplot(x="Temperature(F)", y="Severity", data=df, hue="Severity")

In [None]:
# Box plot of temperature for each severity level.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(4.66,3))
ax = sns.boxplot(x="Severity", y="Temperature(F)", hue="Severity", data=df)
ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5), ncol=1, fontsize=18, title="Severity")
plt.setp(ax.get_legend().get_texts(), fontsize='18') # for legend text
plt.setp(ax.get_legend().get_title(), fontsize='18') # for legend title

plt.yticks(fontsize=18)
plt.xticks(fontsize=18)
plt.ylabel('Temperature(F)', fontsize=18)  # label spelling fixed
plt.xlabel('Severity', fontsize=18)

## Accidents per Hour Visualizations

In [None]:
# One sub-frame per severity level (1-4); used by the per-hour and per-month plots.
df1Severity, df2Severity, df3Severity, df4Severity = (
    df[df['Severity'] == severityLevel] for severityLevel in (1, 2, 3, 4)
)

In [None]:
# Row count of each severity subset, printed in order 1, 2, 3, 4.
for severitySubset in (df1Severity, df2Severity, df3Severity, df4Severity):
    print(len(severitySubset))

In [None]:
# Split Start_Time on the space and keep the second piece as a new column
# (presumably the time-of-day part of a "date time" string - verify against the CSV).
# Note: this adds a column to `df` in place.
df["Start_Time_Hour"] = df['Start_Time'].str.split(" ",expand=True)[1]

In [None]:
import time
# Parse the time-of-day strings and keep only the hour component.
hours = pd.to_datetime(df["Start_Time_Hour"]).dt.hour
print(hours)
# Distinct hours and how many accidents fall into each.
print(np.unique(hours, return_counts=True))

In [None]:
# Bar chart of the number of accidents in each hour of the day.
hourCounts = np.unique(hours, return_counts=True)
hourOfDay, accidentsInHour = hourCounts

plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(2.33, 1.5))
plt.bar(hourOfDay, accidentsInHour, color='darkblue')

plt.xlabel('Time of Day (hours)', fontsize=9)
plt.ylabel('Number of Accidents', fontsize=9)
plt.xticks(np.arange(0, 24, step=3), fontsize=9)
plt.show()


In [None]:
# Severity 1: accidents per hour of day, plus a constant y-value (1.0) for each
# distinct hour so the counts can be drawn on the severity-1 row of the scatter.
s1hours = df1Severity['Start_Time'].str.split(" ", expand=True)[1]
s1HourOfDay = pd.to_datetime(s1hours).dt.hour
s1CountsHours = np.unique(s1HourOfDay, return_counts=True)
s1label = np.full(len(s1CountsHours[0]), 1.0)


In [None]:
# Severity 2: accidents per hour of day, plus a constant y-value (2.0) per distinct hour.
s2hours = df2Severity['Start_Time'].str.split(" ", expand=True)[1]
s2HourOfDay = pd.to_datetime(s2hours).dt.hour
s2CountsHours = np.unique(s2HourOfDay, return_counts=True)
s2label = np.full(len(s2CountsHours[0]), 2.0)

In [None]:
# Severity 3: accidents per hour of day, plus a constant y-value (3.0) per distinct hour.
s3hours = df3Severity['Start_Time'].str.split(" ", expand=True)[1]
s3HourOfDay = pd.to_datetime(s3hours).dt.hour
s3CountsHours = np.unique(s3HourOfDay, return_counts=True)
s3label = np.full(len(s3CountsHours[0]), 3.0)

In [None]:
# Severity 4: accidents per hour of day, plus a constant y-value (4.0) per distinct hour.
s4hours = df4Severity['Start_Time'].str.split(" ", expand=True)[1]
s4HourOfDay = pd.to_datetime(s4hours).dt.hour
s4CountsHours = np.unique(s4HourOfDay, return_counts=True)
s4label = np.full(len(s4CountsHours[0]), 4.0)
s4label

In [None]:
# Bubble chart: x = hour of day, y = severity level, marker area = number of
# accidents (scaled down by `scalar`).
plt.figure(figsize=(10, 5))

scalar = 1/10
hourSeries = (
    (s1CountsHours, s1label, 'black'),
    (s2CountsHours, s2label, 'orange'),
    (s3CountsHours, s3label, 'red'),
    (s4CountsHours, s4label, 'darkred'),
)
for hourlyCounts, severityRow, markerColor in hourSeries:
    plt.scatter(hourlyCounts[0], severityRow, s=hourlyCounts[1]*scalar, color=markerColor)

plt.xticks(np.arange(0, 24, step=1))

plt.title('Severe Accidents per Hour', fontsize=16)
plt.xlabel('Hours in a day (military time)', fontsize=12)
plt.ylabel('The severity of the accident', fontsize=12)

plt.show()

See the distribution of accidents over each month.

## Accidents per Month per Severity Visualization

In [None]:
# Severity 1: accidents per calendar month, plus a constant y-value (1.0) per distinct month.
s1date = df1Severity['Start_Time'].str.split(" ", expand=True)[0]
s1MonthOfYear = pd.to_datetime(s1date).dt.month
s1CountsMonths = np.unique(s1MonthOfYear, return_counts=True)
s1monthlabel = np.full(len(s1CountsMonths[0]), 1.0)

In [None]:
# Severity 2: accidents per calendar month, plus a constant y-value (2.0) per distinct month.
s2date = df2Severity['Start_Time'].str.split(" ", expand=True)[0]
s2MonthOfYear = pd.to_datetime(s2date).dt.month
s2CountsMonths = np.unique(s2MonthOfYear, return_counts=True)
s2monthlabel = np.full(len(s2CountsMonths[0]), 2.0)


In [None]:
# Severity 3: accidents per calendar month, plus a constant y-value (3.0) per distinct month.
s3date = df3Severity['Start_Time'].str.split(" ", expand=True)[0]
s3MonthOfYear = pd.to_datetime(s3date).dt.month
s3CountsMonths = np.unique(s3MonthOfYear, return_counts=True)
s3monthlabel = np.full(len(s3CountsMonths[0]), 3.0)

In [None]:
# Severity 4: accidents per calendar month, plus a constant y-value (4.0) per distinct month.
# Fix: the counts and the label length were previously computed from the
# severity-3 variables (s3date / s3CountsMonths), so the severity-4 row of the
# monthly plot duplicated the severity-3 data.
s4date = df4Severity['Start_Time'].str.split(" ",expand=True)[0]
s4CountsMonths = np.unique(pd.to_datetime(s4date).dt.month, return_counts=True)
s4monthlabel = np.linspace(4,4,len(s4CountsMonths[0]))

In [None]:
# Bubble chart: x = calendar month, y = severity level, marker area = number of
# accidents (scaled down by `scalar`).
plt.figure(figsize=(10,6))

scalar = 1/10
plt.scatter(s1CountsMonths[0],s1monthlabel,s=s1CountsMonths[1]*scalar, color='black')
plt.scatter(s2CountsMonths[0],s2monthlabel,s=s2CountsMonths[1]*scalar, color='orange')
plt.scatter(s3CountsMonths[0],s3monthlabel,s=s3CountsMonths[1]*scalar, color='red')
plt.scatter(s4CountsMonths[0],s4monthlabel,s=s4CountsMonths[1]*scalar, color='darkred')

# np.arange excludes its stop value: use 13 so December (12) gets a tick too.
plt.xticks(np.arange(1, 13, step=1))
plt.yticks(np.arange(1, 5, step=1))

plt.title('Severe Accidents per Month', fontsize=16)
plt.ylabel('The severity of the accident', fontsize=12)
plt.xlabel('Month', fontsize=12)

plt.show()

## Geopandas Spatial Visualizations

In [None]:
!pip install geopandas

In [None]:
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame

In [None]:
# Read the US state boundaries shapefile.
states = gpd.read_file(pathToData + 'usa-states-census-2014.shp')
type(states)

# Fix found on : https://stackoverflow.com/questions/38961816/geopandas-set-crs-on-points
# Declare the coordinates as EPSG:4326 (modifies `states` in place).
states.set_crs(epsg=4326, inplace=True)

# Project to Mercator (EPSG:3395).
# NOTE(review): the result is not assigned, so it is only shown as the cell
# output; `states` itself is not reassigned here. Confirm that is intended.
states.to_crs(epsg=3395)

In [None]:
# Based on the tutorial: https://jcutrer.com/python/learn-geopandas-plotting-usmaps and http://jonathansoma.com/lede/foundations-2017/classes/geopandas/mapping-with-geopandas/
geometry = [Point(xy) for xy in zip(df['Start_Lng'], df['Start_Lat'])]
gdf = GeoDataFrame(df, geometry=geometry)   

#this is a simple map that goes with geopandas

gdf.plot(ax=states[states['NAME'] == 'Texas'].plot(color='lightgrey', linewidth=0.5, edgecolor='white', figsize=(2.3, 1.5)), markersize=1, column='Severity', alpha=0.5, cmap='plasma', legend=True)
plt.ylabel('Latitude', fontsize=9)
plt.xlabel('Longitude', fontsize=9)

## Correlation Experiments

I want to measure the correlation between severity and all numeric features. I want to measure the association between categorical features and severity.


References: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/discussion/23849 

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [None]:
# Correlation-coefficient matrix between severity and accident distance.
np.corrcoef(df['Severity'], df['Distance(mi)'])

In [None]:
# Correlation between severity and temperature, using only rows that have a temperature.
tempDF = df[df['Temperature(F)'].notna()]
np.corrcoef(tempDF['Severity'], tempDF['Temperature(F)'])

In [None]:
# Correlation between severity and wind chill, using only rows that have a wind chill value.
tempDF = df[df['Wind_Chill(F)'].notna()]
np.corrcoef(tempDF['Severity'], tempDF['Wind_Chill(F)'])

In [None]:
# Correlation between severity and humidity, using only rows that have a humidity value.
tempDF = df[df['Humidity(%)'].notna()]
np.corrcoef(tempDF['Severity'], tempDF['Humidity(%)'])

In [None]:
# Correlation between severity and pressure, using only rows that have a pressure value.
tempDF = df[df['Pressure(in)'].notna()]
np.corrcoef(tempDF['Severity'], tempDF['Pressure(in)'])

In [None]:
# Correlation between severity and wind speed, using only rows that have a wind speed value.
tempDF = df[df['Wind_Speed(mph)'].notna()]
np.corrcoef(tempDF['Severity'], tempDF['Wind_Speed(mph)'])

In [None]:
# Correlation between severity and precipitation, using only rows that have a precipitation value.
tempDF = df[df['Precipitation(in)'].notna()]
np.corrcoef(tempDF['Severity'], tempDF['Precipitation(in)'])

### Experiments with Cramer's V and Categorical Associations

In [None]:
# I learned about Cramer's V at https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# I also borrowed this function from that article
def cramers_v(x, y):
    """Bias-corrected Cramer's V: a measure of association between two categorical variables.

    Similarly to correlation, the output is in the range [0, 1], where 0 means
    no association and 1 is full association. Unlike correlation there are no
    negative values.

    Parameters
    ----------
    x, y : array-like of equal length
        Category labels; they are cross-tabulated with ``pd.crosstab``.

    Returns
    -------
    float
        The corrected Cramer's V. Returns 0.0 when the statistic is undefined
        because a variable has a single category or there are fewer than two
        observations (previously this divided by zero and produced NaN).
    """
    # Imported here so the function does not depend on a later cell's `import ... as ss`.
    import scipy.stats as ss

    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    if n < 2:
        # The (n - 1) denominators below would be zero.
        return 0.0
    phi2 = chi2/n
    r, k = confusion_matrix.shape
    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        # One of the variables has a single category: no association is measurable.
        return 0.0
    return np.sqrt(phi2corr/denominator)

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
# Association between severity and wind direction, using only rows that have a wind direction.
windDirDF = df[df['Wind_Direction'].notna()]
cramers_v(windDirDF['Severity'], windDirDF['Wind_Direction'])


In [None]:
# One heatmap per severity level showing how that level's accidents are spread
# over the wind directions. Rows are selected with a mask built from windDirDF
# itself (previously the mask came from `df`, a different frame), and each
# heatmap gets its own figure, so no empty trailing figure is created.
for severityLevel in (1, 2, 3, 4):
    severityRows = windDirDF[windDirDF['Severity'] == severityLevel]
    plt.figure()
    sns.heatmap(pd.crosstab(severityRows['Severity'], severityRows['Wind_Direction']))

In [None]:
# Association between severity and Sunrise_Sunset (rows missing the value are excluded).
tempDF = df[df['Sunrise_Sunset'].notna()]
cramers_v(tempDF['Severity'], tempDF['Sunrise_Sunset'])

In [None]:
# Association between severity and Civil_Twilight (rows missing the value are excluded).
tempDF = df[df['Civil_Twilight'].notna()]
cramers_v(tempDF['Severity'], tempDF['Civil_Twilight'])

In [None]:
# Association between severity and Nautical_Twilight (rows missing the value are excluded).
tempDF = df[df['Nautical_Twilight'].notna()]
cramers_v(tempDF['Severity'], tempDF['Nautical_Twilight'])

In [None]:
# Association between severity and Astronomical_Twilight (rows missing the value are excluded).
tempDF = df[df['Astronomical_Twilight'].notna()]
cramers_v(tempDF['Severity'], tempDF['Astronomical_Twilight'])

### Dython Association comparisons

In [None]:
# Convert Severity to strings (presumably so dython treats it as a categorical
# variable - confirm). This overwrites the column in `df`: integer comparisons
# such as df['Severity'] == 1 in the cells above stop matching after this cell
# runs, until `df` is re-read from the CSV.
df['Severity'] = df['Severity'].astype(str)

In [None]:
from dython.nominal import associations
# Association matrix between severity and the four day/night indicator columns.
twilightCols = ['Severity', 'Civil_Twilight', 'Sunrise_Sunset', 'Nautical_Twilight', 'Astronomical_Twilight']
associations(df[twilightCols])

In [None]:
# Association matrix between severity and the point-of-interest flags.
poiAssocCols = ['Severity', 'Amenity', 'Bump', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
                'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal']
associations(df[poiAssocCols], figsize=(10,10))

In [None]:
# Association matrix between severity and the weather features.
weatherAssocCols = ['Severity', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
                    'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)',
                    'Weather_Condition']
associations(df[weatherAssocCols], figsize=(10,10))

In [None]:
# Association matrix between severity and the location features.
locationAssocCols = ['Severity', 'Airport_Code', 'Timezone', 'County', 'City', 'Side']
associations(df[locationAssocCols], figsize=(10,10))

In [None]:
# Accident duration in minutes (End_Time - Start_Time) as a float column.
# `.dt.total_seconds() / 60` replaces `.astype('timedelta64[m]')`, which
# pandas >= 2.0 rejects because minute resolution is not a supported
# timedelta64 unit.
# NOTE(review): values now keep fractional minutes; older pandas may have
# truncated to whole minutes with the previous cast.
df['Duration_Length'] = (pd.to_datetime(df['End_Time'])-pd.to_datetime(df['Start_Time'])).dt.total_seconds() / 60
# Association matrix between severity and distance, coordinates, duration, TMC and source.
associations(df.loc[:,('Severity', 'Distance(mi)', 'Start_Lat', 'Start_Lng', 'Duration_Length','TMC', 'Source')], figsize=(10,10))  

In [None]:
# Same association matrix as the previous cell (identical columns); relies on
# the Duration_Length column created there.
associations(df.loc[:,('Severity', 'Distance(mi)', 'Start_Lat', 'Start_Lng', 'Duration_Length','TMC', 'Source')], figsize=(10,10))  

It seems the geographical location features were the most correlated/associated with severity. Factors such as distance, side, weather condition, and the accident being located at a traffic stop also had a correlation/association value above |.1|. Correlation values range over [-1, 1], whereas association values range over [0, 1]. 


# Prepare Data for Models

After looking at the dython association comparisons above, it seems that many of these features are not helpful in predicting severity. I plan to use thresholds of .1, .15, and .2, keeping only the features whose association or correlation with Severity is at least the threshold in absolute value. 

In order to use these values in a model or autoencoder, they must be scaled and prepared to work best with TensorFlow. 

I also want to look at the natural language data features which I was not able to correlate or find association metrics for. I plan to use TFX to gather meaning behind the words as well as a TF-IDF to find potentially important words relating to the accidents and their severity. Applying NLP to the weather condition categories might be interesting as well.

Data that exists at a |.1| threshold or higher includes distance, side, city, state, county, airport code, and weather condition. I expect that city, county, and airport code may provide similar information and may not all be needed. 

In [None]:
import pandas as pd
import numpy as np
# Fix: this was `import matplotlib as plt`, which rebound `plt` to the top-level
# matplotlib package and broke every later `plt.figure` / `plt.plot` call until
# pyplot was imported again.
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Re-read the raw CSV so modelling starts from an unmodified frame (the
# exploration cells above add columns to `df` and convert Severity to strings).
df = pd.read_csv(pathToData + 'TX_Accidents_June20.csv')

In [None]:
# X is another name for the same DataFrame object as df (no copy is made).
X = df 

In [None]:
import tensorflow as tf

# Feature columns kept for modelling. The order of this list fixes the column
# order of X_ and therefore of the encoded matrices built later.
colsWeCareAbout = [
    'Distance(mi)', 'Weather_Condition', 'Airport_Code', 'County', 'City', 'Side',
    'Amenity', 'Bump', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout',
    'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
]
# Columns that get one-hot encoded later.
categoricalCols = ['Weather_Condition', 'County', 'City', 'Side', 'Airport_Code']
# True/False point-of-interest flags.
booleanCols = [
    'Amenity', 'Bump', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout',
    'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
]

# Feature matrix and target.
X_ = X.loc[:, colsWeCareAbout]
y = X.loc[:, 'Severity']

In [None]:
from sklearn.model_selection import train_test_split

inpt_dim = X_.shape[1]
# 60 % train / 20 % test / 20 % validation, with a fixed seed.
# Both splits are stratified on severity so every subset keeps the class
# proportions. Previously only the first split was stratified, so the test and
# validation sets could drift apart in the rare severity classes.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    X_, y, test_size=0.4, stratify=y, random_state=42)
x_test, x_validate, y_test, y_validate = train_test_split(
    x_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=42)

# Keep the integer labels before one-hot encoding the train/test targets.
# y_validate is left as integer labels.
y_train_values = y_train
y_test_values = y_test
y_test = pd.get_dummies(y_test)
y_train = pd.get_dummies(y_train)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_validate.shape, y_validate.shape)

(197570, 17) (197570, 4)
(65857, 17) (65857, 4)
(65857, 17) (65857,)


In [None]:
# Display the training features as the cell output.
x_train

Unnamed: 0,Distance(mi),Weather_Condition,Airport_Code,County,City,Side,Amenity,Bump,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal
259959,0.000,Clear,KRYW,Travis,Lago Vista,L,False,False,False,False,False,False,False,False,False,False,False
60126,0.000,Clear,KAUS,Travis,Austin,L,False,False,False,False,False,False,False,False,False,False,False
23248,0.000,Scattered Clouds,KRYW,Travis,Cedar Park,L,False,False,False,False,False,False,False,False,False,False,False
319455,1.436,Partly Cloudy,KSGR,Harris,Houston,R,False,False,False,False,False,False,False,False,False,False,False
325760,0.477,Partly Cloudy,KRBD,Dallas,Dallas,R,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104360,0.000,Mostly Cloudy,KMCJ,Harris,Houston,R,False,False,False,False,True,False,False,False,True,False,False
185881,0.110,Clear,KATT,Travis,Austin,R,True,False,False,False,False,False,False,True,False,False,True
172898,0.000,Scattered Clouds,KSAT,Bexar,San Antonio,R,False,False,False,False,False,False,False,False,False,False,True
202712,0.000,Partly Cloudy,KAUS,Travis,Austin,L,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Display the integer severity labels of the training rows.
y_train_values

259959    2
60126     2
23248     2
319455    4
325760    2
         ..
104360    2
185881    2
172898    2
202712    2
303289    3
Name: Severity, Length: 197570, dtype: int64

## Scaling of Numeric Data

Numeric data passing the |.1| correlation/association threshold:



In [None]:
# Distance column before scaling.
x_train['Distance(mi)']

259959    0.000
60126     0.000
23248     0.000
319455    1.436
325760    0.477
          ...  
104360    0.000
185881    0.110
172898    0.000
202712    0.000
303289    0.000
Name: Distance(mi), Length: 197570, dtype: float64

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise Distance(mi). The scaler is fitted on the training rows only and
# then applied unchanged to the test and validation rows.
scaler = StandardScaler()

# The splits are slices of X_; take explicit copies before assigning into them
# so pandas does not raise SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()
x_validate = x_validate.copy()

x_train[['Distance(mi)']] = scaler.fit_transform(x_train[['Distance(mi)']])
x_test[['Distance(mi)']] = scaler.transform(x_test[['Distance(mi)']])
x_validate[['Distance(mi)']] = scaler.transform(x_validate[['Distance(mi)']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [None]:
# Distance column after standardisation.
x_train['Distance(mi)']

259959   -0.173329
60126    -0.173329
23248    -0.173329
319455    2.024026
325760    0.556572
            ...   
104360   -0.173329
185881   -0.005008
172898   -0.173329
202712   -0.173329
303289   -0.173329
Name: Distance(mi), Length: 197570, dtype: float64

## Categorical Data Preparation

One-hot encoding does not work effectively (spatially) with data such as cities, because there are so many distinct values that each one-hot encoded vector becomes far larger than is practical. Instead, the Counting Features approach represents each categorical variable as two values: its frequency and its average target value. (Note: the code cells below still one-hot encode these columns with `pd.get_dummies`.)

This will include the preparation of side, city, county, airport code, and weather condition

f_freq = number of occurrences of category x / number of rows

f_avg = number of rows with category x and target i / number of rows with target i

In [None]:
# One-hot encode the categorical columns of each split independently and cast
# every column to float. The splits can end up with different dummy columns.
x_train, x_test, x_validate = (
    pd.get_dummies(splitFrame, columns=categoricalCols).astype(float)
    for splitFrame in (x_train, x_test, x_validate)
)

In [None]:
# Ensures one hot encoded columns are the same for the test and train dataset.
# https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data 

# reindex does everything the column-by-column loop did, in one step: columns
# that exist in train but not in the split are added filled with 0, columns
# unknown to train are dropped, and the column order follows the training set.
# Using 0.0 keeps the filler columns float like the rest.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0.0)
x_validate = x_validate.reindex(columns=x_train.columns, fill_value=0.0)

In [None]:
# Shapes of each feature matrix and its target after encoding and alignment.
for featureFrame, targetFrame in ((x_train, y_train), (x_test, y_test), (x_validate, y_validate)):
    print(featureFrame.shape, targetFrame.shape)

(197570, 1108) (197570, 4)
(65857, 1108) (65857, 4)
(65857, 1108) (65857,)


https://www.kite.com/python/answers/how-to-convert-a-pandas-dataframe-into-a-list-of-tuples-in-python use this

Why I don't translate my binary features to one hot encoded ones https://stackoverflow.com/questions/43515877/should-binary-features-be-one-hot-encoded 

This VAE was meant to generate more data for accidents with severity 1 and 4. It doesn't work.

In [None]:
from tensorflow.keras import backend as K

def sampling(z_params):
  """Draw a latent sample: z_mean + exp(0.5 * z_log_var) * noise.

  z_params is the pair (z_mean, z_log_var). The noise comes from
  K.random_normal and has one row per row of z_mean and one column per
  latent dimension.
  """
  mean, log_var = z_params
  n_rows = K.shape(mean)[0]
  n_dims = K.int_shape(mean)[1]
  noise = K.random_normal(shape=(n_rows, n_dims))
  return mean + K.exp(0.5 * log_var) * noise

In [None]:
# Give the one-hot target columns string names so they can sit next to the
# feature columns; errors="raise" fails if any of the four labels is absent.
severityColumnNames = {1: "Severity_1", 2: "Severity_2", 3: "Severity_3", 4: "Severity_4"}

y_train_tmp = y_train.rename(columns=severityColumnNames, errors="raise")
y_test_tmp = y_test.rename(columns=severityColumnNames, errors="raise")

In [None]:
# Training rows of severity 1 and severity 4 only (1s first, then 4s), features only.
trainWithTargets = pd.concat([x_train, y_train_tmp], axis=1, sort=False)
train1s = trainWithTargets[trainWithTargets["Severity_1"] == 1]
train4s = trainWithTargets[trainWithTargets["Severity_4"] == 1]

train_14_x = (
    pd.concat([train1s, train4s], axis=0, sort=False)
    .drop(columns=["Severity_1", "Severity_2", "Severity_3", "Severity_4"])
)

In [None]:
# Test rows of severity 1 and severity 4 only (1s first, then 4s); the target
# columns are still attached at this point.
testWithTargets = pd.concat([x_test, y_test_tmp], axis=1, sort=False)

test1s = testWithTargets[testWithTargets["Severity_1"] == 1]
test4s = testWithTargets[testWithTargets["Severity_4"] == 1]

test_14_x = pd.concat([test1s, test4s], axis=0, sort=False)

In [None]:
# Split the severity-1/4 test rows into targets (two columns) and features.
severityTargetCols = ["Severity_1", "Severity_2", "Severity_3", "Severity_4"]
test_14_y = test_14_x[["Severity_1", "Severity_4"]]
test_14_x = test_14_x.drop(columns=severityTargetCols)

In [None]:
# Build a variational autoencoder (encoder, decoder, combined model) on the
# severity-1/4 training features. The models are only constructed in this
# cell; nothing here compiles or fits them.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Activation, Input
from tensorflow.keras.layers import BatchNormalization, Lambda
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
import numpy as np
from tensorflow.keras.layers import Dropout

# Input width = number of feature columns.
inpt_dim = train_14_x.shape[1]
original_dim = inpt_dim
# Not used in this cell; the latent width below comes from `latent_dim`.
ltnt_dim = 2

input_shape = (inpt_dim, )
intermediate_dim = 512
# batch_size and epochs are defined here but not used in this cell.
batch_size = 1024
latent_dim = 128
epochs = 50

# Encoder: input -> Dense(512, relu) -> (z_mean, z_log_var) -> sampled z.
inputs = Input(shape=input_shape, name='encoder_input')
l1 = Dense(intermediate_dim, activation='relu')(inputs)
z_mean = Dense(latent_dim, name='z_mean')(l1)
z_log_var = Dense(latent_dim, name='z_log_var')(l1)

# `sampling` draws z from z_mean / z_log_var with random normal noise.
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

# Decoder: z -> Dense(512, relu) -> Dense(original_dim, sigmoid).
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
l2 = Dense(intermediate_dim, activation='relu')(latent_inputs)
outputs = Dense(original_dim, activation='sigmoid')(l2)

decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

# Full VAE: feed the sampled z (third encoder output) into the decoder.
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae_mlp')


Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 1108)]       0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 512)          567808      encoder_input[0][0]              
__________________________________________________________________________________________________
z_mean (Dense)                  (None, 128)          65664       dense[0][0]                      
__________________________________________________________________________________________________
z_log_var (Dense)               (None, 128)          65664       dense[0][0]                      
____________________________________________________________________________________________

# AE for Dimensionality Reduction

This Autoencoder is meant to reduce the dimensionality of the dataset from 1108 dimensions to 256 by using the trained encoder.

In [None]:
from tensorflow import keras
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input
from tensorflow.keras.layers import BatchNormalization
# Public API path; `tensorflow.python.keras` is a private module and should not
# be mixed with layers imported from `tensorflow.keras`.
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Input width is taken from the data (1108 columns in the recorded run) rather
# than hard-coded, so the model still matches if the encoded columns change.
inpt_dim = x_train.shape[1]
ltnt_dim = 256

# The data, split between train and test sets:
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# ----- Encoder: inpt_dim -> 2048 -> 1024 -> 512 -> 512 -> 256 -> ltnt_dim (tanh) -----
inpt_vec = Input(shape=(inpt_dim,))
el1 = Dense(2048, kernel_regularizer=regularizers.l1(0.0001))(inpt_vec)
el2 = Activation('relu')(el1)
el3 = Dense(1024)(el2)
el4 = BatchNormalization()(el3)
el5 = Activation('relu')(el4)
el6 = Dropout(0.2)(el5)

el7 = Dense(512, kernel_regularizer=regularizers.l1(0.0001))(el6)
el8 = Activation('relu')(el7)
el9 = Dense(512)(el8)
el10 = BatchNormalization()(el9)
el11 = Activation('relu')(el10)
el12 = Dropout(0.2)(el11)

el13 = Dense(256, kernel_regularizer=regularizers.l1(0.0001))(el12)
el14 = Activation('relu')(el13)
el15 = Dropout(0.2)(el14)
el16 = Dense(ltnt_dim)(el15)
el17 = BatchNormalization()(el16)
encoder = Activation('tanh')(el17)

# model that takes input and encodes it into the latent space
latent_ncdr = Model(inpt_vec, encoder)

# ----- Decoder: mirrors the encoder back up to inpt_dim (sigmoid output) -----
dl1 = Dense(256, kernel_regularizer=regularizers.l1(0.0001))(encoder)
dl2 = BatchNormalization()(dl1)
dl3 = Activation('relu')(dl2)

dl4 = Dropout(0.2)(dl3)
dl5 = Dense(512)(dl4)
dl6 = Activation('relu')(dl5)
dl7 = Dense(512, kernel_regularizer=regularizers.l1(0.0001))(dl6)
dl8 = BatchNormalization()(dl7)
dl9 = Activation('relu')(dl8)

dl10 = Dropout(0.2)(dl9)
dl11 = Dense(1024)(dl10)
dl12 = Activation('relu')(dl11)
dl13 = Dense(2048, kernel_regularizer=regularizers.l1(0.0001))(dl12)
dl14 = BatchNormalization()(dl13)
dl15 = Activation('relu')(dl14)
decoder = Dense(inpt_dim, activation='sigmoid') (dl15)

# model that takes input, encodes it, and decodes it
autoencoder = Model(inpt_vec, decoder)

# setup RMSprop optimizer
# NOTE(review): newer Keras optimizers may reject the `decay` argument - confirm
# against the installed TensorFlow version.
opt = keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6, )

# NOTE(review): the sigmoid output with binary cross-entropy assumes targets in
# [0, 1], but the standardised Distance(mi) column takes values outside that
# range - confirm this is acceptable.
autoencoder.compile(loss='binary_crossentropy', optimizer=opt)

# Train to reconstruct the input; the test split is used as validation data.
hist = autoencoder.fit(x_train, x_train, epochs=200, batch_size=1000, 
                       shuffle=True, validation_data=(x_test, x_test))

In [None]:
import matplotlib.pyplot as plt

# Plot the autoencoder's training and validation reconstruction loss per epoch
# and save the figure next to the notebook as AEloss.png.
fig = plt.figure(figsize=(10,6))
plt.plot(hist.history['loss'], color='#785ef0')
plt.plot(hist.history['val_loss'], color='#dc267f')
plt.title('Model reconstruction loss')
# Axis label previously read "Brinary"; the loss is binary cross-entropy.
plt.ylabel('Binary Cross-Entropy Loss')
plt.xlabel('Epoch')
# The 'Test Set' curve is val_loss, i.e. the x_test data passed to fit() as validation_data.
plt.legend(['Training Set', 'Test Set'], loc='upper right')
plt.savefig('AEloss.png', dpi=350, bbox_inches='tight')
plt.show()

In [None]:
# Persist the encoder and the full autoencoder as HDF5 files in the working
# directory so that later cells can reload them without retraining.
latent_ncdr.save("latent_ncdr.hdf5")
autoencoder.save("autoencoder.hdf5")

## UMAP Visualizations

In [None]:
from tensorflow import keras
# Reload the encoder and the full autoencoder from the HDF5 files in the
# working directory.
# NOTE(review): the recorded output of this cell is an OSError — presumably the
# files were not present in that session; run the save cell (or copy the files
# in) before this one.
latent_ncdr = keras.models.load_model('latent_ncdr.hdf5')
autoencoder = keras.models.load_model('autoencoder.hdf5')



OSError: ignored

In [None]:
# encdd: latent codes of the validation split; x_hat: its reconstructions.
# NOTE(review): the recorded output of this cell is a NameError — presumably
# x_validate or the models were not defined in that kernel session; run the
# data-preparation and model-loading cells first.
encdd = latent_ncdr.predict(x_validate)
x_hat = autoencoder.predict(x_validate)

NameError: ignored

UMAP Visualization of the encoded validation data

In [None]:
import matplotlib.pyplot as plt
import umap

# Integer severity labels, used to colour the scatter plot in the next cell.
y_umap = [int(severity) for severity in y_validate]

print(encdd.shape)

# Project the latent codes with UMAP's default settings. `encdd` is rebound to
# the projection, so re-running this cell alone would project the projection.
validation_reducer = umap.UMAP()
encdd = validation_reducer.fit_transform(encdd)
print(encdd.shape)


In [None]:
import seaborn as sns

# Scatter the UMAP projection of the encoded validation data, coloured by
# severity label, and save it as AEumap.png.
plt.figure(figsize=(10,8))
sns.set_style("white")

# The encoder emits a 256-dimensional latent vector (ltnt_dim); the previous
# title called it a "256-Layer Encoder", which misdescribed the network.
plt.title('UMAP of the 256-Dimensional Encoder Output')
plt.scatter(encdd[:,0], encdd[:,1], s=10.0, c=y_umap, alpha=0.75, cmap='inferno')
plt.xlabel('First UMAP dimension')
plt.ylabel('Second UMAP dimension')
plt.colorbar()
plt.savefig('AEumap.png', bbox_inches='tight', dpi=350)

UMAP Visualization of the Encoded Test Data

In [None]:
import matplotlib.pyplot as plt
import umap

# Encode the test split and keep its integer severity labels for colouring.
encdd = latent_ncdr.predict(x_test)
y_umap = [int(severity) for severity in y_test_values]

print(encdd.shape)

# Project the latent codes with UMAP's default settings; `encdd` is rebound to
# the projection that the next cell plots.
test_reducer = umap.UMAP()
encdd = test_reducer.fit_transform(encdd)
print(encdd.shape)

In [None]:
import seaborn as sns

# Scatter the UMAP projection of the encoded test data, coloured by severity.
plt.figure(figsize=(10,8))
sns.set_style("white")

# The encoder emits a 256-dimensional latent vector (ltnt_dim); the previous
# title called it a "256-Layer Encoder", which misdescribed the network.
plt.title('UMAP of the 256-Dimensional Encoder Output (Test Data)')
plt.scatter(encdd[:,0], encdd[:,1], s=10.0, c=y_umap, alpha=0.75, cmap='inferno')
plt.xlabel('First UMAP dimension')
plt.ylabel('Second UMAP dimension')
plt.colorbar()
# Saved under its own name: writing 'AEumap.png' here silently overwrote the
# validation-data figure that the earlier plotting cell saves to that file.
plt.savefig('AEumap_test.png', bbox_inches='tight', dpi=350)

UMAP visualization of the original validation data

In [None]:
import matplotlib.pyplot as plt
import umap

# Integer severity labels, used to colour the scatter plot in the next cell.
y_umap = [int(severity) for severity in y_validate]

print(x_validate.shape)

# Project the un-encoded validation features with UMAP's default settings,
# for comparison against the projection of the autoencoder's latent codes.
original_reducer = umap.UMAP()
original = original_reducer.fit_transform(x_validate)
print(original.shape)

In [None]:
import seaborn as sns

# Scatter the UMAP projection of the original (un-encoded) validation data,
# coloured by severity label, and save it as Originalumap.png.
plt.figure(figsize=(10,8))
sns.set_style("white")

# Title previously misspelled "Orignal".
plt.title('UMAP of Original Data')
plt.scatter(original[:,0], original[:,1], s=10.0, c=y_umap, alpha=0.75, cmap='inferno')
plt.xlabel('First UMAP dimension')
plt.ylabel('Second UMAP dimension')
plt.colorbar()
plt.savefig('Originalumap.png', bbox_inches='tight', dpi=350)

# Deep Neural Network Model Experiments

## Set Up

### Set up Class Weights

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Activation, Input
from tensorflow.keras.layers import BatchNormalization
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
import numpy as np
from tensorflow.keras.layers import Dropout

inpt_dim = x_train.shape[1]

In [None]:
# Code found with https://stackoverflow.com/questions/43481490/keras-class-weights-class-weight-for-one-hot-encoding
# Computes an initial class weight for each severity level based on its representation in the data set

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# "Balanced" (inverse-frequency) weight for each severity level 1-4.
# `classes` and `y` are passed by keyword: scikit-learn's signature is
# compute_class_weight(class_weight, *, classes, y), so the former positional
# call fails on current releases. `classes` is given as an ndarray, as documented.
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.array([1, 2, 3, 4]),
                                     y=y_train_values)
# Re-key the weights by position 0-3, the form handed to Keras `class_weight`.
# NOTE(review): assumes index i of the one-hot targets is severity i + 1 — confirm.
d_class_weights = dict(enumerate(class_weights))

In [None]:
d_class_weights

{0: 76.93535825545172,
 1: 0.3520391435739537,
 2: 0.9180762081784387,
 3: 17.490262039660056}

### Shared Methods 

Contains a method that plots the loss of a model

In [None]:
import matplotlib.pyplot as plt
def plotLoss(hist):
  """Draw the training and validation loss curves of a fitted Keras model.

  Args:
    hist: object whose `history` mapping holds the per-epoch 'loss' and
      'val_loss' sequences (what `Model.fit` returns in this notebook).

  Returns:
    None. The figure is shown with `plt.show()`.
  """
  plt.figure(figsize=(10,6))

  # One line per series: training loss first, then validation loss, so the
  # legend entries below line up with the curves.
  for series, colour in (('loss', '#785ef0'), ('val_loss', '#dc267f')):
    plt.plot(hist.history[series], color=colour)

  plt.title('Model Loss Progress')
  plt.ylabel('Categorical Cross-Entropy Loss')
  plt.xlabel('Epoch')
  plt.legend(['Training Set', 'Test Set'], loc='upper right')
  plt.show()

## Launch TensorBoard

In [None]:
# Load the TensorBoard notebook extension
%reload_ext tensorboard

In [None]:
# Clear any logs from previous runs
! rm -rf ./logs/

In [None]:
import datetime
# Timestamped directory that the Keras TensorBoard callbacks write their logs to.
logdir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp

If TensorBoard does not load, change the port to another unused value and rerun the cell.

In [None]:
%tensorboard --logdir logs --host localhost --port 5001

## Hyper-Parameter Tuning Deep Neural Network

Searches for the best combination of number of units, dropout rate, L2 regularization strength, and batch size for the dense neural network.


In [None]:
# Search space for the grid search. The first-layer width is 1x, 2x or 3x 1108.
# The optimizer is not part of the search (train_test_model compiles with 'adam').
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([1108 * factor for factor in (1, 2, 3)]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.2, 0.4, 0.6]))
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([2000, 5000, 10000]))
HP_L2 = hp.HParam('l2', hp.Discrete([0.001, 0.0001]))

# Name of the scalar logged for every trial: the balanced error rate.
METRIC_ERROR = 'BER'

# Register the hyper-parameters and the metric so the TensorBoard HParams
# dashboard knows which columns to show.
tuning_writer = tf.summary.create_file_writer('logs/hparam_tuning')
with tuning_writer.as_default():
  hp.hparams_config(
      hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_BATCH_SIZE, HP_L2],
      metrics=[hp.Metric(METRIC_ERROR, display_name='BER')],
  )


In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Softmax
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.python.keras import regularizers

def train_test_model(hparams):
  """Train one dense severity classifier on the raw features and score it.

  The network is a funnel of Dropout -> Dense(relu, L2) -> BatchNormalization
  blocks. After the first block the width is halved and the dropout rate is
  divided by 1.5 for each further block, for as long as the width stays above
  4. A 4-unit softmax layer produces the class probabilities.

  Reads the notebook globals x_train, y_train, x_test, y_test, x_validate,
  y_validate, d_class_weights and logdir.

  Args:
    hparams: mapping indexed by the HP_NUM_UNITS, HP_DROPOUT, HP_L2 and
      HP_BATCH_SIZE HParam objects, holding the values for this trial.

  Returns:
    The balanced error rate (1 - balanced accuracy) measured on
    x_validate / y_validate.
  """
  l2_strength = hparams[HP_L2]

  # `shape` must be a tuple; the original `(1108)` is just the integer 1108.
  inpt_vec = Input(shape=(1108,))

  units = hparams[HP_NUM_UNITS]
  dr = hparams[HP_DROPOUT]

  dl = Dropout(dr)(inpt_vec)
  dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_strength))(dl)
  # The original bound this result to `d1` (digit one), so the normalised
  # tensor was discarded and BatchNormalization never became part of the model.
  dl = BatchNormalization()(dl)

  units = units//2
  dr = dr/1.5
  while units > 4:
    dl = Dropout(dr)(dl)
    dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_strength))(dl)
    dl = BatchNormalization()(dl)
    units = units//2
    dr = dr/1.5

  output = Dense(4, activation=tf.nn.softmax)(dl)

  deepnet = Model(inpt_vec, output)

  deepnet.compile(loss=tf.keras.losses.categorical_crossentropy, optimizer='adam')
  deepnet.summary()

  # A plain list: the original had a trailing comma after the closing bracket,
  # which turned `callbacks` into a 1-tuple wrapping the list.
  # NOTE(review): EarlyStopping's patience (15) is shorter than
  # ReduceLROnPlateau's (20), so the learning rate may rarely be reduced
  # before training stops — confirm this is intended.
  callbacks = [
      tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1), # log metrics
      hp.KerasCallback(logdir, hparams),  # log hparams
      ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=20,
                        min_delta=1e-4, mode='min'),
      EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
  ]

  # x_test / y_test act as the per-epoch validation data during fitting.
  hist = deepnet.fit(x_train, y_train, batch_size=hparams[HP_BATCH_SIZE], epochs=200,
                     callbacks=callbacks, shuffle=True, verbose=0,
                     validation_data=(x_test, y_test), class_weight=d_class_weights)

  plotLoss(hist)

  # The most probable output neuron is the prediction; +1 maps output
  # index 0-3 onto the severity labels 1-4.
  y_hat = deepnet.predict(x_validate)
  print(y_hat)
  y_pred = np.argmax(y_hat, axis=1) + 1
  print(y_pred)

  y_true = y_validate   # this is the ground truth
  labels = [1, 2, 3, 4]

  print(classification_report(y_true, y_pred, labels=labels))

  cm = confusion_matrix(y_true, y_pred, labels=labels)
  print(cm)

  ber = 1 - balanced_accuracy_score(y_true, y_pred)
  print("ber:", ber)

  return ber


In [None]:
def run(run_dir, hparams):
  """Run one hyper-parameter trial and log it for the HParams dashboard.

  Args:
    run_dir: directory the trial's summary writer logs into.
    hparams: mapping of HParam objects to the values used in this trial.
  """
  trial_writer = tf.summary.create_file_writer(run_dir)
  with trial_writer.as_default():
    # Record the trial's hyper-parameter values, then its resulting metric.
    hp.hparams(hparams)
    trial_ber = train_test_model(hparams)
    tf.summary.scalar(METRIC_ERROR, trial_ber, step=1)

In [None]:
# Exhaustive grid search: one trial per combination of units, dropout rate,
# L2 strength and batch size, each logged under its own run directory.
session_num = 0

for num_units in HP_NUM_UNITS.domain.values:
  for dropout_rate in HP_DROPOUT.domain.values:
    for l2 in HP_L2.domain.values:
      for batch_size in HP_BATCH_SIZE.domain.values:
        # Values for this trial, keyed by the HParam objects themselves.
        hparams = {
            HP_NUM_UNITS: num_units,
            HP_DROPOUT: dropout_rate,
            HP_L2: l2,
            HP_BATCH_SIZE: batch_size,
        }
        run_name = "deep-neural-net-run-%d" % session_num
        print('--- Starting trial: %s' % run_name)
        # Print the trial's settings by hyper-parameter name.
        print({h.name: hparams[h] for h in hparams})
        run('logs/hparam_tuning/' + run_name, hparams)
        session_num += 1


## Hyper Parameter Tuning Deep Neural Network with AutoEncoder Input

In [None]:
from tensorflow import keras
import datetime
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp
latent_ncdr = keras.models.load_model('latent_ncdr.hdf5')

In [None]:
! rm -rf ./logs/ae

In [None]:
# Timestamped directory handed to the Keras TensorBoard callbacks for the
# autoencoder-input trials.
# NOTE(review): this path is under logs/fit/ae/, whereas the neighbouring
# cleanup and %tensorboard cells use logs/ae/ — confirm the fit logs are meant
# to live outside that tree.
logdir = "logs/fit/ae/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
%tensorboard --logdir logs/ae/ --host localhost --port 5010

In [None]:

# Register the hyper-parameters and the BER metric for the autoencoder-input
# search, so the HParams dashboard knows which columns to show.
ae_tuning_writer = tf.summary.create_file_writer('logs/ae/hparam_tuning')
with ae_tuning_writer.as_default():
  hp.hparams_config(
      hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_BATCH_SIZE, HP_L2],
      metrics=[hp.Metric(METRIC_ERROR, display_name='BER')],
  )


In [None]:
# Push each data split through the trained encoder once, so that every trial
# below reuses the same latent representations.
x_encdd_train, x_encdd_test, x_encdd_validate = (
    latent_ncdr.predict(split) for split in (x_train, x_test, x_validate)
)

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Softmax
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.python.keras import regularizers

def train_test_model(hparams):
  """Train one dense severity classifier on the encoded features and score it.

  NOTE: this definition reuses the name `train_test_model`, so once this cell
  runs it replaces the raw-feature version defined earlier in the notebook.

  The network is a funnel of Dropout -> Dense(relu, L2) -> BatchNormalization
  blocks. After the first block the width is halved and the dropout rate is
  divided by 1.5 for each further block, for as long as the width stays above
  4. A 4-unit softmax layer produces the class probabilities.

  Reads the notebook globals x_encdd_train, x_encdd_test, x_encdd_validate,
  y_train, y_test, y_validate, d_class_weights and logdir.

  Args:
    hparams: mapping indexed by the HP_NUM_UNITS, HP_DROPOUT, HP_L2 and
      HP_BATCH_SIZE HParam objects, holding the values for this trial.

  Returns:
    The balanced error rate (1 - balanced accuracy) measured on
    x_encdd_validate / y_validate.
  """
  l2_strength = hparams[HP_L2]

  # `shape` must be a tuple; the original `(256)` is just the integer 256.
  inpt_vec = Input(shape=(256,))

  units = hparams[HP_NUM_UNITS]
  dr = hparams[HP_DROPOUT]

  dl = Dropout(dr)(inpt_vec)
  dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_strength))(dl)
  # The original bound this result to `d1` (digit one), so the normalised
  # tensor was discarded and BatchNormalization never became part of the model.
  dl = BatchNormalization()(dl)

  units = units//2
  dr = dr/1.5
  while units > 4:
    dl = Dropout(dr)(dl)
    dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_strength))(dl)
    dl = BatchNormalization()(dl)
    units = units//2
    dr = dr/1.5

  output = Dense(4, activation=tf.nn.softmax)(dl)

  deepnet = Model(inpt_vec, output)

  deepnet.compile(loss=tf.keras.losses.categorical_crossentropy, optimizer='adam')
  deepnet.summary()

  # A plain list: the original had a trailing comma after the closing bracket,
  # which turned `callbacks` into a 1-tuple wrapping the list.
  # NOTE(review): EarlyStopping's patience (15) is shorter than
  # ReduceLROnPlateau's (20), so the learning rate may rarely be reduced
  # before training stops — confirm this is intended.
  callbacks = [
      tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1), # log metrics
      hp.KerasCallback(logdir, hparams),  # log hparams
      ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=20,
                        min_delta=1e-4, mode='min'),
      EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
  ]

  # The encoded test split acts as the per-epoch validation data during fitting.
  hist = deepnet.fit(x_encdd_train, y_train, batch_size=hparams[HP_BATCH_SIZE], epochs=200,
                     callbacks=callbacks, shuffle=True, verbose=0,
                     validation_data=(x_encdd_test, y_test), class_weight=d_class_weights)

  plotLoss(hist)

  # The most probable output neuron is the prediction; +1 maps output
  # index 0-3 onto the severity labels 1-4.
  y_hat = deepnet.predict(x_encdd_validate)
  print(y_hat)
  y_pred = np.argmax(y_hat, axis=1) + 1
  print(y_pred)

  y_true = y_validate   # this is the ground truth
  labels = [1, 2, 3, 4]

  print(classification_report(y_true, y_pred, labels=labels))

  cm = confusion_matrix(y_true, y_pred, labels=labels)
  print(cm)

  ber = 1 - balanced_accuracy_score(y_true, y_pred)
  print("ber:", ber)

  return ber

In [None]:
def run(run_dir, hparams):
  """Execute a single trial and write its settings and score as summaries.

  Args:
    run_dir: directory the trial's summary writer logs into.
    hparams: mapping of HParam objects to the values used in this trial.
  """
  summary_writer = tf.summary.create_file_writer(run_dir)
  with summary_writer.as_default():
    # Log the hyper-parameter values first, then the metric they produced.
    hp.hparams(hparams)
    balanced_error = train_test_model(hparams)
    tf.summary.scalar(METRIC_ERROR, balanced_error, step=1)

In [None]:
# Exhaustive grid search over the same hyper-parameter domains, this time for
# the classifier fed with autoencoder-encoded inputs; one run directory per trial.
session_num = 0

for num_units in HP_NUM_UNITS.domain.values:
  for dropout_rate in HP_DROPOUT.domain.values:
    for l2 in HP_L2.domain.values:
      for batch_size in HP_BATCH_SIZE.domain.values:
        # Values for this trial, keyed by the HParam objects themselves.
        hparams = {
            HP_NUM_UNITS: num_units,
            HP_DROPOUT: dropout_rate,
            HP_L2: l2,
            HP_BATCH_SIZE: batch_size,
        }
        run_name = "AE-deep-neural-net-run-%d" % session_num
        print('--- Starting trial: %s' % run_name)
        # Print the trial's settings by hyper-parameter name.
        print({h.name: hparams[h] for h in hparams})
        run('logs/ae/hparam_tuning/' + run_name, hparams)
        session_num += 1


## Deep Neural Network with AutoEncoder Input

In [None]:
from tensorflow import keras
latent_ncdr = keras.models.load_model('latent_ncdr.hdf5')

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Softmax
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.python.keras import regularizers

# Dense classifier trained on the autoencoder's 256-dimensional latent codes,
# using fixed hyper-parameters and an L1 kernel penalty of 0.0001.
num_units = 1108
dropout = 0.2
batch_size = 10000

# Encode every data split once with the trained encoder.
x_encdd_train = latent_ncdr.predict(x_train)
x_encdd_test = latent_ncdr.predict(x_test)
x_encdd_validate = latent_ncdr.predict(x_validate)

# `shape` must be a tuple; the original `(256)` is just the integer 256.
inpt_vec = Input(shape=(256,))

units = num_units
dr = dropout

dl = Dropout(dr)(inpt_vec)
dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l1(0.0001))(dl)
# The original bound this result to `d1` (digit one), so the normalised tensor
# was discarded and BatchNormalization never became part of the model.
dl = BatchNormalization()(dl)

# Funnel: halve the width and divide the dropout rate by 1.5 for each further
# block, for as long as the width stays above 4.
units = units//2
dr = dr/1.5
while units > 4:
  dl = Dropout(dr)(dl)
  dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l1(0.0001))(dl)
  dl = BatchNormalization()(dl)
  units = units//2
  dr = dr/1.5

output = Dense(4, activation=tf.nn.softmax)(dl)

deepnet = Model(inpt_vec, output)

deepnet.compile(loss=tf.keras.losses.categorical_crossentropy, optimizer='adam')
deepnet.summary()

# A plain list: the original had a trailing comma after the closing bracket,
# which turned `callbacks` into a 1-tuple wrapping the list.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=20,
                      min_delta=1e-4, mode='min'),
    EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
]

# The encoded test split acts as the per-epoch validation data during fitting.
hist = deepnet.fit(x_encdd_train, y_train, batch_size=batch_size, epochs=200,
                   callbacks=callbacks, shuffle=True, verbose=0,
                   validation_data=(x_encdd_test, y_test), class_weight=d_class_weights)

plotLoss(hist)

# The most probable output neuron is the prediction; +1 maps output index 0-3
# onto the severity labels 1-4.
y_hat = deepnet.predict(x_encdd_validate)
print(y_hat)
y_pred = np.argmax(y_hat, axis=1) + 1
print(y_pred)

# Shows which severity classes the model ever predicts.
print(np.unique(y_pred))

y_true = y_validate   # this is the ground truth
labels = [1, 2, 3, 4]

print(classification_report(y_true, y_pred, labels=labels))

cm = confusion_matrix(y_true, y_pred, labels=labels)
print(cm)

# Balanced error rate = 1 - balanced accuracy.
ber = 1 - balanced_accuracy_score(y_true, y_pred)
print("ber:", ber)

deepnet.save("deepnetWithAutoEncoderData.hdf5")

## Final Model - no class weights

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Softmax
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.python.keras import regularizers

# Final architecture trained on the raw 1108 features WITHOUT class weights,
# as a baseline for the class-weighted final model.
num_units = 1108
dropout = 0.2
batch_size = 5000

# `shape` must be a tuple; the original `(1108)` is just the integer 1108.
inpt_vec = Input(shape=(1108,))

units = num_units
dr = dropout

dl = Dropout(dr)(inpt_vec)
dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.0001))(dl)
# The original bound this result to `d1` (digit one), so the normalised tensor
# was discarded and BatchNormalization never became part of the model.
dl = BatchNormalization()(dl)

# Funnel: halve the width and divide the dropout rate by 1.5 for each further
# block, for as long as the width stays above 4.
units = units//2
dr = dr/1.5
while units > 4:
  dl = Dropout(dr)(dl)
  dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.0001))(dl)
  dl = BatchNormalization()(dl)
  units = units//2
  dr = dr/1.5

output = Dense(4, activation=tf.nn.softmax)(dl)

deepnet = Model(inpt_vec, output)

deepnet.compile(loss=tf.keras.losses.categorical_crossentropy, optimizer='adam')
deepnet.summary()

# A plain list: the original had a trailing comma after the closing bracket,
# which turned `callbacks` into a 1-tuple wrapping the list.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=20,
                      min_delta=1e-4, mode='min'),
    EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
]

# No class_weight argument here: every sample contributes equally to the loss.
hist = deepnet.fit(x_train, y_train, batch_size=batch_size, epochs=200,
                   callbacks=callbacks, shuffle=True, verbose=0,
                   validation_data=(x_test, y_test))

plotLoss(hist)

# The most probable output neuron is the prediction; +1 maps output index 0-3
# onto the severity labels 1-4.
y_hat = deepnet.predict(x_validate)
print(y_hat)
y_pred = np.argmax(y_hat, axis=1) + 1
print(y_pred)

# Shows which severity classes the model ever predicts.
print(np.unique(y_pred))

y_true = y_validate   # this is the ground truth
print(y_validate)
print(y_true)
labels = [1, 2, 3, 4]

print(classification_report(y_true, y_pred, labels=labels))

cm = confusion_matrix(y_true, y_pred, labels=labels)
print(cm)

# Balanced error rate = 1 - balanced accuracy.
ber = 1 - balanced_accuracy_score(y_true, y_pred)
print("ber:", ber)

deepnet.save("finalModelNoClassWeights.hdf5")

# Final Model

In [None]:
# Final model: dense funnel on the raw 1108 features, trained with the
# per-class weights in d_class_weights.
num_units = 1108
dropout = 0.2
batch_size = 5000

# `shape` must be a tuple; the original `(1108)` is just the integer 1108.
inpt_vec = Input(shape=(1108,))

units = num_units
dr = dropout

dl = Dropout(dr)(inpt_vec)
dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.0001))(dl)
# The original bound this result to `d1` (digit one), so the normalised tensor
# was discarded and BatchNormalization never became part of the model.
dl = BatchNormalization()(dl)

# Funnel: halve the width and divide the dropout rate by 1.5 for each further
# block, for as long as the width stays above 4.
units = units//2
dr = dr/1.5
while units > 4:
  dl = Dropout(dr)(dl)
  dl = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.0001))(dl)
  dl = BatchNormalization()(dl)
  units = units//2
  dr = dr/1.5

output = Dense(4, activation=tf.nn.softmax)(dl)

deepnet = Model(inpt_vec, output)

deepnet.compile(loss=tf.keras.losses.categorical_crossentropy, optimizer='adam')
deepnet.summary()

# A plain list: the original had a trailing comma after the closing bracket,
# which turned `callbacks` into a 1-tuple wrapping the list.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=20,
                      min_delta=1e-4, mode='min'),
    EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
]

# x_test / y_test act as the per-epoch validation data during fitting.
hist = deepnet.fit(x_train, y_train, batch_size=batch_size, epochs=200,
                   callbacks=callbacks, shuffle=True, verbose=0,
                   validation_data=(x_test, y_test), class_weight=d_class_weights)

plotLoss(hist)

# The most probable output neuron is the prediction; +1 maps output index 0-3
# onto the severity labels 1-4.
y_hat = deepnet.predict(x_validate)
print(y_hat)
y_pred = np.argmax(y_hat, axis=1) + 1
print(y_pred)

y_true = y_validate   # this is the ground truth
labels = [1, 2, 3, 4]

print(classification_report(y_true, y_pred, labels=labels))

cm = confusion_matrix(y_true, y_pred, labels=labels)
print(cm)

# Balanced error rate = 1 - balanced accuracy.
ber = 1 - balanced_accuracy_score(y_true, y_pred)
print("ber:", ber)

deepnet.save("FinalModel.hdf5")

NameError: ignored

## 