In [924]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool
import seaborn as sns
from bokeh.palettes import Spectral10, Category10
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import seaborn as sns
output_notebook()

In [786]:
df=pd.read_csv("CT_Accidents.csv", low_memory=False)

The original dataset included 49 states worth of accident data, so I restricted it to CT and saved it as a new csv so we don't have to load a 1Gb csv each time we open it. 

In [389]:
def comparison_plot(x,Y, Yhat):
    '''Plots Predicted vs True values for analysis of regression'''
    comparison_plot = figure(title='Difference between Measured and Predicted Values')
    comparison_plot.xaxis.axis_label='x'
    comparison_plot.yaxis.axis_label='Yhat-Y'
    comparison_plot.scatter(x=x,y=Yhat-Y)
    comparison_plot.line(x=[x.min(),x.max()],y=[0,0])
    return comparison_plot

This is a function that will be useful for our linear progression portion.

In [787]:
df.replace({False : int(0), True : int(1)}, inplace=True)

Lots of the features are boolean, so I just switched them to integers. 

In [788]:
df.replace({'Fair':int(0),'Clear':int(0), 
            'Partly Cloudy':int(1),'Scattered Clouds':int(1),'Haze':int(1),'Patches of Fog':int(1),
            'Cloudy':int(2), 'Mostly Cloudy':int(2), 'Overcast':int(2), 'Fair / Windy':int(2),'Mist':int(2),'Smoke':int(2),'Haze / Windy':int(2),
            'Light Rain':int(3),'Cloudy / Windy':int(3),'Fog':int(3), 'Light Drizzle':int(3), 'Thunder in the Vicinity':int(3),'Drizzle':int(3),'Light Freezing Fog':int(3),
            'Light Rain with Thunder':int(3),'Partly Cloudy / Windy':int(3),'Mostly Cloudy / Windy':int(3),'Fog / Windy':int(3),'Heavy Drizzle':int(3),'N/A Precipitation':int(3),
            'Rain':int(4),'Light Snow':int(4),'Heavy Rain':int(4),'Light Rain / Windy':int(4),'Snow':int(4),'Wintry Mix':int(4),'T-Storm':int(4),
            'Heavy T-Storm':int(4), 'Heavy Snow':int(4), 'Snow / Windy': int(4),'Heavy Rain / Windy':int(4),'Rain / Windy':int(4),'Thunder':int(4),
            'Light Freezing Rain':int(4),'Heavy T-Storm / Windy':int(4),'Light Sleet':int(4),'Ice Pellets':int(4),'Heavy Thunderstorms and Rain':int(4),
            'Light Thunderstorms and Rain':int(4), 'Thunderstorms and Rain':int(4),'Heavy Thunderstorms and Snow':int(4),'Light Ice Pellets':int(4),
            'Light Freezing Drizzle':int(4),'Heavy Snow / Windy':int(4),'T-Storm / Windy':int(4),'Thunderstorm':int(4),'Light Snow / Windy':int(4)}, inplace=True)
df['Weather_Condition'].value_counts()

0.0    13463
2.0     9138
3.0     2822
1.0     2691
4.0     1523
Name: Weather_Condition, dtype: int64

In [740]:
print(df['Wind_Chill(F)'].value_counts())
print(df['Wind_Chill(F)'].isnull().sum())



73.0    557
72.0    527
70.0    486
71.0    465
66.0    462
       ... 
20.6      1
30.2      1
11.6      1
21.5      1
12.2      1
Name: Wind_Chill(F), Length: 420, dtype: int64
4760


I've classified the weather condition features. These are kind of up to interpretation, and actually ended up making the score of the logistic regression slightly less accurate. These should probably be tweaked; then again, maybe there are other feature columns that really ought not be in our logistic regression that is throwing off our data. This dropped the score about 1-2%, and while its still in a decent range getting our accuracy closer to 75% would be best I think.

In [789]:
df['Sunrise_Sunset']=df['Sunrise_Sunset'].replace('Day',int(0))
df['Sunrise_Sunset']=df['Sunrise_Sunset'].replace('Night',int(1))
df.rename(columns={'Sunrise_Sunset':'Night'}, inplace=True)
df.iloc[:,44].value_counts()


0.0    18624
1.0    11125
Name: Night, dtype: int64

The 'Sunrise_Sunset' feature actually just tells us if something is night or day. These have been switched to integers.

In [790]:
df['Precipitation(in)']=df['Precipitation(in)']*10
df.rename(columns={'Precipitation(in)':'Precipitation(tenth_in)'}, inplace=True)


for i in df.columns:
    if df[i].dtype == float:
        df[i] = df[i].round().astype('Int64')

Here I switched the 'Precipitation' feature to be measured in tenths of an inch. This is because most of these values in inches are floats in a range of 0-2 inches, so when they are rounded and changed to integers the data is simplified to  values of either 0,1, or 2. By changing these values to tenths, we retain more accuracy in our data. This may be something we want to do with some other features, but I have not gone through all of them yet.

The loop is just changing every float value in each column to an integer.

In [736]:
rain = df.copy(deep=True)
print(rain['Precipitation(tenth_in)'].mean())
rain['Precipitation(tenth_in)']=df['Precipitation(tenth_in)'].replace(int(0),np.nan)

rain = rain.dropna(subset=['Precipitation(tenth_in)'])


rain['Precipitation(tenth_in)'].mean()



2.989315234191251


79.6490599820949

In [791]:
df['Precipitation(tenth_in)'] = df['Precipitation(tenth_in)'].fillna(value=1000)
df.loc[(df['Weather_Condition']==int(4)) & (df['Precipitation(tenth_in)']==1000)  ,'Precipitation(tenth_in)'] = int(2)
df.loc[(df['Weather_Condition']!=int(4)) & (df['Precipitation(tenth_in)']==1000)  ,'Precipitation(tenth_in)'] = int(0)


In [155]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29762 entries, 0 to 29761
Data columns (total 48 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               29762 non-null  int64 
 1   ID                       29762 non-null  object
 2   Severity                 29762 non-null  int64 
 3   Start_Time               29762 non-null  object
 4   End_Time                 29762 non-null  object
 5   Start_Lat                29762 non-null  Int64 
 6   Start_Lng                29762 non-null  Int64 
 7   End_Lat                  29762 non-null  Int64 
 8   End_Lng                  29762 non-null  Int64 
 9   Distance(mi)             29762 non-null  Int64 
 10  Description              29762 non-null  object
 11  Number                   3630 non-null   Int64 
 12  Street                   29762 non-null  object
 13  Side                     29762 non-null  object
 14  City                     29762 non-nul

In [839]:
data=np.genfromtxt('CT_Accidents.csv',delimiter=',',skip_header=1)


In [969]:
features = df.columns[[2,22,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44]]
data = df[features].dropna()
data[features].to_csv(r"C:\Users\kylep\Desktop\MATH_3094\Project\data.csv")




In [970]:
data.head()

Unnamed: 0,Severity,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(tenth_in),Weather_Condition,Amenity,Bump,...,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Night
0,0,48,100,30,3,6,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,48,100,30,3,5,0,3,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,49,83,30,10,8,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,49,93,30,10,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,57,47,30,10,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [971]:
data_features = ['Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)','Precipitation(tenth_in)','Weather_Condition',
'Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal','Turning_Loop','Night']
x = data.loc[:,data_features].values
y = data.loc[:,['Severity']].values

In [972]:
data = data.to_numpy(dtype=int)


In [973]:
data_centered = data - np.mean(data,axis=0)
np.mean(data_centered,axis=0)

array([ 3.18539753e-17, -1.21045106e-15, -2.93056572e-15, -8.32185104e-16,
        6.35088632e-16, -3.66320716e-16, -5.35047241e-18,  1.45333762e-16,
        3.11073977e-20,  0.00000000e+00,  2.48859182e-18, -8.55453437e-20,
       -1.54292693e-17, -7.77684943e-21,  1.10431262e-18,  0.00000000e+00,
       -2.64412881e-19, -1.24429591e-18,  0.00000000e+00, -2.98631018e-18,
        0.00000000e+00,  1.79178611e-17])

In [974]:
data_centered = StandardScaler().fit_transform(data_centered)


In [975]:
secret_label = data[:,0]



I think this is how I tag severity to be what gets colored in the scatter plot below.

In [976]:
data[:3,:]


array([[  0,  48, 100,  30,   3,   6,   0,   3,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,  48, 100,  30,   3,   5,   0,   3,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   1],
       [  0,  49,  83,  30,  10,   8,   0,   1,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   1]])

In [977]:
P=PCA(n_components=2)
PC = P.fit_transform(data_centered)
PC.shape

(28552, 2)

In [978]:
colors=['red','green','blue','orange','black']
color_list = [colors[int(secret_label[i])] for i in range(20000)]

scatter_plot=figure(title='sklearn version')
scatter_plot.scatter(x=PC[:,0],y=PC[:,1],color=color_list)
show(scatter_plot)



In [968]:
P.components_


array([[-8.47887059e-03, -1.45084262e-01,  5.20645924e-01,
        -1.67446809e-01, -5.33444304e-01, -6.26319266e-02,
         2.92550906e-01,  5.27445154e-01, -1.10884193e-02,
        -1.61558713e-27, -7.39605373e-03, -5.17294628e-03,
         6.22042152e-03, -5.87858945e-03, -1.29580831e-03,
        -0.00000000e+00,  7.93690717e-03, -9.42411628e-03,
        -0.00000000e+00,  4.59767113e-03, -0.00000000e+00,
         1.64455936e-01],
       [ 1.38304692e-01, -3.37492359e-01,  1.31149307e-01,
         1.09277423e-01,  8.88093303e-02, -3.87869031e-01,
        -2.20340351e-01, -1.54247815e-01,  8.85748685e-02,
        -4.23516474e-22,  3.72183815e-01,  3.59981210e-02,
        -2.86263886e-02,  3.95893193e-02,  2.87394727e-01,
         0.00000000e+00,  2.47168265e-01,  1.73189279e-01,
         0.00000000e+00,  2.82603007e-01,  0.00000000e+00,
         4.51177867e-01]])

In [892]:
D = np.dot(data_centered.transpose(),data_centered)/200
def r(i,j):
    return D[i,j]/np.sqrt(D[i,i]*D[j,j])
for i in range(15):
    print(i, r(0,i))

0 1.0
1 -0.028804909844240664
2 -0.009538904982592952
3 0.004009865248472393
4 0.01610883232836885
5 0.03844770144100919
6 -0.011844170424250181
7 0.005432622450085898
8 0.026084942315820682
9 nan
10 0.05673496724111351
11 0.013729322175549175
12 -0.04182849297838616
13 -0.007689718455288791
14 -0.004208441802148157


  return D[i,j]/np.sqrt(D[i,i]*D[j,j])


In [893]:
L, P = np.linalg.eigh(D)
L = L[::-1]

In [894]:
eigenvalue_plot = figure(title='Eigenvalues of Covariance Matrix')
eigenvalue_plot.scatter(x=range(L.shape[0]),y=L,size=8)
eigenvalue_plot.line(x=range(L.shape[0]),y=L,color='cyan')
show(eigenvalue_plot)

In [896]:
PC2 = np.dot(data_centered,P[:,-2::-1])

In [902]:
colors=['red','green','blue','orange','black']
color_list = [colors[int(secret_label[i])] for i in range(20000)]
scatter_plot = figure(title='Plot of First Two Principal Components with secret labels',x_range=(-3,3),y_range=(-3,3))
scatter_plot.scatter(x=PC2[:,0],y=PC2[:,1],color=color_list)
show(scatter_plot)



In [903]:

scatter_plot = figure(title='Plot of First Two Principal Components',x_range=(-3,3),y_range=(-3,3))
scatter_plot.scatter(x=PC2[:,0],y=PC2[:,1],color=color_list)
show(scatter_plot)



Above I just quickly put together a data variable that takes our dataframe and formats it into arrays. I think this will be a necessary first step in preparing the data for Linear Regression.

In [955]:
df['Severity']=df['Severity'].replace(int(1),int(0))
df['Severity']=df['Severity'].replace(int(2),int(0))
df['Severity']=df['Severity'].replace(int(3),int(1))
df['Severity']=df['Severity'].replace(int(4),int(1))

df['Severity'].value_counts()


0    24018
1     4534
Name: Severity, dtype: int64

In [638]:
df['Severity'].value_counts()

0    24766
1     4996
Name: Severity, dtype: int64

Here I have reformated the 'Severity' feature that can act as our target. While we can use the original 4 classes for Linear Regression, restricting it to 2 will let us do our Binary Logistic Regression. I just did this after the Linear & PCA section, but we could always just make a new dataset out of this parameter.

In the original format, there are only 4 accidents with 'Severity'=1, so switching the classes to just two really shouldnt skew the data much at all. 

In [792]:
df = df.dropna(subset=['Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)','Weather_Condition','Night'])
print(df.isnull().values.any())


True


There's a good amount of nan values in each of our numerical feature columns. Here I just dropped every row with them, but we can assess which variables may be better to replace nan values with the average of that feature.

In [711]:
features = df.columns[[22,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44]]
L=LogisticRegression(max_iter=10000, solver='lbfgs')
x_train, x_test, y_train, y_test = train_test_split(df[features].values, df['Severity'].values)
L.fit(x_train,y_train)

LogisticRegression(max_iter=10000)

I just through in the numerical values into the logistic regression.

In [712]:
L.score(x_test,y_test)

0.8464555898010647

In [683]:
L.intercept_

array([-1.13507635])

In [685]:
L.coef_

array([[-0.00230053, -0.00314791, -0.02331955,  0.01753462,  0.02610082,
        -0.05054402,  0.0607984 ,  0.50487024,  0.        ,  0.51452048,
         0.74406656, -0.27290744, -0.38815108, -1.20794382,  0.        ,
        -0.30070735,  1.5044522 ,  0.        ,  1.40297067,  0.        ,
         0.062982  ]])

Around 70% is an okay start but hopefully we can improve. Another thing I haven't done yet is mess with the features that can be considered dependent variable. There might not be any though, since it looks like the variables dealing with time of day have been organized so there's probably no overlap. I havent formated those types of features to numerical values yet either. I feel like once we include those, our logistic regression should improve as well.

In [676]:
df[features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28552 entries, 0 to 29761
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Severity                 28552 non-null  int64
 1   Temperature(F)           28552 non-null  Int64
 2   Humidity(%)              28552 non-null  Int64
 3   Pressure(in)             28552 non-null  Int64
 4   Visibility(mi)           28552 non-null  Int64
 5   Wind_Speed(mph)          28552 non-null  Int64
 6   Precipitation(tenth_in)  28552 non-null  Int64
 7   Weather_Condition        28552 non-null  Int64
 8   Amenity                  28552 non-null  int64
 9   Bump                     28552 non-null  int64
 10  Crossing                 28552 non-null  int64
 11  Give_Way                 28552 non-null  int64
 12  Junction                 28552 non-null  int64
 13  No_Exit                  28552 non-null  int64
 14  Railway                  28552 non-null  int64
 15  Ro