In [1]:
import numpy as np
import pandas as pd
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [286]:
df=pd.read_csv("CT_Accidents.csv", low_memory=False)

The original dataset included 49 states worth of accident data, so I restricted it to CT and saved it as a new csv so we don't have to load a 1Gb csv each time we open it. 

In [94]:
def comparison_plot(x,Y, Yhat):
    '''Plots Predicted vs True values for analysis of regression'''
    comparison_plot = figure(title='Difference between Measured and Predicted Values')
    comparison_plot.xaxis.axis_label='x'
    comparison_plot.yaxis.axis_label='Yhat-Y'
    comparison_plot.scatter(x=x,y=Yhat-Y)
    comparison_plot.line(x=[x.min(),x.max()],y=[0,0])
    return comparison_plot

This is a function that will be useful for our linear progression portion.

In [262]:
df.replace({False : int(0), True : int(1)}, inplace=True)

Lots of the features are boolean, so I just switched them to integers. 

In [263]:
df.replace({'Fair':int(0),'Clear':int(0), 
            'Partly Cloudy':int(1),'Scattered Clouds':int(1),'Haze':int(1),'Patches of Fog':int(1),
            'Cloudy':int(2), 'Mostly Cloudy':int(2), 'Overcast':int(2), 'Fair / Windy':int(2),'Mist':int(2),'Smoke':int(2),'Haze / Windy':int(2),
            'Light Rain':int(3),'Cloudy / Windy':int(3),'Fog':int(3), 'Light Drizzle':int(3), 'Thunder in the Vicinity':int(3),'Drizzle':int(3),'Light Freezing Fog':int(3),
            'Light Rain with Thunder':int(3),'Partly Cloudy / Windy':int(3),'Mostly Cloudy / Windy':int(3),'Fog / Windy':int(3),'Heavy Drizzle':int(3),'N/A Precipitation':int(3),
            'Rain':int(4),'Light Snow':int(4),'Heavy Rain':int(4),'Light Rain / Windy':int(4),'Snow':int(4),'Wintry Mix':int(4),'T-Storm':int(4),
            'Heavy T-Storm':int(4), 'Heavy Snow':int(4), 'Snow / Windy': int(4),'Heavy Rain / Windy':int(4),'Rain / Windy':int(4),'Thunder':int(4),
            'Light Freezing Rain':int(4),'Heavy T-Storm / Windy':int(4),'Light Sleet':int(4),'Ice Pellets':int(4),'Heavy Thunderstorms and Rain':int(4),
            'Light Thunderstorms and Rain':int(4), 'Thunderstorms and Rain':int(4),'Heavy Thunderstorms and Snow':int(4),'Light Ice Pellets':int(4),
            'Light Freezing Drizzle':int(4),'Heavy Snow / Windy':int(4),'T-Storm / Windy':int(4),'Thunderstorm':int(4),'Light Snow / Windy':int(4)}, inplace=True)
df['Weather_Condition'].value_counts()

0.0    13463
2.0     9138
3.0     2822
1.0     2691
4.0     1523
Name: Weather_Condition, dtype: int64

I've classified the weather condition features. These are kind of up to interpretation, and actually ended up making the score of the logistic regression slightly less accurate. These should probably be tweaked; then again, maybe there are other feature columns that really ought not be in our logistic regression that is throwing off our data. This dropped the score about 1-2%, and while its still in a decent range getting our accuracy closer to 75% would be best I think.

In [289]:

df['Civil_Twilight'].value_counts()
df['Sunrise_Sunset'].value_counts()


Day      18624
Night    11125
Name: Sunrise_Sunset, dtype: int64

In [266]:
df['Sunrise_Sunset']=df['Sunrise_Sunset'].replace('Day',int(0))
df['Sunrise_Sunset']=df['Sunrise_Sunset'].replace('Night',int(1))
df.rename(columns={'Sunrise_Sunset':'Night'}, inplace=True)
df.iloc[:,44].value_counts()


The 'Sunrise_Sunset' feature actually just tells us if something is night or day. These have been switched to integers.

In [272]:
df['Precipitation(in)']=df['Precipitation(in)']*10
df.rename(columns={'Precipitation(in)':'Precipitation(tenth_in)'}, inplace=True)


for i in df.columns:
    if df[i].dtype == float:
        df[i] = df[i].round().astype('Int64')

Here I switched the 'Precipitation' feature to be measured in tenths of an inch. This is because most of these values in inches are floats in a range of 0-2 inches, so when they are rounded and changed to integers the data is simplified to  values of either 0,1, or 2. By changing these values to tenths, we retain more accuracy in our data. This may be something we want to do with some other features, but I have not gone through all of them yet.

The loop is just changing every float value in each column to an integer.

In [154]:
df['Precipitation(tenth_in)'].value_counts()

0     22953
1       594
2       234
3        52
4        39
5        36
8        15
7        11
6        10
22        2
14        1
13        1
11        1
9         1
Name: Precipitation(tenth_in), dtype: Int64

In [155]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29762 entries, 0 to 29761
Data columns (total 48 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               29762 non-null  int64 
 1   ID                       29762 non-null  object
 2   Severity                 29762 non-null  int64 
 3   Start_Time               29762 non-null  object
 4   End_Time                 29762 non-null  object
 5   Start_Lat                29762 non-null  Int64 
 6   Start_Lng                29762 non-null  Int64 
 7   End_Lat                  29762 non-null  Int64 
 8   End_Lng                  29762 non-null  Int64 
 9   Distance(mi)             29762 non-null  Int64 
 10  Description              29762 non-null  object
 11  Number                   3630 non-null   Int64 
 12  Street                   29762 non-null  object
 13  Side                     29762 non-null  object
 14  City                     29762 non-nul

In [26]:
data=np.genfromtxt('CT_Accidents.csv',delimiter=',',skip_header=1)

In [32]:
data.shape

(29762, 48)

Above I just quickly put together a data variable that takes our dataframe and formats it into arrays. I think this will be a necessary first step in preparing the data for Linear Regression.

In [273]:
df['Severity']=df['Severity'].replace(int(1),int(0))
df['Severity']=df['Severity'].replace(int(2),int(0))
df['Severity']=df['Severity'].replace(int(3),int(1))
df['Severity']=df['Severity'].replace(int(4),int(1))

df['Severity'].value_counts()


0    24766
1     4996
Name: Severity, dtype: int64

In [164]:
df['Visibility(tenth_mi)'].value_counts()

100    24090
90       695
20       632
80       558
30       556
10       555
70       542
50       455
40       443
60       422
2        218
5        174
8        101
25        79
15        40
12        33
18        31
0         26
1         12
9          1
Name: Visibility(tenth_mi), dtype: Int64

Here I have reformated the 'Severity' feature that can act as our target. While we can use the original 4 classes for Linear Regression, restricting it to 2 will let us do our Binary Logistic Regression. I just did this after the Linear & PCA section, but we could always just make a new dataset out of this parameter.

In the original format, there are only 4 accidents with 'Severity'=1, so switching the classes to just two really shouldnt skew the data much at all. 

In [274]:
df= df.dropna()
print(df.isnull().values.any())


False


There's a good amount of nan values in each of our numerical feature columns. Here I just dropped every row with them, but we can assess which variables may be better to replace nan values with the average of that feature.

In [280]:
features = df.columns[[22,23,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44]]
L=LogisticRegression(max_iter=10000, solver='lbfgs')
x_train, x_test, y_train, y_test = train_test_split(df[features].values, df['Severity'].values)
L.fit(x_train,y_train)

LogisticRegression(max_iter=10000)

I just through in the numerical values into the logistic regression.

In [281]:
L.score(x_test,y_test)

0.6827371695178849

Around 70% is an okay start but hopefully we can improve. Another thing I haven't done yet is mess with the features that can be considered dependent variable. There might not be any though, since it looks like the variables dealing with time of day have been organized so there's probably no overlap. I havent formated those types of features to numerical values yet either. I feel like once we include those, our logistic regression should improve as well.

In [207]:
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,31581,A-31582,2,2016-11-30 15:58:59,2016-11-30 21:58:59,41.77461,-72.57781,41.76576,-72.62613,2.564,...,False,False,False,False,False,False,Day,Day,Day,Day
1,31603,A-31604,2,2016-11-30 16:53:13,2016-11-30 22:53:13,41.73486,-72.66351,41.72755,-72.66619,0.524,...,False,False,False,False,False,False,Night,Night,Day,Day
2,31738,A-31739,2,2016-12-01 06:26:05,2016-12-01 12:26:05,41.10143,-73.43527,41.10643,-73.41528,1.097,...,False,False,False,False,False,False,Night,Night,Day,Day
3,31793,A-31794,2,2016-12-01 08:05:04,2016-12-01 14:05:04,41.28436,-72.94815,41.2692,-72.97343,1.679,...,False,False,False,False,False,False,Day,Day,Day,Day
4,31896,A-31897,2,2016-12-01 11:54:46,2016-12-01 17:54:46,41.15443,-73.24525,41.16724,-73.22949,1.206,...,False,False,False,False,False,False,Day,Day,Day,Day
