In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn import metrics
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
from sklearn.preprocessing import Imputer

In [2]:
data = pd.read_csv("MVC.csv")
print(data.columns)

Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5',
       'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'NUMTIME'],
      dtype='object')


In [3]:
feature_cols = data.columns[4]
resp_col = data.columns[11]
x = data[feature_cols].copy()
y = data[resp_col].copy()

In [4]:
x=x>0
y=y>0
x = x.dropna()
y = y.dropna()
pred = np.array(x)
response = np.array(y)
pred = pred.reshape(-1,1)
response = response.reshape(-1,1)
print(pred.shape)
print(response.shape)

(1048575, 1)
(1048575, 1)


In [5]:
linreg = LinearRegression()
linreg.fit(pred,response)
end = linreg.predict(pred)
print("The slope is: ",linreg.coef_)
print("The y-intercept is",linreg.intercept_)
print(metrics.r2_score(y,end))
print(metrics.mean_squared_error(y, end))

The slope is:  [[-0.00016556]]
The y-intercept is [0.0011991]
2.3397311851303115e-06
0.001048891477098241


In [6]:
def borough(x):
    if x=="QUEENS":
        return 1
    if x=='BROOKLYN':
        return 2
    if x=='MANHATTAN':
        return 3
    if x=='BRONX':
        return 4
    if x=='STATEN ISLAND':
        return 5

In [7]:
data['BOROUGH']=data['BOROUGH'].apply(borough)
print(borough)

<function borough at 0x1a3859b710>


In [8]:
valuablecols=['NUMTIME','BOROUGH']
def replace_missing_value(data, valuablecols):

    imputer = Imputer(strategy="median")
    df_num = data[valuablecols]
    imputer.fit(df_num)
    X = imputer.transform(df_num)
    res_def = pd.DataFrame(X, columns=df_num.columns)
    return res_def
data["NUMTIME"] = replace_missing_value(data,["NUMTIME"])
data = data.dropna(how='any', subset=['BOROUGH','NUMTIME','NUMBER OF PERSONS INJURED'])
print(data.isnull().any())



CRASH DATE                       False
CRASH TIME                       False
BOROUGH                          False
ZIP CODE                          True
LATITUDE                          True
LONGITUDE                         True
LOCATION                          True
ON STREET NAME                    True
CROSS STREET NAME                 True
OFF STREET NAME                   True
NUMBER OF PERSONS INJURED        False
NUMBER OF PERSONS KILLED          True
NUMBER OF PEDESTRIANS INJURED    False
NUMBER OF PEDESTRIANS KILLED     False
NUMBER OF CYCLIST INJURED        False
NUMBER OF CYCLIST KILLED         False
NUMBER OF MOTORIST INJURED       False
NUMBER OF MOTORIST KILLED        False
CONTRIBUTING FACTOR VEHICLE 1     True
CONTRIBUTING FACTOR VEHICLE 2     True
CONTRIBUTING FACTOR VEHICLE 3     True
CONTRIBUTING FACTOR VEHICLE 4     True
CONTRIBUTING FACTOR VEHICLE 5     True
COLLISION_ID                     False
VEHICLE TYPE CODE 1               True
VEHICLE TYPE CODE 2      

In [9]:
def injured_to_binary(x):
        if x>= 1:
            return 1
        else:
            return 0
data['affected']=data['NUMBER OF PERSONS INJURED']+data['NUMBER OF PERSONS KILLED']
data['affected'] = data['affected'].apply(injured_to_binary)


In [11]:
logreg = LogisticRegression(C=1e9)
valuablecols=['NUMTIME','BOROUGH']
X = data[valuablecols]
y = data['affected']
pred = np.array(X)
response = np.array(y)
pred = pred.reshape(-1,1)
response = response.reshape(-1,1)
logreg.fit(X, y)
outcome_pred_class_log = logreg.predict(X)
print("The slope is: ",logreg.coef_)
print("The y-intercept is",logreg.intercept_)
print(metrics.r2_score(y,outcome_pred_class_log))



The slope is:  [[ 0.01370695 -0.03968007]]
The y-intercept is [-1.53431646]
-0.23732710189587514


In [12]:
feature_cols = data.columns[10]
resp_col = data.columns[11]
x = data[feature_cols].copy()
y = data[resp_col].copy()

In [13]:
x=x>0
y=y>0
x = x.dropna()
y = y.dropna()
pred = np.array(x)
response = np.array(y)
pred = pred.reshape(-1,1)
response = response.reshape(-1,1)
print(pred.shape)
print(response.shape)

(687819, 1)
(687819, 1)


In [14]:
linreg = LinearRegression()
linreg.fit(pred,response)
end = linreg.predict(pred)
print("The slope is: ",linreg.coef_)
print("The y-intercept is",linreg.intercept_)
print(metrics.r2_score(y,end))
print(metrics.mean_squared_error(y, end))

The slope is:  [[0.00014008]]
The y-intercept is [0.00090226]
3.2677256224245e-06
0.0009281573609632766


In [15]:
print(data["CONTRIBUTING FACTOR VEHICLE 1"].value_counts())
data = data.drop(data[data['CONTRIBUTING FACTOR VEHICLE 1']=='1'].index)
data = data.drop(data[data['CONTRIBUTING FACTOR VEHICLE 1']=='Listening/Using Headphones'].index)
data = data.drop(data[data['CONTRIBUTING FACTOR VEHICLE 1']=='Cell Phone (hand-held)'].index)
data = data.drop(data[data['CONTRIBUTING FACTOR VEHICLE 1']=='Windshield Inadequate'].index)
data = data.drop(data[data['CONTRIBUTING FACTOR VEHICLE 1']=='Shoulders Defective/Improper'].index)
data = data.drop(data[data['CONTRIBUTING FACTOR VEHICLE 1']=='Texting'].index)
data = data.drop(data[data['CONTRIBUTING FACTOR VEHICLE 1']=='Headlights Defective'].index)
data = data.drop(data[data['CONTRIBUTING FACTOR VEHICLE 1']=='Using On Board Navigation Device'].index)
print(data["CONTRIBUTING FACTOR VEHICLE 1"].value_counts())

Unspecified                       196771
Driver Inattention/Distraction    154867
Failure to Yield Right-of-Way      49617
Backing Unsafely                   36709
Following Too Closely              36607
                                   ...  
Windshield Inadequate                 20
Texting                               17
Cell Phone (hand-held)                17
Listening/Using Headphones            10
1                                      8
Name: CONTRIBUTING FACTOR VEHICLE 1, Length: 61, dtype: int64
Unspecified                                              196771
Driver Inattention/Distraction                           154867
Failure to Yield Right-of-Way                             49617
Backing Unsafely                                          36709
Following Too Closely                                     36607
Passing Too Closely                                       26692
Passing or Lane Usage Improper                            24910
Other Vehicular                        

In [16]:
print(print(data["CONTRIBUTING FACTOR VEHICLE 2"].value_counts()))
value_counts = data["CONTRIBUTING FACTOR VEHICLE 2"].value_counts(ascending=True)
remove = value_counts[value_counts <= 20].index
data = data[~data["CONTRIBUTING FACTOR VEHICLE 2"].isin(remove)]
print(print(data["CONTRIBUTING FACTOR VEHICLE 2"].value_counts()))

Unspecified                       478989
Driver Inattention/Distraction     37042
Other Vehicular                    11362
Failure to Yield Right-of-Way       6511
Following Too Closely               5961
                                   ...  
Shoulders Defective/Improper           3
Vehicle Vandalism                      3
Cell Phone (hand-held)                 3
Texting                                2
1                                      2
Name: CONTRIBUTING FACTOR VEHICLE 2, Length: 61, dtype: int64
None
Unspecified                                              478989
Driver Inattention/Distraction                            37042
Other Vehicular                                           11362
Failure to Yield Right-of-Way                              6511
Following Too Closely                                      5961
Passing or Lane Usage Improper                             5899
Passing Too Closely                                        4733
Backing Unsafely                  

In [17]:
print(print(data["CONTRIBUTING FACTOR VEHICLE 3"].value_counts()))


Unspecified                                              36038
Other Vehicular                                            701
Driver Inattention/Distraction                             358
Following Too Closely                                      247
Fatigued/Drowsy                                             56
Passing or Lane Usage Improper                              54
Pavement Slippery                                           45
Driver Inexperience                                         35
Backing Unsafely                                            29
Alcohol Involvement                                         28
Traffic Control Disregarded                                 24
Reaction to Uninvolved Vehicle                              22
Failure to Yield Right-of-Way                               22
Unsafe Speed                                                20
Passing Too Closely                                         20
Turning Improperly                                     

In [18]:
print(print(data["CONTRIBUTING FACTOR VEHICLE 4"].value_counts()))


Unspecified                       8177
Other Vehicular                    148
Driver Inattention/Distraction      24
Following Too Closely               19
Fatigued/Drowsy                     10
Pavement Slippery                    8
Alcohol Involvement                  5
Traffic Control Disregarded          5
Unsafe Speed                         5
Reaction to Uninvolved Vehicle       5
Passing or Lane Usage Improper       4
Backing Unsafely                     3
Failure to Yield Right-of-Way        3
Passing Too Closely                  2
Fell Asleep                          2
Aggressive Driving/Road Rage         2
Drugs (illegal)                      2
Brakes Defective                     2
Outside Car Distraction              1
Turning Improperly                   1
Other Electronic Device              1
Driver Inexperience                  1
Failure to Keep Right                1
Drugs (Illegal)                      1
Driverless/Runaway Vehicle           1
Animals Action           