In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime as dt
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
data_path = r'data/clean_df.csv.gz'
df_import = pd.read_csv(data_path, parse_dates=True)# index_col=0)
df_import.head()

Unnamed: 0.1,Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,...,DISTANCE FROM MADISON SQUARE GARDEN,DISTANCE FROM PORT AUTHORITY,DISTANCE FROM GRAND ARMY PLAZA,DISTANCE FROM TRIBECA,DISTANCE FROM BATTERY PARK,DISTANCE FROM SOHO,DISTANCE FROM CONEY ISLAND,DAYS FROM NEW YEARS,DAYS FROM CHRISTMAS,DAYS FROM JULY 4
0,0,2017-06-26 00:00:00,2020-09-22 22:00:00,MANHATTAN,10002.0,40.71827,-73.99072,POINT (-73.99072 40.71827),ALLEN STREET,BROOME STREET,...,0.032134,0.03888,0.051023,0.017988,0.029482,0.01327,0.144167,176 days,-182 days,-8 days
1,1,2017-07-14 00:00:00,2020-09-22 13:17:00,,10028.0,40.78318,-73.94362,POINT (-73.94362 40.78318),FDR DRIVE,,...,0.059596,0.054004,0.113396,0.093249,0.107943,0.08433,0.209438,194 days,-164 days,10 days
2,2,2017-07-04 00:00:00,2020-09-22 14:00:00,BRONX,10462.0,40.833557,-73.85774,POINT (-73.85774 40.833557),WESTCHESTER AVENUE,PUGSLEY AVENUE,...,0.159107,0.153556,0.195316,0.191071,0.205098,0.182365,0.281697,184 days,-174 days,0 days
3,3,2017-07-17 00:00:00,2020-09-22 21:03:00,MANHATTAN,10032.0,40.833843,-73.94851,POINT (-73.94851 40.833843),RIVERSIDE DRIVE,WEST 155 STREET,...,0.094805,0.087647,0.162559,0.132012,0.147135,0.123243,0.259294,197 days,-161 days,13 days
4,4,2017-06-26 00:00:00,2020-09-22 15:35:00,BROOKLYN,11203.0,40.654434,-73.92139,POINT (-73.92139 40.654434),REMSEN AVENUE,LINDEN BOULEVARD,...,0.119852,0.124044,0.050176,0.106925,0.106393,0.106784,0.09307,176 days,-182 days,-8 days


In [3]:
df = df_import.copy()

In [4]:
df['CASUALTIES?'] = 0
mask = df['TOTAL PEDESTRIAN CASUALTIES'] != 0
df.loc[mask, 'CASUALTIES?'] = 1
df.loc[df['TOTAL PEDESTRIAN CASUALTIES'] != 1, ['TOTAL PEDESTRIAN CASUALTIES','CASUALTIES?']].sample(5)

Unnamed: 0,TOTAL PEDESTRIAN CASUALTIES,CASUALTIES?
857565,0,0
884961,0,0
654363,0,0
1049000,0,0
1477999,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1486656 entries, 0 to 1486655
Data columns (total 56 columns):
 #   Column                               Non-Null Count    Dtype  
---  ------                               --------------    -----  
 0   Unnamed: 0                           1486656 non-null  int64  
 1   CRASH DATE                           1486656 non-null  object 
 2   CRASH TIME                           1486656 non-null  object 
 3   BOROUGH                              1143388 non-null  object 
 4   ZIP CODE                             1486656 non-null  float64
 5   LATITUDE                             1486656 non-null  float64
 6   LONGITUDE                            1486656 non-null  float64
 7   LOCATION                             1486656 non-null  object 
 8   ON STREET NAME                       1203377 non-null  object 
 9   CROSS STREET NAME                    980930 non-null   object 
 10  OFF STREET NAME                      221574 non-null   object 
 11

In [6]:
def dummy_pipeline(cat_feature):
    global df
    global feature_names
    if 'NUMBER' in cat_feature:
        dummies = pd.get_dummies(df[cat_feature], prefix=cat_feature)
    else:
        dummies = pd.get_dummies(df[cat_feature])
    df = df.join(dummies)
    feature_names.extend(list(dummies.columns))

def log_reg_pipeline(feature_names):
    
    X = df[feature_names]
    y = df['CASUALTIES?']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
    
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    y_train_pred = log_reg.predict(X_train)
    y_test_pred = log_reg.predict(X_test)
    
    return log_reg, f1_score(y_train, y_train_pred), f1_score(y_test, y_test_pred)

### Logistic regression feature development

Notes: Create a table. Include model, training/testing error and list of feature names.

In [7]:
feature_names = ['LATITUDE']
print(feature_names)

['LATITUDE']


In [8]:
log_reg_1, train_1, test_1 = log_reg_pipeline(feature_names)
print(train_1, test_1)

0.0 0.0


In [9]:
feature_names.append('LONGITUDE')
print(feature_names)

['LATITUDE', 'LONGITUDE']


In [10]:
log_reg_2, train_2, test_2 = log_reg_pipeline(feature_names)
print(train_2, test_2)

0.0 0.0


In [11]:
dummy_pipeline('BOROUGH')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND']


In [12]:
log_reg_3, train_3, test_3 = log_reg_pipeline(feature_names)
print(train_3, test_3)

0.0 0.0


In [13]:
dummy_pipeline('SEASON')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter']


In [14]:
log_reg_4, train_4, test_4 = log_reg_pipeline(feature_names)
print(train_4, test_4)

0.0 0.0


In [15]:
feature_names.append('WEEKEND')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter', 'WEEKEND']


In [16]:
log_reg_5, train_5, test_5 = log_reg_pipeline(feature_names)
print(train_5, test_5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.0 0.0


In [17]:
feature_names.append('DISTANCE FROM TIMES SQUARE')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter', 'WEEKEND', 'DISTANCE FROM TIMES SQUARE']


In [18]:
log_reg_6, train_6, test_6 = log_reg_pipeline(feature_names)
print(train_6, test_6)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.0 0.0


In [19]:
feature_names.append('DISTANCE FROM CONEY ISLAND')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter', 'WEEKEND', 'DISTANCE FROM TIMES SQUARE', 'DISTANCE FROM CONEY ISLAND']


In [20]:
log_reg_7, train_7, test_7 = log_reg_pipeline(feature_names)
print(train_7, test_7)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.0 0.0


In [21]:
feature_names.append('DURING DAYTIME')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter', 'WEEKEND', 'DISTANCE FROM TIMES SQUARE', 'DISTANCE FROM CONEY ISLAND', 'DURING DAYTIME']


In [22]:
log_reg_8, train_8, test_8 = log_reg_pipeline(feature_names)
print(train_8, test_8)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.0 0.0


In [23]:
dummy_pipeline('WEEKDAY')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter', 'WEEKEND', 'DISTANCE FROM TIMES SQUARE', 'DISTANCE FROM CONEY ISLAND', 'DURING DAYTIME', 'FRIDAY', 'MONDAY', 'SATURDAY', 'SUNDAY', 'THURSDAY', 'TUESDAY', 'WEDNESDAY']


In [24]:
log_reg_9, train_9, test_9 = log_reg_pipeline(feature_names)
print(train_9, test_9)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.0 0.0


In [25]:
dummy_pipeline('CONTRIBUTING FACTOR VEHICLE 1')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter', 'WEEKEND', 'DISTANCE FROM TIMES SQUARE', 'DISTANCE FROM CONEY ISLAND', 'DURING DAYTIME', 'FRIDAY', 'MONDAY', 'SATURDAY', 'SUNDAY', 'THURSDAY', 'TUESDAY', 'WEDNESDAY', '1', '80', 'Accelerator Defective', 'Aggressive Driving/Road Rage', 'Alcohol Involvement', 'Animals Action', 'Backing Unsafely', 'Brakes Defective', 'Cell Phone (hand-Held)', 'Cell Phone (hand-held)', 'Cell Phone (hands-free)', 'Driver Inattention/Distraction', 'Driver Inexperience', 'Driverless/Runaway Vehicle', 'Drugs (Illegal)', 'Drugs (illegal)', 'Eating or Drinking', 'Failure to Keep Right', 'Failure to Yield Right-of-Way', 'Fatigued/Drowsy', 'Fell Asleep', 'Following Too Closely', 'Glare', 'Headlights Defective', 'Illnes', 'Illness', 'Lane Marking Improper/Inadequate', 'Listening/Using Headphones', 'Lost Consciousness', 'Obstruction/Debris', 'Other Electronic Device', 'Other Lighting Defects',

In [26]:
log_reg_10, train_10, test_10 = log_reg_pipeline(feature_names)
print(train_10, test_10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.07442334939076226 0.07331018991219114


In [27]:
dummy_pipeline('NUMBER OF MOTORIST KILLED')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter', 'WEEKEND', 'DISTANCE FROM TIMES SQUARE', 'DISTANCE FROM CONEY ISLAND', 'DURING DAYTIME', 'FRIDAY', 'MONDAY', 'SATURDAY', 'SUNDAY', 'THURSDAY', 'TUESDAY', 'WEDNESDAY', '1', '80', 'Accelerator Defective', 'Aggressive Driving/Road Rage', 'Alcohol Involvement', 'Animals Action', 'Backing Unsafely', 'Brakes Defective', 'Cell Phone (hand-Held)', 'Cell Phone (hand-held)', 'Cell Phone (hands-free)', 'Driver Inattention/Distraction', 'Driver Inexperience', 'Driverless/Runaway Vehicle', 'Drugs (Illegal)', 'Drugs (illegal)', 'Eating or Drinking', 'Failure to Keep Right', 'Failure to Yield Right-of-Way', 'Fatigued/Drowsy', 'Fell Asleep', 'Following Too Closely', 'Glare', 'Headlights Defective', 'Illnes', 'Illness', 'Lane Marking Improper/Inadequate', 'Listening/Using Headphones', 'Lost Consciousness', 'Obstruction/Debris', 'Other Electronic Device', 'Other Lighting Defects',

In [28]:
log_reg_11, train_11, test_11 = log_reg_pipeline(feature_names)
print(train_11, test_11)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.07395385261437537 0.07300115874855156


In [29]:
dummy_pipeline('NUMBER OF MOTORIST INJURED')
print(feature_names)

['LATITUDE', 'LONGITUDE', 'BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'fall', 'spring', 'summer', 'winter', 'WEEKEND', 'DISTANCE FROM TIMES SQUARE', 'DISTANCE FROM CONEY ISLAND', 'DURING DAYTIME', 'FRIDAY', 'MONDAY', 'SATURDAY', 'SUNDAY', 'THURSDAY', 'TUESDAY', 'WEDNESDAY', '1', '80', 'Accelerator Defective', 'Aggressive Driving/Road Rage', 'Alcohol Involvement', 'Animals Action', 'Backing Unsafely', 'Brakes Defective', 'Cell Phone (hand-Held)', 'Cell Phone (hand-held)', 'Cell Phone (hands-free)', 'Driver Inattention/Distraction', 'Driver Inexperience', 'Driverless/Runaway Vehicle', 'Drugs (Illegal)', 'Drugs (illegal)', 'Eating or Drinking', 'Failure to Keep Right', 'Failure to Yield Right-of-Way', 'Fatigued/Drowsy', 'Fell Asleep', 'Following Too Closely', 'Glare', 'Headlights Defective', 'Illnes', 'Illness', 'Lane Marking Improper/Inadequate', 'Listening/Using Headphones', 'Lost Consciousness', 'Obstruction/Debris', 'Other Electronic Device', 'Other Lighting Defects',

In [30]:
log_reg_12, train_12, test_12 = log_reg_pipeline(feature_names)
print(train_12, test_12)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.0816625861699367 0.08141856451558323


### Logistic regression table
| Iteration | Features | F1 (training data) | F1 (test data) | Notes |
| :---: | :--- | :---: | :---: | :---: |
| 1 | <ol><li>LATITUDE</li></ol> | 0.0 | 0.0 | |
| 2 | <ol><li>LATITUDE</li><li>LONGITUDE</li></ol> | 0.0 | 0.0 | |
| 3 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li></ol> | 0.0 | 0.0 | |
| 4 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li></ol> | 0.0 | 0.0 | |
| 5 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li><li>WEEKEND</li></ol> | 0.0 | 0.0 | STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. |
| 6 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li><li>WEEKEND</li><li>DISTANCE FROM TIMES SQUARE</li></ol> | 0.0 | 0.0 | STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. |
| 7 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li><li>WEEKEND</li><li>DISTANCE FROM TIMES SQUARE</li><li>DISTANCE FROM CONEY ISLAND</li></ol> | 0.0 | 0.0 | STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. |
| 8 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li><li>WEEKEND</li><li>DISTANCE FROM TIMES SQUARE</li><li>DISTANCE FROM CONEY ISLAND</li><li>DURING DAYTIME</li></ol> | 0.0 | 0.0 | STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. |
| 9 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li><li>WEEKEND</li><li>DISTANCE FROM TIMES SQUARE</li><li>DISTANCE FROM CONEY ISLAND</li><li>DURING DAYTIME</li><li>WEEKDAY</li></ol> | 0.0 | 0.0 | STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. |
| 10 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li><li>WEEKEND</li><li>DISTANCE FROM TIMES SQUARE</li><li>DISTANCE FROM CONEY ISLAND</li><li>DURING DAYTIME</li><li>WEEKDAY</li><li>CONTRIBUTING FACTOR VEHICLE 1</li></ol> | 0.0744 | 0.0733 | STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. |
| 11 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li><li>WEEKEND</li><li>DISTANCE FROM TIMES SQUARE</li><li>DISTANCE FROM CONEY ISLAND</li><li>DURING DAYTIME</li><li>WEEKDAY</li><li>CONTRIBUTING FACTOR VEHICLE 1</li><li>NUMBER OF MOTORIST KILLED</li></ol> | 0.0740 | 0.0730 | STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. |
| 12 | <ol><li>LATITUDE</li><li>LONGITUDE</li><li>BOROUGH</li><li>SEASON</li><li>WEEKEND</li><li>DISTANCE FROM TIMES SQUARE</li><li>DISTANCE FROM CONEY ISLAND</li><li>DURING DAYTIME</li><li>WEEKDAY</li><li>CONTRIBUTING FACTOR VEHICLE 1</li><li>NUMBER OF MOTORIST KILLED</li><li>NUMBER OF MOTORIST INJURED</li></ol> | 0.0817 | 0.0814 | STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. |