In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score 
from sklearn.model_selection import train_test_split
import pickle

In [44]:

df  = pd.read_csv('~/PycharmProjects/car_prediction/monroe_county_car_crash_2003-2015.csv',encoding='latin-1')

In [45]:
df.head()

Unnamed: 0,Year,Month,Day,Weekend?,Hour,Collision Type,Injury Type,Primary Factor,Reported_Location,Latitude,Longitude
0,2015,1,5,Weekday,0.0,2-Car,No injury/unknown,OTHER (DRIVER) - EXPLAIN IN NARRATIVE,1ST & FESS,39.159207,-86.525874
1,2015,1,6,Weekday,1500.0,2-Car,No injury/unknown,FOLLOWING TOO CLOSELY,2ND & COLLEGE,39.16144,-86.534848
2,2015,1,6,Weekend,2300.0,2-Car,Non-incapacitating,DISREGARD SIGNAL/REG SIGN,BASSWOOD & BLOOMFIELD,39.14978,-86.56889
3,2015,1,7,Weekend,900.0,2-Car,Non-incapacitating,FAILURE TO YIELD RIGHT OF WAY,GATES & JACOBS,39.165655,-86.575956
4,2015,1,7,Weekend,1100.0,2-Car,No injury/unknown,FAILURE TO YIELD RIGHT OF WAY,W 3RD,39.164848,-86.579625


In [7]:
df.columns

Index(['Year', 'Month', 'Day', 'Weekend?', 'Hour', 'Collision Type',
       'Injury Type', 'Primary Factor', 'Reported_Location', 'Latitude',
       'Longitude'],
      dtype='object')

In [46]:
missing_values = df.isnull().sum()


print("Columns with missing values:")
print(missing_values[missing_values > 0])

Columns with missing values:
Weekend?               68
Hour                  225
Collision Type          6
Primary Factor       1121
Reported_Location      35
Latitude               30
Longitude              30
dtype: int64


In [47]:

df['Hour'].fillna(df['Hour'].mean(), inplace=True)
df['Latitude'].fillna(df['Latitude'].mean(), inplace=True)
df['Longitude'].fillna(df['Longitude'].mean(), inplace=True)


In [48]:

df['Weekend?'].fillna(df['Weekend?'].mode()[0], inplace=True)
df['Collision Type'].fillna(df['Collision Type'].mode()[0], inplace=True)
df['Primary Factor'].fillna(df['Primary Factor'].mode()[0], inplace=True)
df['Reported_Location'].fillna(df['Reported_Location'].mode()[0], inplace=True)

In [49]:
missing_values = df.isnull().sum()

print("Columns with missing values:")
print(missing_values[missing_values > 0])

Columns with missing values:
Series([], dtype: int64)


In [50]:

X = df[['Year', 'Month', 'Day', 'Hour', 'Collision Type', 'Primary Factor']]
y = df['Injury Type']  #'Injury Type' is the target variable

In [51]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
from sklearn.preprocessing import LabelEncoder
collision_type_encoder = LabelEncoder()
primary_factor_encoder = LabelEncoder()

df['Collision_Type_encoded'] = collision_type_encoder.fit_transform(df['Collision Type'])
df['Primary_Factor_encoded'] = primary_factor_encoder.fit_transform(df['Primary Factor'])

df.drop(columns=['Collision Type', 'Primary Factor'], inplace=True)

In [16]:
df.columns

Index(['Year', 'Month', 'Day', 'Weekend?', 'Hour', 'Injury Type',
       'Reported_Location', 'Latitude', 'Longitude', 'Collision_Type_encoded',
       'Primary_Factor_encoded'],
      dtype='object')

In [17]:
df.head()

Unnamed: 0,Year,Month,Day,Weekend?,Hour,Injury Type,Reported_Location,Latitude,Longitude,Collision_Type_encoded,Primary_Factor_encoded
0,2015,1,5,Weekday,0.0,No injury/unknown,1ST & FESS,39.159207,-86.525874,1,27
1,2015,1,6,Weekday,1500.0,No injury/unknown,2ND & COLLEGE,39.16144,-86.534848,1,11
2,2015,1,6,Weekend,2300.0,Non-incapacitating,BASSWOOD & BLOOMFIELD,39.14978,-86.56889,1,5
3,2015,1,7,Weekend,900.0,Non-incapacitating,GATES & JACOBS,39.165655,-86.575956,1,10
4,2015,1,7,Weekend,1100.0,No injury/unknown,W 3RD,39.164848,-86.579625,1,10


In [57]:

injury_type_encoder = LabelEncoder()
df['Injury_Type_encoded'] = injury_type_encoder.fit_transform(df['Injury Type'])

df.drop(columns=['Injury Type', 'Reported_Location'], inplace=True)

In [59]:

with open('collision_type_encoder.pkl', 'wb') as f:
    pickle.dump(collision_type_encoder, f)

with open('primary_factor_encoder.pkl', 'wb') as f:
    pickle.dump(primary_factor_encoder, f)
    
with open('injury_type_encoder.pkl', 'wb') as f:
    pickle.dump(injury_type_encoder, f)

In [58]:
df.head()

Unnamed: 0,Year,Month,Day,Weekend?,Hour,Latitude,Longitude,Collision_Type_encoded,Primary_Factor_encoded,Injury_Type_encoded
0,2015,1,5,Weekday,0.0,39.159207,-86.525874,1,27,2
1,2015,1,6,Weekday,1500.0,39.16144,-86.534848,1,11,2
2,2015,1,6,Weekend,2300.0,39.14978,-86.56889,1,5,3
3,2015,1,7,Weekend,900.0,39.165655,-86.575956,1,10,3
4,2015,1,7,Weekend,1100.0,39.164848,-86.579625,1,10,2


In [20]:

df['Weekend_encoded'] = df['Weekend?'].map({'Weekday': 0, 'Weekend': 1})

df.drop(columns=['Weekend?'], inplace=True)

In [21]:
df.head(20)

Unnamed: 0,Year,Month,Day,Hour,Latitude,Longitude,Collision_Type_encoded,Primary_Factor_encoded,Injury_Type_encoded,Reported_Location_encoded,Weekend_encoded
0,2015,1,5,0.0,39.159207,-86.525874,1,27,2,1301,0
1,2015,1,6,1500.0,39.16144,-86.534848,1,11,2,1612,0
2,2015,1,6,2300.0,39.14978,-86.56889,1,5,3,4438,1
3,2015,1,7,900.0,39.165655,-86.575956,1,10,3,10505,1
4,2015,1,7,1100.0,39.164848,-86.579625,1,10,2,18788,1
5,2015,1,6,1800.0,39.12667,-86.53137,1,10,2,5261,0
6,2015,1,6,1200.0,39.150825,-86.584899,1,7,2,17346,0
7,2015,1,6,1400.0,39.199272,-86.637024,0,9,1,14615,0
8,2015,1,7,1400.0,39.16461,-86.57913,1,11,2,12722,1
9,2015,1,7,1600.0,39.16344,-86.55128,0,38,2,15234,1


In [22]:
df.drop(columns=['Reported_Location_encoded', 'Latitude', 'Longitude'], inplace=True)

In [23]:
X = df.drop(columns=['Injury_Type_encoded']) 
y = df['Injury_Type_encoded'] 


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [24]:

param_grid = {
    'C': [0.1, 1, 10], 
    'kernel': ['linear', 'rbf'], 
}

In [27]:

df_sampled = df.sample(frac=0.5, random_state=42) 


X_sampled = df_sampled.drop(columns=['Injury_Type_encoded'])
y_sampled = df_sampled['Injury_Type_encoded']

X_sampled_train, X_sampled_test, y_sampled_train, y_sampled_test = train_test_split(
    X_sampled, y_sampled, test_size=0.2, random_state=42)


In [30]:
!pip install xgboost
import xgboost as xgb



In [31]:
df['Injury_Type_encoded'].unique()

array([2, 3, 1, 0])

In [32]:

X = df.drop(columns=['Injury_Type_encoded']) 
y = df['Injury_Type_encoded']  


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


params = {
    'objective': 'multi:softmax',  
    'num_class': 4,                
    'eval_metric': 'merror',      
    'eta': 0.1,                   
    'max_depth': 6,               
    'min_child_weight': 1,         
    'subsample': 0.8,             
    'colsample_bytree': 0.8,       
    'gamma': 0,                   
    'seed': 42                   
}


dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


num_round = 100  
bst = xgb.train(params, dtrain, num_round)


y_pred = bst.predict(dtest)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7988692186486236


In [33]:
from sklearn.metrics import classification_report


report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.48      0.06      0.10       225
           2       0.81      0.99      0.89      8348
           3       0.69      0.16      0.27      2191

    accuracy                           0.80     10789
   macro avg       0.49      0.30      0.31     10789
weighted avg       0.77      0.80      0.74     10789


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
from sklearn.model_selection import GridSearchCV


param_grid = {
    'max_depth': [1, 2, 3],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'eta': [0.01, 0.1, 0.3]
}


xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4, seed=42)


grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=10)


grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


best_model = grid_search.best_estimator_


y_pred = best_model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Fitting 3 folds for each of 729 candidates, totalling 2187 fits
[CV 1/3; 1/729] START colsample_bytree=0.6, eta=0.01, gamma=0, max_depth=1, min_child_weight=1, subsample=0.6
[CV 1/3; 1/729] END colsample_bytree=0.6, eta=0.01, gamma=0, max_depth=1, min_child_weight=1, subsample=0.6;, score=0.790 total time=   0.2s
[CV 2/3; 1/729] START colsample_bytree=0.6, eta=0.01, gamma=0, max_depth=1, min_child_weight=1, subsample=0.6
[CV 2/3; 1/729] END colsample_bytree=0.6, eta=0.01, gamma=0, max_depth=1, min_child_weight=1, subsample=0.6;, score=0.790 total time=   0.2s
[CV 3/3; 1/729] START colsample_bytree=0.6, eta=0.01, gamma=0, max_depth=1, min_child_weight=1, subsample=0.6
[CV 3/3; 1/729] END colsample_bytree=0.6, eta=0.01, gamma=0, max_depth=1, min_child_weight=1, subsample=0.6;, score=0.790 total time=   0.2s
[CV 1/3; 2/729] START colsample_bytree=0.6, eta=0.01, gamma=0, max_depth=1, min_child_weight=1, subsample=0.8
[CV 1/3; 2/729] END colsample_bytree=0.6, eta=0.01, gamma=0, max_depth=1,

In [35]:
data = df.head(1)
data['Year'] = 2016
data['Hour'] = 12.0
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Year'] = 2016
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Hour'] = 12.0


Unnamed: 0,Year,Month,Day,Hour,Collision_Type_encoded,Primary_Factor_encoded,Injury_Type_encoded,Weekend_encoded
0,2016,1,5,12.0,1,27,2,0


In [36]:
data.drop(columns=['Injury_Type_encoded'], inplace=True)
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=['Injury_Type_encoded'], inplace=True)


Unnamed: 0,Year,Month,Day,Hour,Collision_Type_encoded,Primary_Factor_encoded,Weekend_encoded
0,2016,1,5,12.0,1,27,0


In [37]:
x = best_model.predict(data)
print(x)

[2]


In [39]:
with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [40]:
with open('model.pkl', 'rb') as f:
    pickle.load(f)

In [43]:
df.value_counts()

Year  Month  Day  Hour    Collision_Type_encoded  Primary_Factor_encoded  Injury_Type_encoded  Weekend_encoded
2003  7      3    1200.0  1                       10                      2                    0                  8
2015  9      3    1600.0  1                       11                      2                    0                  6
2014  10     4    1200.0  1                       10                      2                    0                  6
2012  1      5    1600.0  1                       11                      2                    0                  6
      5      6    1500.0  1                       11                      2                    0                  6
                                                                                                                 ..
2007  7      3    1300.0  1                       27                      2                    0                  1
                          2                       11                      3  