# Notebook for the Capstone Project

In [1]:
#Import Libraries
import pandas as pd 
import numpy as np
from sklearn.utils import resample

# Printed after the imports in this cell have run.
greeting = 'Hello Capstone Project Course'
print(greeting)

Hello Capstone Project Course


# Introduction/Business Problem 

- Business Problem: 
    - Accidents continue to block traffic and cause backups in the greater Seattle area.  In order to help alleviate the congestion during peak hours of travel we need to know what causes them in the first place.  


- Hypothesis/Thoughts: 
    - The date/time of day affects the number of accidents that will occur
        - Independent Variable: Date/Time
    - The severity of an accident is determined by location and time
        - Independent Variable: Location & Time
    
    
    New Hypothesis after extensive data exploration: Road conditions are the main influence on the severity of an accident when all else is equal, i.e. not considering drugs, alcohol, etc. 


- Data Background: 
    - SDOT (Seattle Department of Transportation) Data 
    - Removed 5,639 of the 194,673 rows in the initial data set that were noted NEI (Not enough Information or Insufficient Location Information)
    - Data can be found via GitHub URL with the metadata descriptions.  
    - Date Time cannot be used to predict severity due to the inconsistency of the data entered
    - Severity Code & Severity Code Definitions:
        - 0: Little to no Probability (Clear Conditions)
        - 1: Very Low Probability — Chance of Property Damage
        - 2: Low Probability — Chance of Injury
        - 3: Mild Probability — Chance of Serious Injury
        - 4: High Probability — Chance of Fatality

In [None]:
# Read the SDOT collisions CSV into a DataFrame.
COLLISIONS_CSV = 'SDOT_Collisions.csv'
data = pd.read_csv(COLLISIONS_CSV)

In [None]:
# Summary statistics for the columns describe() covers by default.
summary_stats = data.describe()
summary_stats

In [None]:
# Preview the first five rows (the head() default) to inspect the columns.
preview_rows = data.head(n=5)
preview_rows

#### Check Unique Values

In [None]:
# Number of rows per severity code (the target variable used later).
severity_counts = data.SEVERITYCODE.value_counts()
severity_counts

In [None]:
# Count each WEATHER value. dropna=False keeps missing entries in the tally;
# by default value_counts() hides them, yet they are later turned into the
# code -1 when this column is converted with `.cat.codes`.
data['WEATHER'].value_counts(dropna=False)

In [None]:
# Count each ROADCOND value. dropna=False keeps missing entries in the tally;
# by default value_counts() hides them, yet they are later turned into the
# code -1 when this column is converted with `.cat.codes`.
data['ROADCOND'].value_counts(dropna=False)

In [None]:
# Count each LIGHTCOND value. dropna=False keeps missing entries in the tally;
# by default value_counts() hides them, yet they are later turned into the
# code -1 when this column is converted with `.cat.codes`.
data['LIGHTCOND'].value_counts(dropna=False)

In [None]:
# Count each SPEEDING value. dropna=False also reports how many rows have no
# value at all, which value_counts() omits by default.
data['SPEEDING'].value_counts(dropna=False)

#### Evening the sample size out 

In [None]:
# Balance the two severity classes by down-sampling (without replacement) to
# the size of the smaller class. The per-class size is derived from the data
# rather than hard-coded, so the classes stay even if the input file changes.
df_major = data[data.SEVERITYCODE == 1]
df_minor = data[data.SEVERITYCODE == 2]
n_per_class = min(len(df_major), len(df_minor))

df_smpl = resample(df_major, replace=False, n_samples=n_per_class, random_state=123)
df_minor_smpl = resample(df_minor, replace=False, n_samples=n_per_class, random_state=123)
df = pd.concat([df_smpl, df_minor_smpl])

# Both severity codes should now show the same count.
df['SEVERITYCODE'].value_counts()

#### Converting fields to category then labeling  

In [None]:
# Convert the three condition columns to pandas' category dtype so that
# integer codes can be read from them afterwards.
categorical_cols = ["WEATHER", "ROADCOND", "LIGHTCOND"]
df = df.astype({name: 'category' for name in categorical_cols})
df.head()

In [None]:
# Add an integer-coded companion column ("<name>_c") for each categorical column.
for source_col in ("WEATHER", "ROADCOND", "LIGHTCOND"):
    df[source_col + "_c"] = df[source_col].cat.codes

# Feature keeps the readable labels next to their codes; X holds only the codes.
code_cols = ['WEATHER_c', 'ROADCOND_c', 'LIGHTCOND_c']
Feature = df[['WEATHER', 'ROADCOND', 'LIGHTCOND'] + code_cols]
X = np.asarray(Feature[code_cols])

In [None]:
# Preview the first five feature rows: labels alongside their integer codes.
feature_preview = Feature.head(n=5)
feature_preview

In [None]:
# Target vector: the SEVERITYCODE value of every row in df.
y = df.SEVERITYCODE.values
y[:]

### Building The Models

In [None]:
from sklearn import preprocessing

# Standardize the encoded feature columns with StandardScaler.
# NOTE(review): the scaler is fit on all of X before the train/test split is
# made, so test rows contribute to the scaling statistics — consider fitting
# on the training rows only.
feature_scaler = preprocessing.StandardScaler()
feature_scaler.fit(X)
X = feature_scaler.transform(X)
X[:]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every score computed from it, reproducible after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
print('Test set shape: ', X_test.shape, y_test.shape)
print('Training set shape: ', X_train.shape, y_train.shape)

#### K Nearest Neighbor(KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier on the training split, then predict
# the held-out test split.
k = 20
knn_model = KNeighborsClassifier(n_neighbors=k)
neigh = knn_model.fit(X_train, y_train)
neigh_pred = neigh.predict(X_test)
neigh_pred[:]

In [None]:
from sklearn.metrics import f1_score, log_loss

# `jaccard_similarity_score` was removed in scikit-learn 0.23, so importing it
# unconditionally fails on current releases. For binary and multiclass targets
# it was equivalent to accuracy_score, so fall back to that under the old name.
# Later cells that call `jaccard_similarity_score` keep working and report the
# same values as before.
try:
    from sklearn.metrics import jaccard_similarity_score
except ImportError:
    from sklearn.metrics import accuracy_score as jaccard_similarity_score

print('KNN F1-Score: ', f1_score(y_test, neigh_pred, average='macro'))
print('KNN Jaccard Score: ', jaccard_similarity_score(y_test, neigh_pred))

#### Support Vector Machine (SVM) 

In [None]:
from sklearn import svm

# Fit a linear SVM on the training split. y_train is deliberately left
# untouched: casting it to float in place here would change the labels seen
# by every model trained in later cells, and LinearSVC does not need it.
s_svm = svm.LinearSVC(random_state=7)
s_svm.fit(X_train, y_train)

# NOTE: these are predictions for the *training* rows. They are kept because
# later cells pair `y_pred` with `y_train`, but they do not measure held-out
# performance.
y_pred = s_svm.predict(X_train)

In [None]:
# Score the SVM on the held-out test split with macro averaging, the same
# protocol used for the other models. Scoring it on the training rows would
# make its numbers incomparable with theirs.
svm_test_pred = s_svm.predict(X_test)
print('SVM F1-Score is: ', f1_score(y_test, svm_test_pred, average='macro'))
print('SVM Jaccard Score is: ', jaccard_similarity_score(y_test, svm_test_pred))

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree limited to depth 6. random_state is fixed
# because scikit-learn permutes the features at every split, so ties between
# equally good splits would otherwise be broken differently from run to run.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=123)
tree.fit(X_train, y_train)
ptree = tree.predict(X_test)
ptree[0:]

In [None]:
# Test-split scores for the decision tree.
tree_f1 = f1_score(y_test, ptree, average='macro')
tree_jaccard = jaccard_similarity_score(y_test, ptree)
print('Decision Tree F1-Score: ', tree_f1)
print('Decision Tree Jaccard Score: ', tree_jaccard)

#### Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (C=0.01, liblinear solver) fit on the training split.
# LRPe holds the predicted labels and LRP the class probabilities for the
# test split.
lr_model = LogisticRegression(C=0.01, solver='liblinear')
LR = lr_model.fit(X_train, y_train)
LRP = LR.predict_proba(X_test)
LRPe = LR.predict(X_test)
LRPe[:]

In [None]:
# Test-split scores for logistic regression; log loss uses the predicted
# probabilities rather than the hard labels.
lr_f1 = f1_score(y_test, LRPe, average='macro')
lr_jaccard = jaccard_similarity_score(y_test, LRPe)
lr_log_loss = log_loss(y_test, LRP)
print('Logistic Regression F1-Score is: ', lr_f1)
print('Logistic Regression Jaccard Score is: ', lr_jaccard)
print('Logistic Regression LogLoss is: ', lr_log_loss)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Reds):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array; rows are drawn on the 'True label' axis and
                 columns on the 'Predicted label' axis.
    classes   -- tick labels, used for both axes.
    normalize -- when True, each row is divided by its row sum before
                 printing and plotting.
    title     -- title of the plot.
    cmap      -- matplotlib colormap passed to imshow.

    Draws on the current pyplot figure and returns None.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Two decimals for normalized values, plain integers for raw counts.
    cell_format = 'd'
    if normalize:
        cell_format = '.2f'

    # Cells above half of the maximum get white text so the label stays readable.
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            if value > cutoff:
                text_color = "white"
            else:
                text_color = "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
np.set_printoptions(precision=2)
severity_labels = [1, 2]
cnf_matrix = confusion_matrix(y_test, LRPe, labels=severity_labels)

# Non-normalized confusion matrix
class_names = ['SEVERITY=1', 'SEVERITY=2']
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=False, title='Confusion matrix')

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
lr_report = classification_report(y_test, LRPe)
print(lr_report)

In [None]:
# F1-Scores
# Every model is scored on the held-out test split with macro averaging so the
# numbers are comparable. The SVM is predicted on X_test here instead of
# reusing its training-set predictions.
svm_test_pred = s_svm.predict(X_test)
print('KNN F1-Score: ', f1_score(y_test, neigh_pred, average='macro'))
print('Decision Tree F1-Score: ', f1_score(y_test, ptree, average='macro'))
print('Logistic Regression F1-Score is: ', f1_score(y_test, LRPe, average='macro'))
print('SVM F1-Score is: ', f1_score(y_test, svm_test_pred, average='macro'))

In [None]:
# Jaccard Scores
# Every model is scored on the held-out test split. The SVM is predicted on
# X_test here instead of reusing its training-set predictions.
svm_test_pred = s_svm.predict(X_test)
print('KNN Jaccard Score: ', jaccard_similarity_score(y_test, neigh_pred))
print('Decision Tree Jaccard Score: ', jaccard_similarity_score(y_test, ptree))
print('Logistic Regression Jaccard Score is: ', jaccard_similarity_score(y_test, LRPe))
print('SVM Jaccard Score is: ', jaccard_similarity_score(y_test, svm_test_pred), '\n')
# Log Loss Score
print('Logistic Regression LogLoss is: ', log_loss(y_test, LRP))

#### Based on the testing done, the decision tree is the best model for this data.  

### Findings - After testing the historical data provided by the Seattle Department of Transportation, we were not able to determine whether the time of day was a factor in the severity of an accident, because of inconsistencies in how the data was entered.  However, we did find that weather, light and road conditions have an impact on the severity of an accident; either class 1 - Property Damage or class 2 - Injury.  