In [1]:
#Importing all the libraries

import pandas as pd

import numpy as np

import sklearn

from sklearn.metrics import accuracy_score

from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

from sklearn import model_selection

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import PolynomialFeatures

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import LinearSVC

from sklearn import svm

from xgboost import XGBClassifier


import matplotlib.pyplot as plt





In [None]:
# Load the preprocessed casualty data.

df = pd.read_csv("../data/data.csv")


In [None]:
# df = df.drop(columns = 'Casualty_IMD_Decile')

In [None]:
df # Display the DataFrame to see what it looks like

In [None]:
df.describe() # Summary statistics of df

In [None]:
df.info() # Column dtypes and non-null counts of df

In [None]:
# Columns that must not be used as model inputs:
#   - Casualty_Severity:  the target column we want to predict.
#   - Accident_Index:     key of the accident dataset, not a valid feature.
#   - Casualty_Reference: key of the casualty dataset, not a valid feature.
non_feature_columns = ['Casualty_Severity', 'Accident_Index', 'Casualty_Reference']

# Feature matrix: every column of df except the ones listed above.
X = df.loc[:, ~df.columns.isin(non_feature_columns)]

# Target vector.
y = df['Casualty_Severity']

In [None]:
X # Feature matrix (all columns except the target and the two key columns)

In [None]:
y # Target column (Casualty_Severity)

In [None]:
# Split X and y into training and test subsets with an 80:20 ratio.
# random_state is fixed so the same rows land in each subset on every run,
# which keeps the results reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model 1: Logistic Regression - accuracy 82.8%

In [None]:
# grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge

# #Defining the LogisticRegression model. fit_intercept is the bias added the decision function.
# model = GridSearchCV(LogisticRegression(),grid,cv=10)

# # Fitting the training (X and y) data into the model. The model learns the data now.
# model.fit(X_train,y_train)


In [None]:
# Logistic regression with an intercept (bias) term added to the decision
# function. Accuracy observed for the values of C tried so far:
#   C = 0.01 -> 82.8%
#   C = 0.1  -> 82.8%
#   C = 1    -> 82.8%
model = LogisticRegression(
    C=1,
    fit_intercept=True,
    intercept_scaling=1,
)
model.fit(X_train, y_train)

In [None]:
# The model has been fitted on the training data; predict the labels for X_test.

y_pred = model.predict(X_test)

In [None]:
# Quick look at the predictions: their smallest and largest value, then the
# raw array itself.

# Only applicable when `model` is a fitted GridSearchCV:
# print("tuned hyperparameters :(best parameters) ",model.best_params_)
# print("accuracy :",model.best_score_)

print(f'Minimum of the predictions is {np.amin(y_pred)}')
print(f'Maximum of the predictions is {np.amax(y_pred)}')

y_pred

In [None]:
# Accuracy of the predictions on the test set (82.8% was observed).

accuracy_score(y_test, y_pred)

In [None]:
# Shape of the prediction array.
y_pred.shape

# y_test.shape

In [None]:
# Macro F1 score: unweighted mean of the per-class F1 scores.

f1_score(y_test, y_pred, average='macro')

In [None]:
# Micro F1 score: computed globally from the total true positives,
# false negatives and false positives.

f1_score(y_test, y_pred, average='micro')

In [None]:
# Weighted F1 score: per-class F1 scores averaged with each class weighted
# by its support (number of true instances).

f1_score(y_test, y_pred, average='weighted')

In [None]:
# Per-class F1 scores: average=None returns one score per label instead of
# a single averaged value.

f1_score(y_test, y_pred, average=None)

In [None]:
# Summary for the logistic regression model: accuracy as a percentage,
# followed by the per-class classification report.
lr_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f'\nLogistic Regression Accuracy: {lr_accuracy_pct}')

lr_report = classification_report(y_test, y_pred)
print('Logistic Regression Classification report:\n', lr_report)

In [None]:
confusion_matrix(y_test, y_pred) # rows: true labels, columns: predicted labels

# Model 2: Logistic Regression with 10-Fold Cross-Validation

In [None]:
# Estimate logistic-regression accuracy with 10-fold cross-validation over
# the whole dataset (X, y) instead of a single train/test split.
num_instances = len(X)  # kept for reference; not needed by the current KFold API
seed = 7

# random_state only has an effect when shuffle=True. Recent scikit-learn
# versions raise a ValueError when random_state is set while shuffle is False,
# so shuffling is enabled explicitly to get seeded, reproducible folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = LogisticRegression()
results = model_selection.cross_val_score(model, X, y, cv=kfold)

# Mean accuracy across the folds, with the standard deviation in parentheses.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

# Model 3: Random Forests - accuracy 81.89%

In [None]:
# Random forest: fit on the training split, then predict the test split.
# random_state is fixed because the forest is built with randomness; without
# a seed the reported metrics change from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

# Predictions for the held-out test set.
y_pred = model.predict(X_test)


In [None]:
# Score the random forest's predictions (y_pred) against the true test labels.
# The metric functions compare two label arrays, so they must receive the
# predictions rather than the fitted estimator object.
print('\nRandom Forest Accuracy: ' + str(accuracy_score(y_test, y_pred)*100))
print('Random Forest Classification report:\n', classification_report(y_test, y_pred))

# Model 4: Linear SVM on degree-3 polynomial features - accuracy 82.82%

In [None]:
# Linear SVM trained on polynomial features. The pipeline runs three steps:
#   1. expand the inputs into polynomial terms up to degree 3,
#   2. rescale the expanded features with MinMaxScaler,
#   3. fit a LinearSVC with hinge loss and C=1.
poly_svm_steps = [
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", MinMaxScaler()),
    ("svm_clf", LinearSVC(C=1, loss="hinge", random_state=42)),
]
model = Pipeline(poly_svm_steps)

# Train on the training split.
model.fit(X_train, y_train)

# Predict the test split.
y_pred = model.predict(X_test)

# Report accuracy (as a percentage) and the per-class classification report.
# The printed labels are kept exactly as they were.
print(f'Polynomial Regression Accuracy: {accuracy_score(y_test, y_pred)*100}')
print('Polynomial Regression Classification report:\n', classification_report(y_test, y_pred))

# Confusion matrix of true vs. predicted labels.
results = confusion_matrix(y_test, y_pred)

print("Confusion Matrix :")
print(results)

# Model 5: Linear SVM (LinearSVC on standardized features) - accuracy 82.82%

In [None]:
# 1. Build the classifier: standardise the features, then fit a LinearSVC
#    with hinge loss and C=0.1.
svmClassifier2 = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=0.1, loss='hinge', random_state=42)),
])
model = svmClassifier2

# 2. Fit the model on the training data.
model.fit(X_train, y_train)

# 3. Use the trained model to predict the labels of the test data.
y_pred = model.predict(X_test)

# 4. Report accuracy (as a percentage), the per-class report and the
#    confusion matrix.
print(f'Linear SVM Accuracy: {accuracy_score(y_test, y_pred)*100}')
print('Linear SVM Classification report:\n', classification_report(y_test, y_pred))

results = confusion_matrix(y_test, y_pred)

print("Confusion Matrix :")
print(results)

Notes on the F1 `average` options (from the scikit-learn `f1_score` documentation):

- `'micro'`: Calculate metrics globally by counting the total true positives, false negatives and false positives.

- `'macro'`: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

- `'weighted'`: Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall.



In [None]:
df # Display the DataFrame again

References

1. https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

# Time-Series Models

In [None]:
# Load the preprocessed casualty data.
# Accident_Index appears to embed the year (its first four characters are
# extracted as the year below); we want to use that for a time-series view.

df = pd.read_csv("../data/data.csv")


In [None]:
df # Inspect the freshly loaded DataFrame

In [None]:
df['Accident_Index'] # Inspect the Accident_Index column

In [None]:
# Start the Year column as the string form of Accident_Index.
df['Year'] = df['Accident_Index'].astype(str)

df['Year']

In [None]:
# Keep only the first four characters of each value; these are used as the year.
df['Year'] = df['Year'].str.slice(0, 4)

df['Year']

In [None]:
df.describe() # Summary statistics after adding the Year column

In [None]:
# Count the non-null entries of every column, grouped by Year.
df.groupby('Year').count()

# Observation: 2019 has the lowest count of all the years in the data.
# NOTE(review): each row is presumably one casualty record rather than one
# accident - confirm before describing these as accident counts.

# 2019 - 144552

# 2018 - 151955

# 2017 - 162341

In [None]:
# Yearly totals read off the per-year counts listed in the previous cell.
# They are stored under dedicated names so that the model target `y` defined
# earlier in the notebook is not overwritten; otherwise re-running the
# modelling cells after this one would fail.
years = [2017, 2018, 2019]
accident_counts = [162341, 151955, 144552]

# Line plot of the yearly totals.
fig, ax = plt.subplots()
ax.plot(years, accident_counts)

# One tick per year, so the axis does not show fractional years such as 2017.5.
ax.set_xticks(years)

# Axis labels and title.
ax.set_xlabel('Year')
ax.set_ylabel('Number of Accidents That Happened')
ax.set_title('Accidents in a Year')

# Render the figure.
plt.show()