In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
! pip install --upgrade scikit-learn==0.20.3

In [None]:
# Load the cleaned accident records from the working directory and preview the first rows.
new_df = pd.read_csv('cleaned.csv')
new_df.head()

In [None]:
## Environmental Analysis
# Pie chart of how the records are distributed across the severity levels.
plt.figure(figsize = (12,10))
# Non-null ID count per severity level (selecting 'ID' before counting avoids
# counting every other column as well).
severity_pie = new_df.groupby('Severity')['ID'].count()
# Derive the labels from the data instead of hard-coding four levels, so the
# chart is still labelled correctly if a level is absent from the data.
mylabels = [str(level) for level in severity_pie.index]
# `explode` needs exactly one entry per slice; only the first slice is pulled out.
explode = [1 if position == 0 else 0 for position in range(len(severity_pie))]
plt.title("Severity of Cases",fontsize=20)
plt.pie(severity_pie, labels = mylabels,autopct='%1.1f%%',textprops={'fontsize': 15},radius=1.2,explode=explode)
plt.legend(mylabels, loc="best")
plt.show()

In [None]:
# Accident count (non-null IDs) per city, ordered from most to fewest accidents.
city_acc = (
    new_df.groupby('City')['ID']
    .count()
    .reset_index()
    .sort_values(by='ID', ascending=False)
)

In [None]:
# Display the per-city accident counts (sorted in descending order by the previous cell).
city_acc

In [None]:
## Which cities have the highest number of accidents (top 25).
plt.figure(figsize = (20,10))
plt.title("Accidents in Cities",fontsize=20)
# city_acc is sorted by count, so the first 25 rows are the 25 busiest cities.
top_cities = city_acc.iloc[:25]
plt.barh(top_cities['City'], top_cities['ID'], color = 'yellow')
# The bars carry a single series, so axis labels replace the legend; the
# previous legend reused the severity labels of the pie chart, which do not
# describe anything in this figure.
plt.xlabel("Number of Accidents")
plt.ylabel("City")
plt.show()

In [None]:
# Accident count (non-null IDs) per state, ordered from most to fewest accidents.
state_acc = (
    new_df.groupby('State')['ID']
    .count()
    .reset_index()
    .sort_values(by='ID', ascending=False)
)

In [None]:
## Which states have the highest number of accidents (top 25).
plt.figure(figsize = (12,7))
plt.title("Accidents in States",fontsize=20)
# state_acc is sorted by count, so the first 25 rows are the 25 busiest states.
top_states = state_acc.iloc[:25]
plt.bar(top_states['State'], top_states['ID'], color = 'yellow')
# Axis labels replace the legend: the previous legend reused the severity
# labels of the pie chart, which do not describe anything in this figure.
plt.xlabel("State")
plt.ylabel("Number of Accidents")
plt.show()

In [None]:
# Inspect the dtype of the values stored in Start_Time. (type() of the column
# always reports pandas Series, whatever the column holds.)
new_df['Start_Time'].dtype

In [None]:
# Parse Start_Time into pandas datetimes so the .dt accessor can be used on it below.
new_df['Start_Time'] = pd.to_datetime(new_df['Start_Time'])

In [None]:
# Display the converted Start_Time column.
new_df['Start_Time']

In [None]:
# Extract the hour of day from each Start_Time.
new_df['hour'] = new_df['Start_Time'].dt.hour

In [None]:
# Display the derived hour column.
new_df['hour']

In [None]:
# Count the non-null IDs for each hour of day; reset_index turns the result into a two-column frame.
accident_timing = new_df.groupby('hour')['ID'].count().reset_index()

In [None]:
# Accidents per hour of day: bars with an overlaid line and a count label above each bar.
plt.figure(figsize = (12,10))
plt.bar(accident_timing['hour'], accident_timing['ID'], label = 'Count', width = 0.5,color = 'yellow')
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x=accident_timing['hour'], y=accident_timing['ID'], color = 'black', marker = '.', ms = 11)
plt.ylabel('Casualties')
plt.xlabel('Timing')
plt.xticks(np.arange(24))
plt.title('Summary on Accidents W.R.T Timing.')
# Anchor every label at its bar's actual hour instead of the row position, so
# the labels stay aligned with the bars even if some hour has no records.
for hour, count in zip(accident_timing['hour'], accident_timing['ID']):
    plt.text(hour, count*1.01, count, ha = 'center', va = 'center', fontstyle = 'normal', color = 'red', fontsize = 'x-large')
plt.show()

In [None]:
# Temperature histogram (with KDE) split by severity.
# sns.displot is a figure-level function: it creates its own figure, sized by
# height/aspect. A plt.figure() call in front of it only leaves an empty figure
# behind, so none is made here.
sns.displot(new_df, x="Temperature(F)", hue="Severity", palette="Set1", height=5, aspect=2,bins=20,kde=True)
plt.title('\n Reported Temperature with Different Severity \n', size=20)
plt.ylabel('\n Count \n', fontsize=15)
plt.xlabel('\n Temperature(F) \n', fontsize=15)
plt.show()

In [None]:
# Extract the calendar month (1-12) from each Start_Time.
new_df['month'] = pd.to_datetime(new_df['Start_Time']).dt.month

In [None]:
# Count the non-null IDs for each month and display the resulting table.
month_pattern = new_df.groupby('month')['ID'].count().reset_index()
month_pattern

In [None]:
# Second name bound to the same table (no copy is made); the plotting cell reads it under this name.
month_pattern_df = month_pattern

In [None]:
# Accidents per calendar month: bars with an overlaid line and a count label above each bar.
plt.figure(figsize = (15,5))
plt.bar(month_pattern_df['month'], month_pattern_df['ID'], color = 'yellow', width = 0.35)
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x=month_pattern_df['month'], y=month_pattern_df['ID'], color= 'blue', label = 'Sum', marker = '.', ms = 9)
# Tick 0 is a placeholder so that ticks 1-12 line up with the month numbers.
xticks = np.arange(13)
x_labels = ['0','January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
plt.xticks(ticks = xticks, labels = x_labels)
plt.title('Relationship Between Month and Accident Rates.')
# Anchor every label at its bar's month number rather than at row position + 1,
# so the labels stay aligned with the bars even if some month has no records.
for month, count in zip(month_pattern_df['month'], month_pattern_df['ID']):
    plt.text(month, count*1.02, count, fontstyle = 'normal',color = 'black',ha = 'center' ,verticalalignment='center', fontsize = 'large')
plt.show()

In [None]:
# Number of records per distinct visibility value, as a two-column frame.
# rename_axis + reset_index(name=...) names the columns explicitly, so the
# result is the same on every pandas version (pandas 2.0 changed the default
# names produced by value_counts().reset_index(), which broke the old rename).
visibility = new_df["Visibility(mi)"].value_counts().rename_axis("Visibility").reset_index(name="Cases")
# Keep only visibilities below 30 mi; filtering whole rows keeps x and y the same length.
visibility_plot = visibility[visibility.Visibility < 30]
plt.figure(figsize=(12,10))
plt.title('\n Cases Reported for Visibility \n', size=20)
plt.ylabel('\n Visibility(mi) \n', fontsize=15)
plt.xlabel('\n Number of Accidents\n', fontsize=15)
plt.xticks(fontsize=13)
plt.yticks(fontsize=12)
plt.xlim(1000,60000)
a = sns.scatterplot(x=visibility_plot.Cases,y=visibility_plot.Visibility,color='red')
plt.show()

In [None]:
# Number of records per distinct precipitation value, as a two-column frame.
# rename_axis + reset_index(name=...) names the columns explicitly, so the
# result is the same on every pandas version (pandas 2.0 changed the default
# names produced by value_counts().reset_index(), which broke the old rename).
precipitation = new_df["Precipitation(in)"].value_counts().rename_axis("Precipitation").reset_index(name="Cases")
# Keep only precipitation below 5 in; filtering whole rows keeps x and y the same length.
precipitation_plot = precipitation[precipitation.Precipitation < 5]
plt.figure(figsize=(10,8))
plt.title('\n Cases Reported for Precipitation(in) \n', size=20)
# The stray second ')' of the original axis label is removed.
plt.ylabel('\n Precipitation(in) \n', fontsize=15)
plt.xlabel('\n Number of Accidents\n', fontsize=15)
plt.xticks(fontsize=13)
plt.yticks(fontsize=12)
plt.xlim(0,500)
a = sns.scatterplot(x=precipitation_plot.Cases,y=precipitation_plot.Precipitation, color = 'red')
plt.show()

In [None]:
# Number of records per (year, month) period of Start_Time.
start_period = new_df.Start_Time.dt.to_period("M")
year_month = (
    new_df["Start_Time"]
    .groupby(start_period)
    .agg('count')
    .reset_index(name='Num_Accidents')
)

In [None]:
# Monthly accident counts, drawn as one line per calendar year.
period_month = year_month.Start_Time.dt.month
period_year = year_month.Start_Time.dt.year
palette = sns.color_palette("tab10", 6)  # six colours, matching the 2016-2021 range named in the title

plt.figure(figsize = (10,8))
plt.xlabel("Month",fontsize=15)
plt.ylabel("Number of Accidents",fontsize=15)
plt.title('Number of Accidents of 2016-2021 by Months',size=20)
sns.lineplot(
    data=year_month,
    x=period_month,
    y=year_month.Num_Accidents,
    hue=period_year,
    style=period_year,
    palette=palette,
)

In [None]:
# Temperature by month, drawn as one line per calendar year.
start_month = new_df.Start_Time.dt.month
start_year = new_df.Start_Time.dt.year

plt.figure(figsize = (10,8))
plt.xlabel("Month",fontsize=15)
plt.ylabel("Temperature level",fontsize=15)
plt.title('Temperature Variation of 2016-2021 by Months',size=20)
sns.lineplot(
    data=new_df,
    x=start_month,
    y=new_df["Temperature(F)"],
    hue=start_year,
    style=start_year,
    color = 'red',
)

In [None]:
# Scatter of accident start coordinates, coloured by state (legend suppressed).
plt.figure(figsize=(12,10))
sns.scatterplot(x="Start_Lng", y="Start_Lat", hue="State", data=new_df, s=10, legend=False)
# Axis label spelling corrected ("Longtiude" -> "Longitude").
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

In [None]:
## Location Analysis

import folium
from folium.plugins import HeatMap
# Plot a 0.1% random sample of the records; the fixed seed makes the heat map
# reproducible across kernel restarts.
sample_df= new_df.sample(int(0.001*len(new_df)), random_state=42)
lat_lng_pairs= list(zip(sample_df.Start_Lat, sample_df.Start_Lng))
# Named accident_map (not `map`) so the built-in map() is not shadowed for the
# rest of the notebook.
accident_map= folium.Map([38.552397, -96],zoom_start=6, min_zoom = 5, max_zoom = 13)
HeatMap(lat_lng_pairs).add_to(accident_map)
accident_map

1. Provide more help and rest stations for drivers all year round, because most car accidents happened at ordinary temperatures (50-80F).
2. Provide benefits for people who stay indoors when visibility is low, because low visibility results in more car accidents than any other environmental factor.
3. Give incentives to people who stayed off the road during the COVID-19 period, because the work-from-home mode and yearly environmental factors do not contribute to more car accidents.

In [None]:
# Columns whose value counts are charted, one bar chart each, by the following cell.
list1 =[ 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight']

In [None]:
# One bar chart per column in list1, showing how often each value occurs.
for column_name in list1:
    value_freq = new_df[column_name].value_counts()
    plt.bar(value_freq.index, value_freq, color = 'yellow', width = 0.35)
    plt.title(column_name)
    plt.show()

In [None]:
## Strip plot of the temperature readings for each severity level
fig, ax = plt.subplots(figsize = (10,5))
ax.set_title('\n Relationship between Temperature and Severity \n', size=20)
ax.set_ylabel('\n Temperature(F) \n', fontsize=15)
ax.set_xlabel('\n Severity \n', fontsize=15)
# Labels are set before plotting, in the same order as before, and the plot is
# drawn on the axes created above.
sns.stripplot(x=new_df['Severity'], y=new_df['Temperature(F)'], data=new_df, jitter=0.2, ax=ax)

In [None]:
## Predictive Analysis

# Our starting and ending latitudes show a positive correlation. This is merely a test to see how our data functions.

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so use whichever one this
# installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

new_df.plot(x = 'Start_Lat', y='End_Lat',kind='scatter')
plt.show()

In [None]:
# Summarise the columns, non-null counts and dtypes of new_df.
new_df.info()

In [None]:
# Preview the first rows of new_df.
new_df.head()

In [None]:
# Remove columns that are not used as model features.
irrelevant_columns = ['Description', 'Street', 'County', 'ID', 'Timezone', 'Airport_Code', 'Zipcode']
new_df = new_df.drop(columns=irrelevant_columns)

In [None]:
# Parse End_Time into pandas datetimes so hour and month can be extracted from it.
new_df['End_Time'] = pd.to_datetime(new_df['End_Time'])

In [None]:
# Derive the hour of day and the calendar month of End_Time as two new columns.
new_df = new_df.assign(
    end_hour=new_df['End_Time'].dt.hour,
    end_month=new_df['End_Time'].dt.month,
)

In [None]:
# The raw timestamps are no longer needed once hour/month have been derived from them.
new_df = new_df.drop(columns=['Start_Time', 'End_Time'])

In [None]:
# Inspect the dtypes again; this cell only reports them. The label encoding of
# the non-numeric columns is done by the cells that follow.
new_df.info()

In [None]:
# Label-encode several categorical columns at once: every column is fitted and
# transformed independently, so each one gets its own integer codes.
cols = ['Side', 'City', 'State', 'Country', 'Wind_Direction', 'Weather_Condition']
new_df[cols] = new_df[cols].apply(lambda column: LabelEncoder().fit_transform(column))

# Preview the encoded frame
new_df.head()

In [None]:
# Label-encode the sunrise/twilight columns, each column independently.
cols = ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight','Astronomical_Twilight']
new_df[cols] = new_df[cols].apply(lambda column: LabelEncoder().fit_transform(column))

# Preview the encoded frame
new_df.head()

In [None]:
# Label-encode the road-feature columns, each column independently.
cols = [
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
    'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
]
new_df[cols] = new_df[cols].apply(lambda column: LabelEncoder().fit_transform(column))

# Preview the encoded frame
new_df.head()

In [None]:
# Separate the target (Severity) from the independent features.
y = new_df['Severity']
X = new_df.drop(columns=['Severity'])

In [None]:
# Split the dataset into train and test sets: 30% of the rows are held out for
# testing, and the fixed random_state keeps the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print the accuracy and the classification report of a fitted classifier.

    Parameters
    ----------
    clf : fitted estimator
        Any object exposing ``predict``.
    X_train, y_train : features and labels of the training split.
    X_test, y_test : features and labels of the test split.
    train : bool, default True
        When truthy the model is evaluated on the training split, otherwise on
        the test split.

    Returns
    -------
    None
        The results are written to standard output.
    """
    # Pick the split once so both cases share a single reporting path (the two
    # branches used to be copies of each other, and a falsy value other than
    # False/0 silently printed nothing).
    if train:
        split_name, features, target = "Train", X_train, y_train
    else:
        split_name, features, target = "Test", X_test, y_test

    pred = clf.predict(features)
    clf_report = pd.DataFrame(classification_report(target, pred, output_dict=True))
    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(target, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")


In [None]:
# Random forest baseline: 100 trees of depth <= 5, seeded for reproducibility.
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42).fit(X_train, y_train)

# Report on the training split first, then on the held-out test split.
for on_train_split in (True, False):
    print_score(rf_clf, X_train, y_train, X_test, y_test, train=on_train_split)

In [None]:
# Keep only the first 8000 training rows and the first 5000 test rows for the
# models that follow.
X_train, y_train = X_train.iloc[:8000], y_train.iloc[:8000]
X_test, y_test = X_test.iloc[:5000], y_test.iloc[:5000]


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 4, fitted on the current training split.
neig = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)

# Report on the training split first, then on the test split.
for on_train_split in (True, False):
    print_score(neig, X_train, y_train, X_test, y_test, train=on_train_split)

In [None]:
# Load the data for the feature-selection experiment into its own frame.
# The Google Drive copy is used when it is available (as before); otherwise
# fall back to the local 'cleaned.csv' that the notebook reads at the top, so
# the cell also runs outside of Colab.
# NOTE(review): presumably both paths point at the same file — confirm.
from pathlib import Path
drive_csv = Path('/content/drive/MyDrive/Road_Transportation/cleaned.csv')
var_df = pd.read_csv(drive_csv if drive_csv.exists() else 'cleaned.csv')

In [None]:
# Summarise the columns, non-null counts and dtypes of var_df.
var_df.info()

In [None]:
# Calendar month of Start_Time.
var_df['month'] = pd.to_datetime(var_df['Start_Time']).dt.month

In [None]:
# Hour of day of Start_Time.
var_df['hour'] = pd.to_datetime(var_df['Start_Time']).dt.hour

In [None]:
# Hour of day of End_Time.
var_df['end_hour'] = pd.to_datetime(var_df['End_Time']).dt.hour

In [None]:
# Calendar month of End_Time.
var_df['end_month'] = pd.to_datetime(var_df['End_Time']).dt.month

In [None]:
# Drop the identifier and the raw timestamps (hour/month were derived from them above).
var_df = var_df.drop(columns=['ID', 'Start_Time', 'End_Time'])

In [None]:
# Re-check columns and dtypes after the derived columns were added and the raw ones dropped.
var_df.info()

In [None]:
# Label-encode the road-feature and sunrise/twilight columns, each column independently.
cols = [
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout',
    'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
    'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
]
var_df[cols] = var_df[cols].apply(lambda column: LabelEncoder().fit_transform(column))


In [None]:
# Label-encode the remaining text columns, each column independently.
cols = [
    'Description', 'Street', 'County', 'Side', 'City', 'State', 'Zipcode', 'Country',
    'Timezone', 'Airport_Code', 'Wind_Direction', 'Weather_Condition',
]
var_df[cols] = var_df[cols].apply(lambda column: LabelEncoder().fit_transform(column))


In [None]:
# Re-check the dtypes after label encoding.
var_df.info()

In [None]:
# Target and feature matrix for the feature-selection run (taken from var_df).
y = var_df['Severity']
X = var_df.drop(columns=['Severity'])

In [None]:
import mlxtend
import joblib
from sklearn.tree import DecisionTreeClassifier as dt

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Forward sequential feature selection using a depth-5 decision tree as estimator.
# The estimator class is imported here under its own name: the `dt` alias is
# rebound to a fitted model instance further down the notebook, so relying on
# the alias makes this cell fail when it is re-run after that cell.
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed for reproducibility, matching the decision tree trained later.
# NOTE(review): cv=0 turns cross-validation off in mlxtend, so the 'accuracy'
# scores are presumably measured on the data the estimator was fitted on —
# confirm that this is intended.
sfal = SFS(DecisionTreeClassifier(max_depth=5, random_state=42), k_features='best', forward=True, verbose = 2, floating=False, scoring='accuracy', cv=0)
sfal = sfal.fit(X,y)

In [None]:
# Display the selector's `subsets_` attribute.
sfal.subsets_

In [None]:
# Display the selector's metric dictionary.
sfal.get_metric_dict()

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
# Plot the selector's metrics from its metric dictionary.
# NOTE(review): the selector above is built with cv=0, so the 'std_err' bands
# presumably carry no information — confirm.
fig1 = plot_sfs(sfal.get_metric_dict(confidence_interval = 0.95), kind = 'std_err')

# Title spelling corrected ("Seqential" -> "Sequential").
plt.title('Sequential Forward Selection')
plt.grid()
plt.show()

In [None]:
# Turn the metric dictionary into a table (transposed after loading) and show
# only the selected feature indices and the average score.
df = pd.DataFrame.from_dict(sfal.get_metric_dict()).T
df[['feature_idx', 'avg_score']]

We have not executed the code blocks below because the feature selection process above ran for many hours, drained the RAM, and forced us to shut down the system.

In [None]:
# Names of the best features chosen by the fitted sequential feature selector.
sfal.k_feature_names_

In [None]:
# Hard-coded feature columns used to train the decision tree below.
# NOTE(review): presumably copied from the selector's output — confirm.
cols = ['Start_Lng', 'End_Lng', 'Distance(mi)', 'Side', 'City', 'Temperature(F)', 'Astronomical_Twilight']

In [None]:
# Model inputs (the selected columns) and target, both taken from new_df.
test_var = new_df[cols]
tar_var = new_df['Severity']

In [None]:
# train_test_split returns its pieces in the order
# (X_train, X_test, y_train, y_test). The previous unpacking swapped the two
# middle names, which bound the test features to y_train and the training
# labels to X_test.
X_train, X_test, y_train, y_test = train_test_split(test_var, tar_var, test_size=0.3, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth-5 decision tree, seeded for reproducibility, fitted on the current training split.
# NOTE: this rebinds `dt`, which an earlier cell imported as an alias of the
# DecisionTreeClassifier class; from here on `dt` is the fitted model instance.
dt = DecisionTreeClassifier(max_depth =5, random_state = 42)
dt.fit(X_train, y_train)

In [None]:
# Report the decision tree on the training split first, then on the test split.
for on_train_split in (True, False):
    print_score(dt, X_train, y_train, X_test, y_test, train=on_train_split)