In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, KBinsDiscretizer, MaxAbsScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import seaborn as sns
import warnings
from evidently.dashboard import Dashboard
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.dashboard.tabs import DataDriftTab, CatTargetDriftTab, NumTargetDriftTab, ClassificationPerformanceTab

# Apply seaborn's default plot styling to all subsequent figures.
sns.set()
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and SettingWithCopy
# warnings raised by later cells - consider narrowing it.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load both CSV exports; the files are decoded as Latin-1 instead of the UTF-8 default.
acc = pd.read_csv('data/Accident_Information.csv', encoding='latin')
veh = pd.read_csv('data/Vehicle_Information.csv', encoding='latin')

# Inner-join vehicles to accidents on the shared 'Accident_Index' column, so only
# vehicles with a matching accident record are kept.
df = pd.merge(veh, acc, how='inner', on='Accident_Index')
# Combine the separate 'Date' and 'Time' text columns into one timestamp column.
df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
# Size of the merged data before resampling.
print(df.shape)

# Per-row sampling weights that counteract the class imbalance of the target:
# 'Slight' rows get weight 0.12, every other severity gets 0.88.
weights = np.where(df['Accident_Severity'] == 'Slight', .12, .88)

# Draw 30% of the row count, with replacement, using those weights.
# random_state is fixed so that Restart & Run All reproduces the same sample
# (and therefore the same model and dashboards); without it every run differed.
df = df.sample(frac=0.3, replace=True, weights=weights, random_state=42)

# Size of the resampled data.
print(df.shape)

In [None]:
# Keep only the timestamp, the predictor columns and the target.
# The explicit .copy() makes df2 an independent frame, so the column assignment
# below is not a chained assignment on a selection of `df`
# (which raises SettingWithCopyWarning).
df2 = df[['datetime', '1st_Road_Class','Day_of_Week', 'Junction_Detail','Light_Conditions', 'Number_of_Casualties',
          'Number_of_Vehicles', 'Road_Surface_Conditions', 'Road_Type', 'Special_Conditions_at_Site', 'Speed_limit',
          'Time', 'Urban_or_Rural_Area', 'Weather_Conditions', 'Age_Band_of_Driver', 'Age_of_Vehicle',
          'Hit_Object_in_Carriageway', 'Hit_Object_off_Carriageway', 'make', 'Engine_Capacity_.CC.', 'Sex_of_Driver',
          'Skidding_and_Overturning', 'Vehicle_Manoeuvre', 'Vehicle_Type', 'Accident_Severity'
         ]].copy()

# Collapse the target to two classes: 'Slight' vs 'Serious or Fatal'.
df2['Accident_Severity'] = df2['Accident_Severity'].replace(['Serious', 'Fatal'], 'Serious or Fatal')

# One-hot encode the target and keep only the 'Slight' indicator, which becomes
# the binary label 'Accident_Severity_Slight'.
df2 = pd.get_dummies(df2, columns=['Accident_Severity'])
df2 = df2.drop(columns='Accident_Severity_Serious or Fatal')

# Index by timestamp and sort so later cells can split the data chronologically.
df2 = df2.set_index('datetime').sort_index()

# Class balance of the binary target after resampling.
df2.Accident_Severity_Slight.value_counts(normalize=True)

In [None]:
def get_Speed_limit(df):
    """Return the ``Speed_limit`` column of ``df`` as a one-column DataFrame."""
    selected_columns = ['Speed_limit']
    return df[selected_columns]

def get_Time(df):
    """Parse the ``Time`` column (``HH:MM`` text) and return its time-of-day values as a Series."""
    parsed_times = pd.to_datetime(df['Time'], format='%H:%M')
    return parsed_times.dt.time

def find_time_group(time_object):
    """Map a time of day to a named part-of-day bucket.

    Buckets (lower bound inclusive, upper bound exclusive):

    * 00:00 - 05:00  'Night'
    * 05:00 - 07:00  'Early Morning'
    * 07:00 - 10:00  'Morning'
    * 10:00 - 15:00  'Midday'
    * 15:00 - 18:00  'Afternoon'
    * 18:00 - 20:00  'Evening'
    * 20:00 - 24:00  'Late Evening'

    Parameters
    ----------
    time_object : datetime.time or missing value
        Any object exposing an ``hour`` attribute, or a missing marker
        (``None``, ``NaN``, ``NaT``).

    Returns
    -------
    str or float
        The bucket label, or ``np.nan`` when ``time_object`` is missing.

    Notes
    -----
    The last bucket runs to the end of the day, so a time such as 23:59:30 is
    'Late Evening' (the earlier ``<= 23:59`` test mapped it to NaN).
    """
    # Handle missing values explicitly instead of relying on how NaT compares
    # with datetime.time, which differs between pandas versions.
    if pd.isna(time_object):
        return np.nan

    # Every boundary falls on a whole hour, so comparing the hour is sufficient.
    # This also avoids `pd.datetime`, which pandas deprecated in 1.0 and removed in 2.0.
    hour = time_object.hour
    for upper_hour, label in (
        (5, 'Night'),
        (7, 'Early Morning'),
        (10, 'Morning'),
        (15, 'Midday'),
        (18, 'Afternoon'),
        (20, 'Evening'),
        (24, 'Late Evening'),
    ):
        if hour < upper_hour:
            return label
    # Defensive fallback for objects whose hour is outside 0-23.
    return np.nan

def get_Age_of_Vehicle(df):
    """Return the ``Age_of_Vehicle`` column of ``df`` as a one-column DataFrame."""
    selected_columns = ['Age_of_Vehicle']
    return df[selected_columns]

def get_Engine_Capacity(df):
    """Return the ``Engine_Capacity_.CC.`` column of ``df`` as a one-column DataFrame."""
    selected_columns = ['Engine_Capacity_.CC.']
    return df[selected_columns]

def get_columns_to_one_hot(df):
    """Return the columns of ``df`` that are one-hot encoded without further preprocessing.

    The result is a DataFrame whose columns appear in the fixed order listed below.
    """
    one_hot_columns = [
        '1st_Road_Class',
        'Day_of_Week',
        'Junction_Detail',
        'Light_Conditions',
        'Number_of_Casualties',
        'Number_of_Vehicles',
        'Road_Surface_Conditions',
        'Road_Type',
        'Special_Conditions_at_Site',
        'Urban_or_Rural_Area',
        'Weather_Conditions',
        'Age_Band_of_Driver',
        'Hit_Object_in_Carriageway',
        'Hit_Object_off_Carriageway',
        'Sex_of_Driver',
        'Skidding_and_Overturning',
        'Vehicle_Manoeuvre',
        'Vehicle_Type',
    ]
    return df[one_hot_columns]

def get_make(df, min_count=2000):
    """Return the ``make`` column as a one-column DataFrame with rare makes grouped as 'Other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``make`` column.
    min_count : int, default 2000
        Makes occurring fewer than ``min_count`` times in ``df`` are replaced
        by the label 'Other'.

    Returns
    -------
    pandas.DataFrame
        Single column named ``make``; missing values are left untouched.

    Notes
    -----
    The counts are taken from the frame passed in, so the set of "rare" makes
    depends on that frame's size and contents.
    NOTE(review): when used inside a pipeline this means the grouping is
    recomputed on every batch passed to ``transform`` - confirm this is intended.
    """
    makes = df['make']
    # Count once (the counts were previously computed twice).
    make_counts = makes.value_counts()
    small_makers = make_counts.index[make_counts < min_count]
    # Vectorised mask; value_counts ignores missing values, so they are never
    # in `small_makers` and pass through unchanged.
    return makes.where(~makes.isin(small_makers), 'Other').to_frame()


# Speed limit is treated as a categorical feature: select the column, fill gaps
# with the most frequent value, then one-hot encode (unseen values encode as all zeros).
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# newer scikit-learn releases - confirm against the pinned version.
FullTransformerOnSpeedLimit = Pipeline(steps=[
    ("Select_Speed_Limit", FunctionTransformer(func=get_Speed_limit, validate=False)),
    ("Fill_Null", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("One_Hot_Encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')),
])


# Time of day: parse 'Time', bucket each value with find_time_group, fill gaps
# with the most frequent bucket, then one-hot encode the bucket labels.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# newer scikit-learn releases - confirm against the pinned version.
FullTransformerOnTime = Pipeline(steps=[
    ("Select_Time", FunctionTransformer(func=get_Time, validate=False)),
    ("Group_Time", FunctionTransformer(
        func=lambda times: times.apply(find_time_group).to_frame(),
        validate=False,
    )),
    ("Fill_Null", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("One_Hot_Encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Vehicle age stays numeric: select the column and fill gaps with the median.
FullTransformerOnAgeofVehicle = Pipeline(steps=[
    ("Select_Age_of_Vehicle", FunctionTransformer(func=get_Age_of_Vehicle, validate=False)),
    ("Fill_Null", SimpleImputer(missing_values=np.nan, strategy='median')),
])



# Vehicle make: group rare makes via get_make, label missing makes 'Other',
# then one-hot encode.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# newer scikit-learn releases - confirm against the pinned version.
FullTransformerOnMake = Pipeline(steps=[
    ("Select_Make", FunctionTransformer(func=get_make, validate=False)),
    ("Fill_Null", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='Other')),
    ("One_Hot_Encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')),
])



# Engine capacity: select the column, fill gaps with the most frequent value,
# cut into 7 quantile bins (ordinal codes), then one-hot encode the bin codes.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# newer scikit-learn releases - confirm against the pinned version.
FullTransformerOnEngineCapacity = Pipeline(steps=[
    ("Select_Engine_Capacity", FunctionTransformer(func=get_Engine_Capacity, validate=False)),
    ("Fill_Null", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("Car_Types_by_Engine_Capacity", KBinsDiscretizer(n_bins=7, encode='ordinal', strategy='quantile')),
    ("One_Hot_Encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Remaining categorical columns: select them and one-hot encode directly
# (no imputation step in this branch).
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# newer scikit-learn releases - confirm against the pinned version.
DataToOneHotTransformerOnColumns = Pipeline(steps=[
    ("Select_Columns", FunctionTransformer(func=get_columns_to_one_hot, validate=False)),
    ("One_Hot_Encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Concatenate the outputs of every per-feature pipeline side by side.
# The order of this list fixes the column order of the resulting feature matrix.
FeatureUnionTransformer = FeatureUnion(transformer_list=[
    ("FTAgeofVehicle", FullTransformerOnAgeofVehicle),
    ("FTEngineCapacity", FullTransformerOnEngineCapacity),
    ("FTMake", FullTransformerOnMake),
    ("FTSpeedLimit", FullTransformerOnSpeedLimit),
    ("FTTime", FullTransformerOnTime),
    ("OHEColumns", DataToOneHotTransformerOnColumns),
])


In [None]:
%%time

print('All size', df2.shape)

# Chronological split on half-open windows [start, end).
# Label slicing (df2.loc[start:end]) is inclusive on both ends, which put rows
# stamped exactly 2006-01-01 00:00:00 into both the training and the test set.
# A boolean mask on the index also does not require the index to be free of NaT.
train_start = pd.Timestamp('2005-01-01 00:00:00')
train_end = pd.Timestamp('2006-01-01 00:00:00')
test_end = pd.Timestamp('2006-03-01 00:00:00')

training = df2.loc[(df2.index >= train_start) & (df2.index < train_end)].copy()
testing = df2.loc[(df2.index >= train_end) & (df2.index < test_end)].copy()

# [start, end] bounds of the three follow-up windows scored in later cells.
coming_months = [['2006-03-01 00:00:00','2006-06-01 00:00:00'],
                 ['2006-06-01 00:00:00','2006-09-01 00:00:00'],
                 ['2006-09-01 00:00:00','2006-12-01 00:00:00']]

print('training size', training.shape)
print('testing size', testing.shape)

# Separate predictors from the binary target.
X_train = training.drop(columns=['Accident_Severity_Slight'])
y_train = training.Accident_Severity_Slight

X_test = testing.drop(columns=['Accident_Severity_Slight'])
y_test = testing.Accident_Severity_Slight

# Full model: feature engineering -> scaling -> random forest.
# The scaling step is a MaxAbsScaler; its step name "Min_Max_Transformer" is
# kept unchanged because step names are part of the pipeline's public interface.
# random_state is fixed so that refitting reproduces the same forest.
RandomForest_Full_Estimator = Pipeline([
    ("Feature_Engineering", FeatureUnionTransformer),
    ("Min_Max_Transformer", MaxAbsScaler()),
    ("Clf", RandomForestClassifier(n_estimators=100, n_jobs=3, random_state=42)),
])

RandomForest_Full_Estimator.fit(X_train, y_train)
train_hat = RandomForest_Full_Estimator.predict(X_train)
pred_hat = RandomForest_Full_Estimator.predict(X_test)

# Store predictions next to the target so the dashboards can read both columns.
training['prediction'] = train_hat
testing['prediction'] = pred_hat


In [None]:
# Tell evidently which columns of the frames hold the true label and the model output.
column_mapping = ColumnMapping()

column_mapping.target = 'Accident_Severity_Slight'
column_mapping.prediction = 'prediction'
# Dashboard with a single classification-performance tab.
# (The variable name is spelled "perfomance"; later cells reuse it under that spelling.)
classification_perfomance_dashboard = Dashboard(tabs=[ClassificationPerformanceTab()])
# Only `testing` is supplied here; the second data argument is None.
# NOTE(review): presumably the first frame is evidently's reference data set and
# the second the current one - confirm against the installed evidently version.
classification_perfomance_dashboard.calculate(testing, None, column_mapping=column_mapping)
classification_perfomance_dashboard.show()

# Next Trimester: 2006-03-01 to 2006-06-01

In [None]:
%%time

# Select the first follow-up window as a half-open interval [start, end).
# Label slicing (df2.loc[start:end]) includes both end points, so a row stamped
# exactly on the end boundary would also belong to the following window.
window_start, window_end = (pd.Timestamp(bound) for bound in coming_months[0])
next_month = df2.loc[(df2.index >= window_start) & (df2.index < window_end)].copy()

# Separate predictors from the binary target.
X_nm = next_month.drop(columns=['Accident_Severity_Slight'])
y_nm = next_month.Accident_Severity_Slight

# Score the window with the already-fitted pipeline (no refit).
nm_hat = RandomForest_Full_Estimator.predict(X_nm)

next_month['prediction'] = nm_hat

# Compare classification performance on `testing` with this window.
classification_perfomance_dashboard.calculate(testing, next_month, column_mapping=column_mapping)
classification_perfomance_dashboard.show()

In [None]:
# Dashboard with a single categorical-target drift tab.
target_drift_dashboard = Dashboard(tabs=[CatTargetDriftTab()])
# Compares `testing` with `next_month` as left by the most recently run scoring cell.
# NOTE(review): presumably `testing` acts as evidently's reference data set - confirm.
target_drift_dashboard.calculate(testing, next_month, column_mapping=column_mapping)
target_drift_dashboard.show()

# Next Trimester: 2006-06-01 to 2006-09-01

In [None]:
%%time

# Select the second follow-up window as a half-open interval [start, end).
# Label slicing (df2.loc[start:end]) includes both end points, so a row stamped
# exactly on the end boundary would also belong to the following window.
window_start, window_end = (pd.Timestamp(bound) for bound in coming_months[1])
next_month = df2.loc[(df2.index >= window_start) & (df2.index < window_end)].copy()

# Separate predictors from the binary target.
X_nm = next_month.drop(columns=['Accident_Severity_Slight'])
y_nm = next_month.Accident_Severity_Slight

# Score the window with the already-fitted pipeline (no refit).
nm_hat = RandomForest_Full_Estimator.predict(X_nm)

next_month['prediction'] = nm_hat

# Compare classification performance on `testing` with this window.
classification_perfomance_dashboard.calculate(testing, next_month, column_mapping=column_mapping)
classification_perfomance_dashboard.show()

In [None]:
# Dashboard with a single categorical-target drift tab.
target_drift_dashboard = Dashboard(tabs=[CatTargetDriftTab()])
# Compares `testing` with `next_month` as left by the most recently run scoring cell.
# NOTE(review): presumably `testing` acts as evidently's reference data set - confirm.
target_drift_dashboard.calculate(testing, next_month, column_mapping=column_mapping)
target_drift_dashboard.show()

# Next Trimester: 2006-09-01 to 2006-12-01

In [None]:
%%time

# Select the third follow-up window as a half-open interval [start, end).
# Label slicing (df2.loc[start:end]) includes both end points; the half-open
# form keeps this window consistent with non-overlapping consecutive windows.
window_start, window_end = (pd.Timestamp(bound) for bound in coming_months[2])
next_month = df2.loc[(df2.index >= window_start) & (df2.index < window_end)].copy()

# Separate predictors from the binary target.
X_nm = next_month.drop(columns=['Accident_Severity_Slight'])
y_nm = next_month.Accident_Severity_Slight

# Score the window with the already-fitted pipeline (no refit).
nm_hat = RandomForest_Full_Estimator.predict(X_nm)

next_month['prediction'] = nm_hat

# Compare classification performance on `testing` with this window.
classification_perfomance_dashboard.calculate(testing, next_month, column_mapping=column_mapping)
classification_perfomance_dashboard.show()

In [None]:
# Dashboard with a single categorical-target drift tab.
target_drift_dashboard = Dashboard(tabs=[CatTargetDriftTab()])
# Compares `testing` with `next_month` as left by the most recently run scoring cell.
# NOTE(review): presumably `testing` acts as evidently's reference data set - confirm.
target_drift_dashboard.calculate(testing, next_month, column_mapping=column_mapping)
target_drift_dashboard.show()