In [77]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 55)
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import datetime

In [78]:
# My user can't connect to the database anymore, probably because I'm in Brazil
# So I'll try directly with the csvs instead 

## Data wrangling

In [79]:
# Get the database
date_df = pd.read_csv('Merged_Date_Final.csv')
location_df = pd.read_csv('Merged_Location_Final.csv')
crime_df = pd.read_csv('Merged_Crime_Final.csv')
fact_df = pd.read_csv('Fact_Final.csv')
df = pd.concat([date_df, location_df, crime_df, fact_df], axis=1)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [80]:
# Select a subset of the database
df = df.loc[df['City'] == "Vancouver"]
df = df.loc[df['Year'] == 2018]

In [81]:
# Drop columns that are not usefull for the classifier
dropped_columns = \
['Hour',
'Minute',
'Year',
'Longitude',
'Latitude',
'City',
'Crime-rate',
'Crime-report-time',
'Crime-start-time',
'Crime-end-time',
'Crime-key',
'Date-key',
'Location-key',
'Holiday-name',
'IS_CRIME',
'Crime-category',
'Total-neighborhood-population',
'Location-name',
'Years-0-to-4 ',
'Years-5-to-9',
'Years-10-to-14',
'Years-15-to-19',
'Years-15',
'Years-16',
'Years-17',
'Years-18 ',
'Years-19',
'Years-20-to-24',
'Years-25-to-29',
'Years-30-to-34',
'Years-35-to-39',
'Years-40-to-44',
'Years-45-to-49',
'Years-50-to-54',
'Years-55-to-59',
'Years-60-to-64',
'Years-65-to-69',
'Years-70-to-74',
'Years-75-to-79',
'Years-80-to-84',
'Day',
'Years-85-plus']
df.drop(columns=dropped_columns, inplace=True)

In [82]:
# Transform True False into integers
df['Is-Nighttime'] = df['Is-Nighttime'].astype(int)
df['Holiday'] = df['Holiday'].astype(int)
df['Weekend'] = df['Weekend'].astype(int)
df['Is-Traffic'] = df['Is-Traffic'].astype(int)
df['Is-Fatal'] = df['Is-Fatal'].astype(int)

In [83]:
# Check for class imbalance
Counter(df['Is-Nighttime'])

Counter({1: 21862, 0: 16223})

In [84]:
df

Unnamed: 0,Day-of-week,Holiday,Month,Weekend,Neighborhood,Crime-type,Crime-type-severity-index,Is-Traffic,Is-Fatal,Is-Nighttime
3,Friday,0,3,0,West End,Break and Enter Commercial,3,0,0,1
8,Saturday,0,6,1,West End,Break and Enter Commercial,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...
193373,Monday,0,8,0,Mount Pleasant,Vehicle Collision or Pedestrian Struck (with I...,5,1,0,0
193376,Saturday,0,8,1,Mount Pleasant,Vehicle Collision or Pedestrian Struck (with I...,5,1,0,0


In [85]:
#Get the labels
y = df['Is-Nighttime']
#Drop the column for the next step
df.drop(columns=['Is-Nighttime'], inplace=True)

In [86]:
# One-hot-encoding
df = pd.get_dummies(df, prefix=["Day-of-week", "Neighborhood", "Crime-type"])

In [87]:
# Get the features
X = df.values

In [88]:
df

Unnamed: 0,Holiday,Month,Weekend,Crime-type-severity-index,Is-Traffic,Is-Fatal,Day-of-week_Friday,Day-of-week_Monday,Day-of-week_Saturday,Day-of-week_Sunday,Day-of-week_Thursday,Day-of-week_Tuesday,Day-of-week_Wednesday,Neighborhood_Arbutus Ridge,Neighborhood_Central Business District,Neighborhood_Dunbar-Southlands,Neighborhood_Fairview,Neighborhood_Grandview-Woodland,Neighborhood_Hastings-Sunrise,Neighborhood_Kensington-Cedar Cottage,Neighborhood_Kerrisdale,Neighborhood_Killarney,Neighborhood_Kitsilano,Neighborhood_Marpole,Neighborhood_Mount Pleasant,Neighborhood_Musqueam,Neighborhood_Oakridge,Neighborhood_Renfrew-Collingwood,Neighborhood_Riley Park,Neighborhood_Shaughnessy,Neighborhood_South Cambie,Neighborhood_Stanley Park,Neighborhood_Strathcona,Neighborhood_Sunset,Neighborhood_Victoria-Fraserview,Neighborhood_West End,Neighborhood_West Point Grey,Crime-type_Break and Enter Commercial,Crime-type_Break and Enter Residential/Other,Crime-type_Homicide,Crime-type_Mischief,Crime-type_Offence Against a Person,Crime-type_Other Theft,Crime-type_Theft from Vehicle,Crime-type_Theft of Bicycle,Crime-type_Theft of Vehicle,Crime-type_Vehicle Collision or Pedestrian Struck (with Fatality),Crime-type_Vehicle Collision or Pedestrian Struck (with Injury)
3,0,3,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
8,0,6,1,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193373,0,8,0,5,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
193376,0,8,1,5,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [89]:
# Normalization
df=(df-df.mean())/df.std()

In [90]:
df

Unnamed: 0,Holiday,Month,Weekend,Crime-type-severity-index,Is-Traffic,Is-Fatal,Day-of-week_Friday,Day-of-week_Monday,Day-of-week_Saturday,Day-of-week_Sunday,Day-of-week_Thursday,Day-of-week_Tuesday,Day-of-week_Wednesday,Neighborhood_Arbutus Ridge,Neighborhood_Central Business District,Neighborhood_Dunbar-Southlands,Neighborhood_Fairview,Neighborhood_Grandview-Woodland,Neighborhood_Hastings-Sunrise,Neighborhood_Kensington-Cedar Cottage,Neighborhood_Kerrisdale,Neighborhood_Killarney,Neighborhood_Kitsilano,Neighborhood_Marpole,Neighborhood_Mount Pleasant,Neighborhood_Musqueam,Neighborhood_Oakridge,Neighborhood_Renfrew-Collingwood,Neighborhood_Riley Park,Neighborhood_Shaughnessy,Neighborhood_South Cambie,Neighborhood_Stanley Park,Neighborhood_Strathcona,Neighborhood_Sunset,Neighborhood_Victoria-Fraserview,Neighborhood_West End,Neighborhood_West Point Grey,Crime-type_Break and Enter Commercial,Crime-type_Break and Enter Residential/Other,Crime-type_Homicide,Crime-type_Mischief,Crime-type_Offence Against a Person,Crime-type_Other Theft,Crime-type_Theft from Vehicle,Crime-type_Theft of Bicycle,Crime-type_Theft of Vehicle,Crime-type_Vehicle Collision or Pedestrian Struck (with Fatality),Crime-type_Vehicle Collision or Pedestrian Struck (with Injury)
3,-0.179123,-1.025476,-0.640452,0.306377,-0.201709,-0.027124,2.354738,-0.415755,-0.418964,-0.406064,-0.398089,-0.396273,-0.397469,-0.090736,-0.631413,-0.11226,-0.222399,-0.220049,-0.185806,-0.194697,-0.105215,-0.122712,-0.222854,-0.149167,-0.259160,-0.021132,-0.115221,-0.220245,-0.152626,-0.104831,-0.087898,-0.063717,-0.234613,-0.161404,-0.126515,3.400714,-0.099316,4.219827,-0.25916,-0.019849,-0.421343,-0.300223,-0.386369,-0.806384,-0.245562,-0.176293,-0.018478,-0.200791
8,-0.179123,-0.159653,1.561355,0.306377,-0.201709,-0.027124,-0.424664,-0.415755,2.386777,-0.406064,-0.398089,-0.396273,-0.397469,-0.090736,-0.631413,-0.11226,-0.222399,-0.220049,-0.185806,-0.194697,-0.105215,-0.122712,-0.222854,-0.149167,-0.259160,-0.021132,-0.115221,-0.220245,-0.152626,-0.104831,-0.087898,-0.063717,-0.234613,-0.161404,-0.126515,3.400714,-0.099316,4.219827,-0.25916,-0.019849,-0.421343,-0.300223,-0.386369,-0.806384,-0.245562,-0.176293,-0.018478,-0.200791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193373,-0.179123,0.417562,-0.640452,1.375988,4.957512,-0.027124,-0.424664,2.405201,-0.418964,-0.406064,-0.398089,-0.396273,-0.397469,-0.090736,-0.631413,-0.11226,-0.222399,-0.220049,-0.185806,-0.194697,-0.105215,-0.122712,-0.222854,-0.149167,3.858526,-0.021132,-0.115221,-0.220245,-0.152626,-0.104831,-0.087898,-0.063717,-0.234613,-0.161404,-0.126515,-0.294048,-0.099316,-0.236970,-0.25916,-0.019849,-0.421343,-0.300223,-0.386369,-0.806384,-0.245562,-0.176293,-0.018478,4.980180
193376,-0.179123,0.417562,1.561355,1.375988,4.957512,-0.027124,-0.424664,-0.415755,2.386777,-0.406064,-0.398089,-0.396273,-0.397469,-0.090736,-0.631413,-0.11226,-0.222399,-0.220049,-0.185806,-0.194697,-0.105215,-0.122712,-0.222854,-0.149167,3.858526,-0.021132,-0.115221,-0.220245,-0.152626,-0.104831,-0.087898,-0.063717,-0.234613,-0.161404,-0.126515,-0.294048,-0.099316,-0.236970,-0.25916,-0.019849,-0.421343,-0.300223,-0.386369,-0.806384,-0.245562,-0.176293,-0.018478,4.980180


In [91]:
# Split the data intro training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y)

In [92]:
#Let's see the no. of records per class in training and test set
print("Training set {} ".format(Counter(y_train)))
print("Test set {} ".format(Counter(y_test)))

Training set Counter({1: 15303, 0: 11356}) 
Test set Counter({1: 6559, 0: 4867}) 


## Decision Tree

In [93]:
before = datetime.datetime.now()

In [94]:
#Fit the training data onto the decision tress classifier
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [95]:
after = datetime.datetime.now()
time_taken = after - before

In [96]:
y_pred = dt.predict(X_test)
recall = recall_score(y_pred, y_test) * 100
precision = precision_score(y_pred, y_test) * 100
print("Recall of Decision Tree {:.2f} %".format(recall))
print("precision of Decision Tree {:.2f} %".format(precision))
print("Time taken:", time_taken.total_seconds())

Recall of Decision Tree 64.88 %
precision of Decision Tree 63.52 %
Time taken: 0.248331


In [97]:
predictions = dt.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.52      0.54      0.53      4867
           1       0.65      0.64      0.64      6559

   micro avg       0.59      0.59      0.59     11426
   macro avg       0.59      0.59      0.59     11426
weighted avg       0.59      0.59      0.59     11426



In [98]:
print(confusion_matrix(y_test,predictions))

[[2612 2255]
 [2393 4166]]


## Random Forest

In [99]:
rf = RandomForestClassifier(n_estimators=600)

In [100]:
before = datetime.datetime.now()

In [101]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [102]:
after = datetime.datetime.now()
time_taken = after - before

In [103]:
y_pred = rf.predict(X_test)
recall = recall_score(y_pred, y_test) * 100
precision = precision_score(y_pred, y_test) * 100
print("Recall of Decision Tree {:.2f} %".format(recall))
print("precision of Decision Tree {:.2f} %".format(precision))
print("Time taken:", time_taken.total_seconds())

Recall of Decision Tree 64.51 %
precision of Decision Tree 69.26 %
Time taken: 23.884336


In [104]:
predictions = rf.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.54      0.49      0.51      4867
           1       0.65      0.69      0.67      6559

   micro avg       0.60      0.60      0.60     11426
   macro avg       0.59      0.59      0.59     11426
weighted avg       0.60      0.60      0.60     11426



In [105]:
print(confusion_matrix(y_test,predictions))

[[2368 2499]
 [2016 4543]]


## Conclusion

The recall in the decision tree is sligtly better compared to the ensemble. On the other hand, precision is better on the ensemble. Overall, neither did very well, more feature engineering is needed.