In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import folium
from scipy import stats
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Data processing before random forest:

In [2]:
# load crime data and select two crime types:
crimes =  pd.read_csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", usecols=["Category", "Date", "Time", "PdDistrict"]) ## specify any columns you need
crimes = crimes[crimes["Category"].isin(['VEHICLE THEFT', 'FRAUD'])] # filter out the dataframe, you can plug any list of crimes
crimes["datetime"] = crimes.apply(lambda x: pd.to_datetime(x.Date + " " + x.Time).round("H").tz_localize("ETC/GMT-7"), axis = 1) 

In [3]:
# weather data
weather = pd.read_csv("weather_data.csv", parse_dates=["date"],
                date_parser=lambda x: pd.to_datetime(x).tz_convert(None).tz_localize("Etc/GMT+3").tz_convert("Etc/GMT-7"))

In [4]:
data = crimes

In [5]:
data['Month']=pd.DatetimeIndex(data['Date']).month
# Finding the hours the crime happend 
data['Hours'] = pd.DatetimeIndex(data['Time']).hour

data['DayOfWeek'] = pd.DatetimeIndex(data['Date']).dayofweek

le = preprocessing.LabelEncoder()
le.fit(data.PdDistrict)
data['PdDistrict_'] = le.transform(data.PdDistrict)
#PdDistrict = pd.get_dummies(data.PdDistrict, prefix='PdDistrict')
#data = pd.concat([data, PdDistrict], axis=1)
data.head()

Unnamed: 0,Category,Date,Time,PdDistrict,datetime,Month,Hours,DayOfWeek,PdDistrict_
0,VEHICLE THEFT,02/15/2004,02:00,SOUTHERN,2004-02-15 02:00:00+07:00,2,2,6,7
4,VEHICLE THEFT,07/29/2005,21:00,INGLESIDE,2005-07-29 21:00:00+07:00,7,21,4,2
5,FRAUD,10/08/2013,21:11,PARK,2013-10-08 21:00:00+07:00,10,21,1,5
53,FRAUD,04/15/2010,15:13,SOUTHERN,2010-04-15 15:00:00+07:00,4,15,3,7
54,VEHICLE THEFT,04/09/2004,23:00,NORTHERN,2004-04-09 23:00:00+07:00,4,23,4,4


In [7]:
data_burg = data[data.Category == 'VEHICLE THEFT']
data_fraud = data[data.Category == 'FRAUD']

data_b = data_burg.drop(['Category', 'Date', 'Time', 'PdDistrict'], axis = 1)
data_f = data_fraud.drop(['Category', 'Date', 'Time', 'PdDistrict'], axis = 1)
size = 40000
data_b = data_b.sample(n = size, random_state = 1)
data_f = data_f.sample(n = size, random_state = 1)

data_f['y'] = np.ones(size)
data_b['y'] = np.zeros(size)
df  = pd.concat([data_f, data_b])
df = df.sample(frac=1).reset_index(drop=True)
#df = df.reset_index()
df.head()

Unnamed: 0,datetime,Month,Hours,DayOfWeek,PdDistrict_,y
0,2004-07-16 20:00:00+07:00,7,19,4,2,0.0
1,2011-03-11 20:00:00+07:00,3,20,4,0,0.0
2,2011-11-25 19:00:00+07:00,11,19,4,4,1.0
3,2009-10-15 14:00:00+07:00,10,14,3,6,1.0
4,2017-10-21 21:00:00+07:00,10,21,5,1,0.0


## Random forest data processing:

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV



In [11]:
y = df['y']
X = df.drop(['datetime', 'y'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)



## no random search or CV

In [13]:
model = RandomForestClassifier(bootstrap = True, 
                               n_estimators= 1600, 
                               min_samples_split = 2, 
                               max_features = 'sqrt' , 
                               max_depth = 10, min_samples_leaf= 4,)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       n_estimators=1600)

In [14]:
predictions = model.predict(X_test)
accuracy_score(y_test, predictions)

0.6843181818181818

### Random search 


In [10]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

model = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = model, 
                               param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)


In [11]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 75.1min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [12]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [13]:
predictions = rf_random.best_estimator_.predict(X_test)
accuracy_score(y_test, predictions)

0.6872727272727273

### CVgrid

In [14]:
model = RandomForestClassifier(random_state=42, criterion = 'gini', bootstrap=True, max_features ='sqrt')

param_grid = { 
    'n_estimators': [300, 400, 600],
    'min_samples_leaf': [2,5,7],
    'min_samples_split': [2,4,6],
    'max_depth': [5,7,10,15]
    
}
CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 3, n_jobs = -1, verbose = 2)
CV_rfc.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  7.0min finished


GridSearchCV(cv=3,
             estimator=RandomForestClassifier(max_features='sqrt',
                                              random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [5, 7, 10, 15],
                         'min_samples_leaf': [2, 5, 7],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [300, 400, 600]},
             verbose=2)

In [15]:
CV_rfc.best_params_

{'max_depth': 10,
 'min_samples_leaf': 7,
 'min_samples_split': 2,
 'n_estimators': 300}

In [16]:
predictions = CV_rfc.best_estimator_.predict(X_test)
accuracy_score(y_test, predictions)

0.6867045454545454

## Joining Weather



In [33]:
le = preprocessing.LabelEncoder()
le.fit(weather.weather)
weather['weather_'] = le.transform(weather.weather)
weather = weather.rename(columns ={'date': 'datetime'}, inplace = False)
weather.head()

Unnamed: 0,datetime,temperature,humidity,weather,wind_speed,wind_direction,pressure,weather_
0,2012-10-01 23:00:00+07:00,16.33,88.0,light rain,2.0,150.0,1009.0,10
1,2012-10-02 00:00:00+07:00,16.324993,87.0,sky is clear,2.0,147.0,1009.0,20
2,2012-10-02 01:00:00+07:00,16.310618,86.0,sky is clear,2.0,141.0,1009.0,20
3,2012-10-02 02:00:00+07:00,16.296243,85.0,sky is clear,2.0,135.0,1009.0,20
4,2012-10-02 03:00:00+07:00,16.281869,84.0,sky is clear,2.0,129.0,1009.0,20


In [34]:
df_all = pd.merge(df,weather, how='inner', on=['datetime'])

In [40]:
df_all=df_all.dropna()

X = df_all.drop(['y','datetime','weather','pressure','temperature','humidity'], axis = 1 )
y = df_all['y']

In [41]:
X.shape

(24595, 7)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [43]:
model = RandomForestClassifier(bootstrap = True, 
                               n_estimators= 1600, 
                               min_samples_split = 2, 
                               max_features = 'sqrt' , 
                               max_depth = 10, min_samples_leaf= 4,)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       n_estimators=1600)

In [44]:
predictions = model.predict(X_test)
accuracy_score(y_test, predictions)

0.682764568190218

In [None]:
rf = RandomForestClassifier(random_state=42, criterion = 'gini')
param_grid = { 
    'n_estimators': [300, 400, 600],
    'min_samples_leaf': [2,5,7],
    'min_samples_split': [2,4,6],
    'max_depth': [5,7,10,15]
    
}
cv = 5
CV_rfc = GridSearchCV(estimator=rf, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_

In [None]:
predictions = CV_rfc.best_estimator_.predict(X_test)
accuracy_score(y_test, predictions)