In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [5]:
# Load the combined incident export and clean it in a single chain.
incident_df = (
    pd.read_csv(
        "../incident_data/yearly_incident_data/all_incidents.csv",
        encoding='utf8',
        low_memory=False,
    )
    .dropna(axis='columns', how='all')  # discard columns that contain only nulls
    .dropna()                           # then discard every row that still has a null
)
incident_df.head()

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,neighborhood,incidentDate,incidentTime
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,Windom,2010/01/01,00:30
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,Windom,2010/01/01,01:05
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,Bryant,2010/01/01,04:18
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,Loring Park,2010/01/01,03:00
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,Folwell,2010/01/01,02:23


In [7]:
# Derive calendar features from the incident date.
# The 'YYYY/MM/DD' strings are parsed once; the parsed column replaces the raw one and
# month, weekday (pandas dayofweek) and day-of-month columns are appended, in that order.
parsed_dates = pd.to_datetime(incident_df['incidentDate'], format='%Y/%m/%d')

incident_df = incident_df.assign(
    incidentDate=parsed_dates,
    month=parsed_dates.dt.month,
    weekday=parsed_dates.dt.dayofweek,
    day=parsed_dates.dt.day,
)

incident_df.head()

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,neighborhood,incidentDate,incidentTime,month,weekday,day
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,Windom,2010-01-01,00:30,1,4,1
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,Windom,2010-01-01,01:05,1,4,1
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,Bryant,2010-01-01,04:18,1,4,1
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,Loring Park,2010-01-01,03:00,1,4,1
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,Folwell,2010-01-01,02:23,1,4,1


In [8]:
# One-hot encode the 'neighborhood' column: one 0/1 indicator column per neighborhood.
# The guard keeps the cell safe to re-run; once 'neighborhood' has been replaced by its
# indicator columns there is nothing left to encode (previously a re-run raised KeyError).
if 'neighborhood' in incident_df.columns:
    # Pin the dtype: pandas >= 2.0 produces bool indicators by default, whereas the
    # rest of this notebook (and its recorded output) works with 0/1 integers.
    one_hot = pd.get_dummies(incident_df['neighborhood'], dtype='uint8')
    # Replace the categorical column with its encoded columns (joined on the row index).
    incident_df = incident_df.drop('neighborhood', axis=1).join(one_hot)
incident_df

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,incidentDate,incidentTime,...,Ventura Village,Victory,Waite Park,Webber - Camden,Wenonah,West Calhoun,Whittier,Willard - Hay,Windom,Windom Park
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,2010-01-01,00:30,...,0,0,0,0,0,0,0,0,1,0
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,2010-01-01,01:05,...,0,0,0,0,0,0,0,0,1,0
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,2010-01-01,04:18,...,0,0,0,0,0,0,0,0,0,0
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,2010-01-01,03:00,...,0,0,0,0,0,0,0,0,0,0
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,2010-01-01,02:23,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235545,00008X 8Th St S,MP2020708915,1,SHOPLF,Shoplifting,7.0,44.975633,-93.272376,2020-12-24,14:45,...,0,0,0,0,0,0,0,0,0,0
235546,00008X 8Th St S,MP2020708916,1,SHOPLF,Shoplifting,7.0,44.975633,-93.272376,2020-12-24,15:45,...,0,0,0,0,0,0,0,0,0,0
235547,0006Xx Washington Ave Se,MP2020708917,2,BIKETF,Bike Theft,7.0,44.973658,-93.229494,2020-12-22,11:00,...,0,0,0,0,0,0,0,0,0,0
235548,0025Xx 36Th Ave N,MP2020321240,4,THFTSW,Theft By Swindle,7.0,45.020468,-93.312715,2020-12-27,17:00,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Columns that are not model inputs: identifiers and free text, the raw date/time
# fields, and the offense / ucrCode columns (ucrCode is used as the target below).
non_feature_columns = [
    "description",
    "caseNumber",
    "offense",
    "ucrCode",
    "publicAddress",
    "incidentDate",
    "incidentTime",
]
selected_features = incident_df.drop(columns=non_feature_columns)

selected_features.head(25)

Unnamed: 0,precinct,lat,lon,month,weekday,day,Armatage,Audubon Park,Bancroft,Beltrami,...,Ventura Village,Victory,Waite Park,Webber - Camden,Wenonah,West Calhoun,Whittier,Willard - Hay,Windom,Windom Park
0,5,44.900291,-93.288239,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,5,44.890636,-93.280041,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,44.931448,-93.268841,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,44.970506,-93.277714,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,45.017746,-93.306988,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,3,44.938647,-93.274069,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,3,44.950635,-93.262651,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,5,44.94951,-93.314613,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,44.97642,-93.27268,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,44.948351,-93.269666,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Feature matrix and target vector: the model predicts each incident's ucrCode.
X, y = selected_features, incident_df["ucrCode"]

In [11]:
# Seeded 65/35 split, stratified on the target so both splits keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.65,
    test_size=0.35,
    stratify=y,
    random_state=42,
)
X_train.head()

Unnamed: 0,precinct,lat,lon,month,weekday,day,Armatage,Audubon Park,Bancroft,Beltrami,...,Ventura Village,Victory,Waite Park,Webber - Camden,Wenonah,West Calhoun,Whittier,Willard - Hay,Windom,Windom Park
95270,5,44.916848,-93.293413,6,2,25,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98269,3,44.911683,-93.230967,8,6,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53431,3,44.922509,-93.20764,7,4,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
179553,2,44.966816,-93.224299,6,5,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
169549,3,44.947458,-93.253773,2,0,26,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Scale the features to [0, 1].
# The scaler learns its min/max from the training split only and is then applied
# unchanged to the test split, so no test information leaks into the fit.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Baseline random forest with 500 trees.
# random_state makes the bootstrap samples and feature sub-sampling repeatable, so the
# scores below are reproducible on a fresh kernel; n_jobs=-1 builds the trees on all
# available cores (it does not affect the fitted model, only the wall-clock time).
rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
rf = rf.fit(X_train_scaled, y_train)
# Mean accuracy on the held-out test split.
rf.score(X_test_scaled, y_test)

0.5354722721615267

In [14]:
# Predict a class label for every row of the scaled test split and print the
# resulting array (long arrays are abbreviated with "..." when printed).
predictions = rf.predict(X_test_scaled)
print(predictions)

[6. 7. 7. ... 7. 7. 7.]


In [15]:
# Compare accuracy on the data the forest was fitted on with accuracy on unseen data;
# a large gap between the two indicates overfitting.
train_accuracy = rf.score(X_train_scaled, y_train)
test_accuracy = rf.score(X_test_scaled, y_test)

print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

Training Data Score: 0.9922690531959545
Testing Data Score: 0.5354722721615267


In [16]:
# One importance value per input column of the fitted forest; the next cell pairs
# these values with the column names.
importances = rf.feature_importances_
importances

array([1.49376345e-02, 2.11407477e-01, 2.16769091e-01, 1.64704659e-01,
       1.06899450e-01, 2.45683739e-01, 1.24117976e-04, 2.43022456e-04,
       2.65440019e-04, 1.53716112e-04, 1.77992573e-04, 3.02584548e-04,
       2.64625551e-04, 1.29459973e-04, 3.72766706e-04, 2.27419334e-04,
       3.44436078e-04, 8.03947442e-04, 2.66755388e-04, 1.30392177e-04,
       4.37531842e-04, 2.58891254e-04, 3.31684642e-04, 1.95298919e-04,
       3.76738588e-04, 6.41744138e-03, 2.87795231e-04, 4.15006960e-04,
       7.41309118e-04, 3.94862027e-04, 3.99782773e-04, 3.43030709e-04,
       1.76925532e-04, 1.23251601e-03, 1.36521597e-04, 1.38874615e-04,
       3.86152950e-04, 1.04693220e-03, 4.90049636e-04, 2.58498225e-04,
       2.57808200e-04, 6.89145686e-05, 1.07374705e-03, 2.51051817e-04,
       1.50693479e-04, 1.17859106e-04, 3.60872081e-04, 3.95233637e-04,
       1.55096956e-04, 1.94975135e-04, 1.72974090e-03, 3.85397108e-04,
       2.47591142e-04, 7.74285266e-04, 4.45714950e-04, 2.23469230e-04,
      

In [17]:
# Pair every importance with its column name and list the pairs from the most to the
# least important feature.
feature_names = selected_features.columns

ranked_importances = sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)
ranked_importances

[(0.2456837389875522, 'day'),
 (0.21676909133413447, 'lon'),
 (0.2114074770826983, 'lat'),
 (0.16470465877182414, 'month'),
 (0.10689945015801589, 'weekday'),
 (0.014937634497474357, 'precinct'),
 (0.006417441376089715, 'Downtown West'),
 (0.0017297409030308448, 'Longfellow'),
 (0.0013699372440870677, 'Willard - Hay'),
 (0.0012325160114588563, 'Folwell'),
 (0.0011056932901280195, 'Northeast Park'),
 (0.0010737470519210307, 'Jordan'),
 (0.0010469322048756008, 'Hawthorne'),
 (0.0008176551613084473, 'Ventura Village'),
 (0.0008039474424305558, 'Central'),
 (0.0007742852655707239, 'Lowry Hill East'),
 (0.0007542085472182702, 'Near - North'),
 (0.0007514810956550237, 'Mckinley'),
 (0.0007413091184617163, 'East Phillips'),
 (0.000600035639852339, 'Powderhorn Park'),
 (0.0005594371008109567, 'Whittier'),
 (0.0005086434913316546, 'Nicollet Island - East Bank'),
 (0.0005002587318515844, 'Standish'),
 (0.0004900496356300798, 'Hiawatha'),
 (0.0004891616394185005, 'Webber - Camden'),
 (0.000472659

In [18]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# 3 x 3 x 2 = 18 candidate settings.
# NOTE(review): the recorded log shows every max_depth=1 fit scoring 0.549 regardless of
# n_estimators - consider replacing that value with intermediate depths.
param_grid = {'n_estimators': [750, 1000, 1250],
              'max_depth': [1, 10, 50],
              'criterion': ['gini', 'entropy']}

# Search over a fresh, unfitted forest instead of the baseline `rf` object:
#  - random_state makes every candidate fit (and therefore the ranking) reproducible;
#  - n_jobs=-1 lets each forest build its trees on all cores. The search itself stays
#    sequential, so only one level of parallelism is active at a time.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid,
    verbose=3,
)

In [None]:
# Train the model with GridSearch.
# Runs the cross-validated search on the scaled training split. The recorded log reports
# 5 folds for each of 18 candidates (90 fits), so expect this cell to run for a long time.
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] criterion=gini, max_depth=1, n_estimators=750 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_depth=1, n_estimators=750, score=0.549, total=  20.8s
[CV] criterion=gini, max_depth=1, n_estimators=750 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   20.7s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=750, score=0.549, total=  21.2s
[CV] criterion=gini, max_depth=1, n_estimators=750 ...................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   41.9s remaining:    0.0s


[CV]  criterion=gini, max_depth=1, n_estimators=750, score=0.549, total=  21.5s
[CV] criterion=gini, max_depth=1, n_estimators=750 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=750, score=0.549, total=  20.8s
[CV] criterion=gini, max_depth=1, n_estimators=750 ...................
[CV]  criterion=gini, max_depth=1, n_estimators=750, score=0.549, total=  20.9s
[CV] criterion=gini, max_depth=1, n_estimators=1000 ..................
[CV]  criterion=gini, max_depth=1, n_estimators=1000, score=0.549, total=  27.7s
[CV] criterion=gini, max_depth=1, n_estimators=1000 ..................
[CV]  criterion=gini, max_depth=1, n_estimators=1000, score=0.549, total=  27.5s
[CV] criterion=gini, max_depth=1, n_estimators=1000 ..................
[CV]  criterion=gini, max_depth=1, n_estimators=1000, score=0.549, total=  27.5s
[CV] criterion=gini, max_depth=1, n_estimators=1000 ..................
[CV]  criterion=gini, max_depth=1, n_estimators=1000, score=0.549, total=  27.5s
[CV] crite

In [None]:
# Report the winning hyper-parameter combination followed by its mean
# cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
import joblib

# Persist the fitted baseline forest (`rf`). To ship the tuned model instead, dump
# `grid.best_estimator_` once the grid search has finished.
filename = 'RandomForest.sav'
joblib.dump(rf, filename)

# The forest was trained on MinMax-scaled features, so its split thresholds only make
# sense for inputs transformed by the same fitted scaler. Persist the scaler next to the
# model so that anyone loading the model can prepare raw rows correctly.
scaler_filename = 'RandomForest_scaler.sav'
joblib.dump(X_scaler, scaler_filename)