In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [4]:
incident_df = pd.read_csv("incident_data/yearly_incident_data/all_incidents.csv",encoding='utf8',low_memory=False)
# Drop the null columns where all values are null
incident_df = incident_df.dropna(axis='columns', how='all')
# Drop the null rows
incident_df = incident_df.dropna()
incident_df.head()

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,neighborhood,incidentDate,incidentTime
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,Windom,2010/01/01,00:30
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,Windom,2010/01/01,01:05
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,Bryant,2010/01/01,04:18
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,Loring Park,2010/01/01,03:00
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,Folwell,2010/01/01,02:23


In [6]:
# Set features. This will also be used as your x values.
#selected_features = df.drop("koi_disposition", axis=1)

incident_df['incidentDate'] = pd.to_datetime(incident_df['incidentDate'], format='%Y/%m/%d')
incident_df['month']= incident_df['incidentDate'].dt.month 
incident_df['weekday'] =  incident_df['incidentDate'].dt.dayofweek
incident_df['day'] =  incident_df['incidentDate'].dt.day

Unnamed: 0,Armatage,Audubon Park,Bancroft,Beltrami,Bottineau,Bryant,Bryn - Mawr,Camden Industrial,Carag,Cedar - Isles - Dean,...,Ventura Village,Victory,Waite Park,Webber - Camden,Wenonah,West Calhoun,Whittier,Willard - Hay,Windom,Windom Park
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235545,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
235546,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
235547,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
235548,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Get one hot encoding of columns 'vehicleType'
one_hot = pd.get_dummies(incident_df['neighborhood'])
# Drop column as it is now encoded
incident_df = incident_df.drop('neighborhood',axis = 1)
# Join the encoded df
incident_df = incident_df.join(one_hot)
incident_df

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,incidentDate,incidentTime,...,Ventura Village,Victory,Waite Park,Webber - Camden,Wenonah,West Calhoun,Whittier,Willard - Hay,Windom,Windom Park
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,2010-01-01,00:30,...,0,0,0,0,0,0,0,0,1,0
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,2010-01-01,01:05,...,0,0,0,0,0,0,0,0,1,0
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,2010-01-01,04:18,...,0,0,0,0,0,0,0,0,0,0
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,2010-01-01,03:00,...,0,0,0,0,0,0,0,0,0,0
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,2010-01-01,02:23,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235545,00008X 8Th St S,MP2020708915,1,SHOPLF,Shoplifting,7.0,44.975633,-93.272376,2020-12-24,14:45,...,0,0,0,0,0,0,0,0,0,0
235546,00008X 8Th St S,MP2020708916,1,SHOPLF,Shoplifting,7.0,44.975633,-93.272376,2020-12-24,15:45,...,0,0,0,0,0,0,0,0,0,0
235547,0006Xx Washington Ave Se,MP2020708917,2,BIKETF,Bike Theft,7.0,44.973658,-93.229494,2020-12-22,11:00,...,0,0,0,0,0,0,0,0,0,0
235548,0025Xx 36Th Ave N,MP2020321240,4,THFTSW,Theft By Swindle,7.0,45.020468,-93.312715,2020-12-27,17:00,...,0,0,0,0,0,0,0,0,0,0


In [12]:
selected_features = incident_df.drop(["description","caseNumber","offense","ucrCode",
                                      "publicAddress","incidentDate","incidentTime"],axis=1)

selected_features.head(25)

Unnamed: 0,precinct,lat,lon,month,weekday,day,Armatage,Audubon Park,Bancroft,Beltrami,...,Ventura Village,Victory,Waite Park,Webber - Camden,Wenonah,West Calhoun,Whittier,Willard - Hay,Windom,Windom Park
0,5,44.900291,-93.288239,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,5,44.890636,-93.280041,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,44.931448,-93.268841,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,44.970506,-93.277714,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,45.017746,-93.306988,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,3,44.938647,-93.274069,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,3,44.950635,-93.262651,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,5,44.94951,-93.314613,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,44.97642,-93.27268,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,44.948351,-93.269666,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
X = selected_features
y = incident_df["ucrCode"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, train_size=0.65,test_size=0.35)
X_train.head()

Unnamed: 0,precinct,lat,lon,month,weekday,day,Armatage,Audubon Park,Bancroft,Beltrami,...,Ventura Village,Victory,Waite Park,Webber - Camden,Wenonah,West Calhoun,Whittier,Willard - Hay,Windom,Windom Park
95270,5,44.916848,-93.293413,6,2,25,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98269,3,44.911683,-93.230967,8,6,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53431,3,44.922509,-93.20764,7,4,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
179553,2,44.966816,-93.224299,6,5,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
169549,3,44.947458,-93.253773,2,0,26,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
rf = RandomForestClassifier(n_estimators=500)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

0.5353716951006399

In [18]:
predictions = rf.predict(X_test_scaled)
print(predictions)

[6. 7. 7. ... 7. 7. 7.]


In [19]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 0.992302901474431
Testing Data Score: 0.5353716951006399


In [20]:
importances = rf.feature_importances_
importances

array([1.52725733e-02, 2.11757151e-01, 2.17275018e-01, 1.64373133e-01,
       1.06226223e-01, 2.45889605e-01, 1.24899798e-04, 2.35236262e-04,
       2.49099127e-04, 1.58615609e-04, 1.70160266e-04, 2.96618828e-04,
       2.65114545e-04, 1.25964426e-04, 3.89721399e-04, 2.08242800e-04,
       3.38594263e-04, 7.95916795e-04, 2.65261021e-04, 1.30470417e-04,
       4.40015968e-04, 2.75362462e-04, 3.35364992e-04, 2.02290058e-04,
       3.90383121e-04, 6.23192479e-03, 2.85095695e-04, 3.99490451e-04,
       7.55509767e-04, 3.95475651e-04, 3.83094883e-04, 3.54200704e-04,
       1.71972201e-04, 1.14210348e-03, 1.34950395e-04, 1.36483593e-04,
       3.88074024e-04, 9.79047376e-04, 4.91665634e-04, 2.56826778e-04,
       2.61280386e-04, 6.91632286e-05, 1.10810141e-03, 2.60393009e-04,
       1.55427826e-04, 1.15019181e-04, 3.70694945e-04, 3.92845503e-04,
       1.55084686e-04, 1.91560613e-04, 1.71630088e-03, 3.69929688e-04,
       2.55952474e-04, 7.76120786e-04, 4.40798695e-04, 2.26679950e-04,
      

In [21]:
feature_names = selected_features.columns

sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.24588960469565468, 'day'),
 (0.21727501819649192, 'lon'),
 (0.21175715117726315, 'lat'),
 (0.1643731334974903, 'month'),
 (0.10622622250357104, 'weekday'),
 (0.015272573258311673, 'precinct'),
 (0.006231924787583482, 'Downtown West'),
 (0.0017163008784274207, 'Longfellow'),
 (0.0013976471173375287, 'Willard - Hay'),
 (0.001142103483881137, 'Folwell'),
 (0.0011081014093957057, 'Jordan'),
 (0.0010607288831889158, 'Northeast Park'),
 (0.0009790473755639504, 'Hawthorne'),
 (0.0008127536701947659, 'Ventura Village'),
 (0.0007959167946681531, 'Central'),
 (0.000776120786052117, 'Lowry Hill East'),
 (0.0007555097665998999, 'East Phillips'),
 (0.0007476213938384912, 'Near - North'),
 (0.0007133995415716171, 'Mckinley'),
 (0.000604354564442762, 'Powderhorn Park'),
 (0.0005765000757814036, 'Whittier'),
 (0.0005183567905695671, 'Standish'),
 (0.0004916656338499381, 'Hiawatha'),
 (0.0004894439873181629, 'Nicollet Island - East Bank'),
 (0.00048578755455935954, 'Webber - Camden'),
 (0.000458957