# Intelligent Model: Random Forest Classifier

In [1]:
import pandas as pd

### Read in query outputs to dataframes

In [2]:
# Load the four query extracts (crime, label, hospital, fire station) from CSV
# files given by relative path.
crime_df, label_df, h_df, f_df = (
    pd.read_csv(csv_name)
    for csv_name in ('crime_query.csv', 'label_query.csv', 'hosp_query.csv', 'fire_query.csv')
)

In [3]:
# Preview the first five crime records, transposed so each field is a row.
crime_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
date_part,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


### Get crime count by county and join to crime_df

In [4]:
# Count incident records per county in crime_df -> columns: county, crime_cnt.
tmp_df4 = (
    crime_df.groupby('county')['incident_id']
    .count()
    .reset_index(name='crime_cnt')
)
tmp_df4

Unnamed: 0,county,crime_cnt
0,BELL,10240
1,BEXAR,35
2,BRAZORIA,9487
3,BREWSTER,61
4,BURNET,1303
5,CALHOUN,1634
6,CAMERON,3
7,CHEROKEE,396
8,COKE,2
9,COLLIN,55442


In [5]:
# Attach the per-county incident total (crime_cnt) to every crime record.
crime_df = crime_df.join(
    tmp_df4.set_index('county'),
    on='county',
    how='left',
)

In [6]:
# Preview the first five records again now that crime_cnt has been joined on.
crime_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
date_part,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


### Get hospital and bed counts by county and join to h_df

In [7]:
# Preview the raw hospital rows (one row per hospital: county, hosp_id, beds).
h_df.head(n=5)

Unnamed: 0,county,hosp_id,beds
0,HARRIS,25577030,1082.0
1,JONES,3379501,45.0
2,COLLIN,475013,73.0
3,POTTER,679106,451.0
4,BRAZORIA,1177515,64.0


In [8]:
# Floor each hospital's bed count at 1.
# Series.clip_lower() was deprecated in pandas 0.24 and removed in 1.0;
# clip(lower=...) is the supported equivalent and also works on older versions.
h_df['beds'] = h_df['beds'].clip(lower=1)

In [9]:
# Count hospital records per county -> columns: county, hosp_cnt.
tmp_df = (
    h_df.groupby('county')['hosp_id']
    .count()
    .reset_index(name='hosp_cnt')
)
tmp_df

Unnamed: 0,county,hosp_cnt
0,ANDERSON,1
1,ANDREWS,1
2,ANGELINA,2
3,ATASCOSA,1
4,AUSTIN,1
5,BAILEY,1
6,BASTROP,1
7,BAYLOR,1
8,BEE,1
9,BELL,7


In [10]:
# Total hospital beds per county -> columns: county, bed_cnt.
tmp_df2 = (
    h_df.groupby('county')['beds']
    .sum()
    .reset_index(name='bed_cnt')
)
tmp_df2

Unnamed: 0,county,bed_cnt
0,ANDERSON,86.0
1,ANDREWS,34.0
2,ANGELINA,420.0
3,ATASCOSA,67.0
4,AUSTIN,32.0
5,BAILEY,25.0
6,BASTROP,8.0
7,BAYLOR,49.0
8,BEE,69.0
9,BELL,896.0


In [11]:
# Combine the two per-county hospital aggregates. From here on h_df holds one
# row per county (hosp_cnt, bed_cnt) instead of one row per hospital.
h_df = tmp_df.join(
    tmp_df2.set_index('county'),
    on='county',
    how='left',
)
h_df

Unnamed: 0,county,hosp_cnt,bed_cnt
0,ANDERSON,1,86.0
1,ANDREWS,1,34.0
2,ANGELINA,2,420.0
3,ATASCOSA,1,67.0
4,AUSTIN,1,32.0
5,BAILEY,1,25.0
6,BASTROP,1,8.0
7,BAYLOR,1,49.0
8,BEE,1,69.0
9,BELL,7,896.0


### Get fire station count by county from f_df

In [12]:
# Preview the raw fire-station rows (one row per station: county, fire_id).
f_df.head(n=5)

Unnamed: 0,county,fire_id
0,EL PASO,10410064
1,EL PASO,10410225
2,EL PASO,10139838
3,EL PASO,10139423
4,EL PASO,10410065


In [13]:
# Count fire-station records per county -> columns: county, fire_cnt.
tmp_df3 = (
    f_df.groupby('county')['fire_id']
    .count()
    .reset_index(name='fire_cnt')
)
tmp_df3

Unnamed: 0,county,fire_cnt
0,ANDERSON,21
1,ANDREWS,1
2,ANGELINA,18
3,ARANSAS,5
4,ARCHER,7
5,ARMSTRONG,2
6,ATASCOSA,2
7,AUSTIN,5
8,BAILEY,1
9,BANDERA,6


### Merge hospital dataframe and fire station dataframe

In [14]:
# Left-join the hospital aggregates onto the fire-station counts. Counties that
# are missing from h_df keep their fire_cnt but get NaN hosp_cnt / bed_cnt.
hf_df = tmp_df3.join(
    h_df.set_index('county'),
    on='county',
    how='left',
)
hf_df

Unnamed: 0,county,fire_cnt,hosp_cnt,bed_cnt
0,ANDERSON,21,1.0,86.0
1,ANDREWS,1,1.0,34.0
2,ANGELINA,18,2.0,420.0
3,ARANSAS,5,,
4,ARCHER,7,,
5,ARMSTRONG,2,,
6,ATASCOSA,2,1.0,67.0
7,AUSTIN,5,1.0,32.0
8,BAILEY,1,1.0,25.0
9,BANDERA,6,,


### Join to training dataframe

In [15]:
# Build the training frame: every crime record plus its county's fire-station
# and hospital aggregates.
train_df = crime_df.join(
    hf_df.set_index('county'),
    on='county',
    how='left',
)

In [16]:
# Preview the first three training rows, transposed so each field is a row.
train_df.head(n=3).transpose()

Unnamed: 0,0,1,2
incident_id,67693343,67693344,67693345
date_part,2013,2013,2013
age_num,46,57,51
victim_sex,F,M,F
crime_against,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses
location_id,20,18,18
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage
county,POTTER,POTTER,POTTER


In [17]:
# Summarise column dtypes and non-null counts of the joined training frame
# (used to spot columns with missing values before feature engineering).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463095 entries, 0 to 463094
Data columns (total 18 columns):
incident_id               463095 non-null int64
date_part                 463095 non-null int64
age_num                   456081 non-null float64
victim_sex                463095 non-null object
crime_against             463095 non-null object
offense                   463095 non-null object
offense_category          463095 non-null object
location_id               463095 non-null int64
location_name             463095 non-null object
county                    463095 non-null object
population                463095 non-null int64
population_description    463095 non-null object
officers                  463095 non-null int64
civilians                 463095 non-null int64
crime_cnt                 463095 non-null int64
fire_cnt                  463095 non-null int64
hosp_cnt                  461467 non-null float64
bed_cnt                   461467 non-null float64
dtypes: floa

In [18]:
# Inspect training rows whose reported population is below 1.
train_df.loc[train_df['population'] < 1]

Unnamed: 0,incident_id,date_part,age_num,victim_sex,crime_against,offense,offense_category,location_id,location_name,county,population,population_description,officers,civilians,crime_cnt,fire_cnt,hosp_cnt,bed_cnt
18989,67831964,2013,22.0,M,Property,All Other Larceny,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18990,67831966,2013,18.0,M,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18991,67831968,2013,18.0,F,Property,Shoplifting,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18992,67831969,2013,21.0,M,Property,All Other Larceny,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18993,67831971,2013,20.0,M,Property,All Other Larceny,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18994,67831972,2013,18.0,F,Property,All Other Larceny,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18995,67831973,2013,18.0,M,Property,All Other Larceny,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18996,67831974,2013,19.0,M,Property,All Other Larceny,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18997,67831975,2013,20.0,M,Property,All Other Larceny,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0
18998,67831976,2013,23.0,M,Property,All Other Larceny,Larceny/Theft Offenses,22,School/College,SMITH,0,"Cities under 2,500",20,6,19368,25,5.0,1052.0


### Get crime count by county for label_df

In [19]:
# Count incident records per county in label_df -> columns: county, crime_cnt.
tmp_df5 = (
    label_df.groupby('county')['incident_id']
    .count()
    .reset_index(name='crime_cnt')
)
tmp_df5

Unnamed: 0,county,crime_cnt
0,BELL,3348
1,BEXAR,400
2,BRAZORIA,3043
3,BREWSTER,70
4,BURNET,343
5,CALHOUN,772
6,CAMERON,2
7,COLLIN,16937
8,CORYELL,115
9,DALLAS,4417


### Join to testing dataframe

In [20]:
# Attach the per-county incident total (crime_cnt) to every label_df record.
label_df = label_df.join(
    tmp_df5.set_index('county'),
    on='county',
    how='left',
)

In [21]:
# Preview the first five label_df records, transposed so each field is a row.
label_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
incident_id,87264241,87264244,87264248,87264256,87264264
date_part,2016,2016,2016,2016,2016
age_num,59,50,26,25,26
victim_sex,F,M,F,F,M
crime_against,Property,Property,Property,Person,Property
offense,Theft From Motor Vehicle,Credit Card/Automated Teller Machine Fraud,All Other Larceny,Simple Assault,All Other Larceny
offense_category,Larceny/Theft Offenses,Fraud Offenses,Larceny/Theft Offenses,Assault Offenses,Larceny/Theft Offenses
location_id,18,20,20,20,20
location_name,Parking/Drop Lot/Garage,Residence/Home,Residence/Home,Residence/Home,Residence/Home
county,BURNET,BURNET,BURNET,CAMERON,CAMERON


### Join hospital and fire station counts to testing dataframe

In [22]:
# Build the testing frame: every label_df record plus its county's fire-station
# and hospital aggregates.
test_df = label_df.join(
    hf_df.set_index('county'),
    on='county',
    how='left',
)

In [23]:
# Preview the first three testing rows, transposed so each field is a row.
test_df.head(n=3).transpose()

Unnamed: 0,0,1,2
incident_id,87264241,87264244,87264248
date_part,2016,2016,2016
age_num,59,50,26
victim_sex,F,M,F
crime_against,Property,Property,Property
offense,Theft From Motor Vehicle,Credit Card/Automated Teller Machine Fraud,All Other Larceny
offense_category,Larceny/Theft Offenses,Fraud Offenses,Larceny/Theft Offenses
location_id,18,20,20
location_name,Parking/Drop Lot/Garage,Residence/Home,Residence/Home
county,BURNET,BURNET,BURNET


In [24]:
# Summarise column dtypes and non-null counts of the joined testing frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162550 entries, 0 to 162549
Data columns (total 18 columns):
incident_id               162550 non-null int64
date_part                 162550 non-null int64
age_num                   160057 non-null float64
victim_sex                162550 non-null object
crime_against             162550 non-null object
offense                   162550 non-null object
offense_category          162550 non-null object
location_id               162550 non-null int64
location_name             162550 non-null object
county                    162550 non-null object
population                162550 non-null int64
population_description    162550 non-null object
officers                  162550 non-null int64
civilians                 162550 non-null int64
crime_cnt                 162550 non-null int64
fire_cnt                  162550 non-null int64
hosp_cnt                  161323 non-null float64
bed_cnt                   161323 non-null float64
dtypes: floa

## Pass training and testing dataframes through feature engineering script

In [None]:
# Features_Script is a project-local module that is not shown in this notebook.
# NOTE(review): presumably it applies the same steps as the manual feature
# engineering cell that follows (the notebook later compares the two results
# with DataFrame.equals) — confirm against the script itself.
from Features_Script import feature_engineering

# Returns the engineered training and testing frames, in that order.
trn_df, tst_df = feature_engineering(train_df, test_df)

### Do the feature engineering manually in Jupyter

In [None]:
# Manual feature engineering on the training frame.
#  1. Drop rows missing the victim age or the county hospital count
#     (each noted as affecting under 2% of rows).
#  2. Derive ratio features from the county-level counts.
# NOTE(review): population can be 0 (see the `population < 1` inspection earlier),
# so the *_pop_ratio columns may contain inf/NaN — confirm how that should be handled.
# NOTE(review): the ratio inputs were joined on `county`, which is also kept here
# as the label column — confirm this is not leaking the target into the features.
train_df = (
    train_df
    .dropna(subset=['age_num', 'hosp_cnt'])
    .assign(
        crime_pop_ratio=lambda d: d['crime_cnt'] / d['population'],
        beds_pop_ratio=lambda d: d['bed_cnt'] / d['population'],
        beds_crime_ratio=lambda d: d['bed_cnt'] / d['crime_cnt'],
        fire_pop_ratio=lambda d: d['fire_cnt'] / d['population'],
        fire_crime_ratio=lambda d: d['fire_cnt'] / d['crime_cnt'],
    )
)

# Keep only the model inputs plus the county label, in a fixed column order.
train_df = train_df[[
    'age_num', 'victim_sex', 'offense_category', 'location_id', 'population_description',
    'officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
    'fire_pop_ratio', 'fire_crime_ratio', 'county',
]]

# One-hot encode the categorical features.
train_df = pd.get_dummies(
    train_df,
    columns=['victim_sex', 'offense_category', 'location_id', 'population_description'],
)

### Confirm the script and the manual steps produce identical results

In [None]:
# True only if the manually engineered frame and the script's output have the
# same shape and the same elements (DataFrame.equals is a strict comparison).
train_df.equals(trn_df)

### Split the training and testing frames into features (X) and labels (y)

In [None]:
# Pull the label column out of each engineered frame. pop() removes `county`
# from the frame in place, so what is left is the feature matrix.
y_train, y_test = (frame.pop('county').values for frame in (trn_df, tst_df))
X_train, X_test = trn_df.values, tst_df.values

### Confirm shapes match

In [None]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

# Actually enforce the invariants instead of relying on eyeballing the prints:
# one label per row, and identical feature columns in train and test (separately
# one-hot-encoded frames can end up with different column counts).
assert X_train.shape[0] == y_train.shape[0], 'Training features and labels differ in row count'
assert X_test.shape[0] == y_test.shape[0], 'Testing features and labels differ in row count'
assert X_train.shape[1] == X_test.shape[1], 'Training and testing sets have different feature counts'

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, precision_recall_fscore_support

### Train Model

In [None]:
# Random forest with out-of-bag scoring enabled.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
#   deprecated in scikit-learn 1.1 and removed in 1.3.
# - random_state is fixed so that Restart & Run All reproduces the same forest.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    max_features='sqrt',
    oob_score=True,
    random_state=42,
)
clf.fit(X_train, y_train)

### Get Predictions

In [None]:
# Hard class predictions and per-class probabilities for the test features.
preds, proba_preds = clf.predict(X_test), clf.predict_proba(X_test)

### Calculate Scores

In [None]:
# calculate scores
precision_micro = precision_score(y_test, preds, average='micro')
precision_macro = precision_score(y_test, preds, average='macro')
precision_weighted = precision_score(y_test, preds, average='weighted')

recall_micro = recall_score(y_test, preds, average='micro')
recall_macro = recall_score(y_test, preds, average='macro')
recall_weighted = recall_score(y_test, preds, average='weighted')

# Each call returns a (precision, recall, f-score, support) tuple; with the
# default beta=1.0 the f-score entry is F1.
prfs_micro = precision_recall_fscore_support(y_test, preds, average='micro')
prfs_macro = precision_recall_fscore_support(y_test, preds, average='macro')
prfs_weighted = precision_recall_fscore_support(y_test, preds, average='weighted')

# Take F1 from scikit-learn rather than from the harmonic mean of the already
# averaged precision and recall: macro/weighted F1 is the (weighted) mean of the
# per-class F1 scores, which is a different number.
F_1_micro = prfs_micro[2]
F_1_macro = prfs_macro[2]
F_1_weighted = prfs_weighted[2]

# Out-of-bag estimate from training, and plain accuracy on the test set.
oob = clf.oob_score_
acc = accuracy_score(y_test, preds)


print('Precision Scores:', precision_micro, precision_macro, precision_weighted)
print('Recall Scores:', recall_micro, recall_macro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_macro, F_1_weighted)
print('oob Score:', oob)
print('Accuracy Score:', acc)
print('PRF_Support Scores:', prfs_micro, prfs_macro, prfs_weighted)