# Intelligent Model: Random Forest Classifier

In [1]:
import pandas as pd

### Read in query outputs to dataframes

In [2]:
crime_df = pd.read_csv('crime_query.csv')
label_df = pd.read_csv('label_query.csv')
h_df = pd.read_csv('hosp_query.csv')
f_df = pd.read_csv('fire_query.csv')
pop_df = pd.read_csv('census_populations.csv')

In [3]:
crime_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


### Get crime count by county and join to crime_df

In [4]:
tmp_df4 = crime_df.groupby(['county']).incident_id.agg('count')
tmp_df4 = tmp_df4.to_frame().reset_index()
tmp_df4 = tmp_df4.rename({'incident_id': 'crime_cnt'}, axis='columns')
tmp_df4

Unnamed: 0,county,crime_cnt
0,BELL,10240
1,BEXAR,35
2,BRAZORIA,9487
3,BREWSTER,61
4,BURNET,1303
5,CALHOUN,1634
6,CAMERON,3
7,CHEROKEE,396
8,COKE,2
9,COLLIN,55442


In [5]:
crime_df = crime_df.join(tmp_df4.set_index('county'), on='county')

In [6]:
crime_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


### Get hospital and bed counts by county and join to h_df

In [7]:
h_df.head()

Unnamed: 0,county,hosp_id,beds
0,HARRIS,25577030,1082.0
1,JONES,3379501,45.0
2,COLLIN,475013,73.0
3,POTTER,679106,451.0
4,BRAZORIA,1177515,64.0


In [8]:
h_df['beds'] = h_df['beds'].clip_lower(1)

In [9]:
tmp_df = h_df.groupby(['county']).hosp_id.agg('count')
tmp_df = tmp_df.to_frame().reset_index()
tmp_df = tmp_df.rename({'hosp_id': 'hosp_cnt'}, axis='columns')
tmp_df

Unnamed: 0,county,hosp_cnt
0,ANDERSON,1
1,ANDREWS,1
2,ANGELINA,2
3,ATASCOSA,1
4,AUSTIN,1
5,BAILEY,1
6,BASTROP,1
7,BAYLOR,1
8,BEE,1
9,BELL,7


In [10]:
tmp_df2 = h_df.groupby(['county']).beds.agg('sum')
tmp_df2 = tmp_df2.to_frame().reset_index()
tmp_df2 = tmp_df2.rename({'beds': 'bed_cnt'}, axis='columns')
tmp_df2

Unnamed: 0,county,bed_cnt
0,ANDERSON,86.0
1,ANDREWS,34.0
2,ANGELINA,420.0
3,ATASCOSA,67.0
4,AUSTIN,32.0
5,BAILEY,25.0
6,BASTROP,8.0
7,BAYLOR,49.0
8,BEE,69.0
9,BELL,896.0


In [11]:
h_df = tmp_df.join(tmp_df2.set_index('county'), on='county')
h_df

Unnamed: 0,county,hosp_cnt,bed_cnt
0,ANDERSON,1,86.0
1,ANDREWS,1,34.0
2,ANGELINA,2,420.0
3,ATASCOSA,1,67.0
4,AUSTIN,1,32.0
5,BAILEY,1,25.0
6,BASTROP,1,8.0
7,BAYLOR,1,49.0
8,BEE,1,69.0
9,BELL,7,896.0


### Get fire station count by county and join to f_df

In [12]:
f_df.head()

Unnamed: 0,county,fire_id
0,EL PASO,10410064
1,EL PASO,10410225
2,EL PASO,10139838
3,EL PASO,10139423
4,EL PASO,10410065


In [13]:
tmp_df3 = f_df.groupby(['county']).fire_id.agg('count')
tmp_df3 = tmp_df3.to_frame().reset_index()
tmp_df3 = tmp_df3.rename({'fire_id': 'fire_cnt'}, axis='columns')
tmp_df3

Unnamed: 0,county,fire_cnt
0,ANDERSON,21
1,ANDREWS,1
2,ANGELINA,18
3,ARANSAS,5
4,ARCHER,7
5,ARMSTRONG,2
6,ATASCOSA,2
7,AUSTIN,5
8,BAILEY,1
9,BANDERA,6


### Merge hospital dataframe and fire station dataframe

In [14]:
hf_df = tmp_df3.join(h_df.set_index('county'), on='county')
hf_df

Unnamed: 0,county,fire_cnt,hosp_cnt,bed_cnt
0,ANDERSON,21,1.0,86.0
1,ANDREWS,1,1.0,34.0
2,ANGELINA,18,2.0,420.0
3,ARANSAS,5,,
4,ARCHER,7,,
5,ARMSTRONG,2,,
6,ATASCOSA,2,1.0,67.0
7,AUSTIN,5,1.0,32.0
8,BAILEY,1,1.0,25.0
9,BANDERA,6,,


### Join to training dataframe

In [15]:
train_df = crime_df.join(hf_df.set_index('county'), on='county')

In [16]:
train_df.head(3).T

Unnamed: 0,0,1,2
incident_id,67693343,67693344,67693345
year,2013,2013,2013
age_num,46,57,51
victim_sex,F,M,F
crime_against,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses
location_id,20,18,18
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage
county,POTTER,POTTER,POTTER


In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463095 entries, 0 to 463094
Data columns (total 17 columns):
incident_id               463095 non-null int64
year                      463095 non-null int64
age_num                   456081 non-null float64
victim_sex                463095 non-null object
crime_against             463095 non-null object
offense                   463095 non-null object
offense_category          463095 non-null object
location_id               463095 non-null int64
location_name             463095 non-null object
county                    463095 non-null object
population_description    463095 non-null object
officers                  463095 non-null int64
civilians                 463095 non-null int64
crime_cnt                 463095 non-null int64
fire_cnt                  463095 non-null int64
hosp_cnt                  461467 non-null float64
bed_cnt                   461467 non-null float64
dtypes: float64(3), int64(7), object(7)
memory usage: 60.1+ 

### Get crime count by county for label_df

In [18]:
tmp_df5 = label_df.groupby(['county']).incident_id.agg('count')
tmp_df5 = tmp_df5.to_frame().reset_index()
tmp_df5 = tmp_df5.rename({'incident_id': 'crime_cnt'}, axis='columns')
tmp_df5

Unnamed: 0,county,crime_cnt
0,BELL,3348
1,BEXAR,400
2,BRAZORIA,3043
3,BREWSTER,70
4,BURNET,343
5,CALHOUN,772
6,CAMERON,2
7,COLLIN,16937
8,CORYELL,115
9,DALLAS,4417


### Join to testing dataframe

In [19]:
label_df = label_df.join(tmp_df5.set_index('county'), on='county')

In [20]:
label_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,87264241,87264244,87264248,87264256,87264264
year,2016,2016,2016,2016,2016
age_num,59,50,26,25,26
victim_sex,F,M,F,F,M
crime_against,Property,Property,Property,Person,Property
offense,Theft From Motor Vehicle,Credit Card/Automated Teller Machine Fraud,All Other Larceny,Simple Assault,All Other Larceny
offense_category,Larceny/Theft Offenses,Fraud Offenses,Larceny/Theft Offenses,Assault Offenses,Larceny/Theft Offenses
location_id,18,20,20,20,20
location_name,Parking/Drop Lot/Garage,Residence/Home,Residence/Home,Residence/Home,Residence/Home
county,BURNET,BURNET,BURNET,CAMERON,CAMERON


### Join hospital and fire station counts to testing dataframe

In [21]:
test_df = label_df.join(hf_df.set_index('county'), on='county')

In [22]:
test_df.head(3).T

Unnamed: 0,0,1,2
incident_id,87264241,87264244,87264248
year,2016,2016,2016
age_num,59,50,26
victim_sex,F,M,F
crime_against,Property,Property,Property
offense,Theft From Motor Vehicle,Credit Card/Automated Teller Machine Fraud,All Other Larceny
offense_category,Larceny/Theft Offenses,Fraud Offenses,Larceny/Theft Offenses
location_id,18,20,20
location_name,Parking/Drop Lot/Garage,Residence/Home,Residence/Home
county,BURNET,BURNET,BURNET


In [23]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162550 entries, 0 to 162549
Data columns (total 17 columns):
incident_id               162550 non-null int64
year                      162550 non-null int64
age_num                   160057 non-null float64
victim_sex                162550 non-null object
crime_against             162550 non-null object
offense                   162550 non-null object
offense_category          162550 non-null object
location_id               162550 non-null int64
location_name             162550 non-null object
county                    162550 non-null object
population_description    162550 non-null object
officers                  162550 non-null int64
civilians                 162550 non-null int64
crime_cnt                 162550 non-null int64
fire_cnt                  162550 non-null int64
hosp_cnt                  161323 non-null float64
bed_cnt                   161323 non-null float64
dtypes: float64(3), int64(7), object(7)
memory usage: 21.1+ 

### Bring in population feature

In [24]:
pop_df.head()

Unnamed: 0,county,year,population
0,Anderson,2010,58477
1,Anderson,2010,58477
2,Anderson,2010,58477
3,Anderson,2010,58477
4,Anderson,2011,58379


In [25]:
pop_df['county'] = pop_df['county'].str.upper()

In [26]:
pop_df.head()

Unnamed: 0,county,year,population
0,ANDERSON,2010,58477
1,ANDERSON,2010,58477
2,ANDERSON,2010,58477
3,ANDERSON,2010,58477
4,ANDERSON,2011,58379


In [27]:
train_df =pd.merge(train_df, pop_df, how = 'outer', on=['county','year'])
train_df

Unnamed: 0,incident_id,year,age_num,victim_sex,crime_against,offense,offense_category,location_id,location_name,county,population_description,officers,civilians,crime_cnt,fire_cnt,hosp_cnt,bed_cnt,population
0,67693343.0,2013,46.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
1,67693343.0,2013,46.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
2,67693344.0,2013,57.0,M,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
3,67693344.0,2013,57.0,M,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
4,67693345.0,2013,51.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
5,67693345.0,2013,51.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
6,67693346.0,2013,24.0,M,Property,All Other Larceny,Larceny/Theft Offenses,5.0,Commercial/Office Building,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
7,67693346.0,2013,24.0,M,Property,All Other Larceny,Larceny/Theft Offenses,5.0,Commercial/Office Building,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
8,67693348.0,2013,49.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
9,67693348.0,2013,49.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0


In [28]:
train_df.dropna(axis=0, subset=['incident_id'], inplace=True)

In [29]:
train_df

Unnamed: 0,incident_id,year,age_num,victim_sex,crime_against,offense,offense_category,location_id,location_name,county,population_description,officers,civilians,crime_cnt,fire_cnt,hosp_cnt,bed_cnt,population
0,67693343.0,2013,46.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
1,67693343.0,2013,46.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
2,67693344.0,2013,57.0,M,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
3,67693344.0,2013,57.0,M,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
4,67693345.0,2013,51.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
5,67693345.0,2013,51.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
6,67693346.0,2013,24.0,M,Property,All Other Larceny,Larceny/Theft Offenses,5.0,Commercial/Office Building,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
7,67693346.0,2013,24.0,M,Property,All Other Larceny,Larceny/Theft Offenses,5.0,Commercial/Office Building,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
8,67693348.0,2013,49.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0
9,67693348.0,2013,49.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,"Cities from 100,000 thru 249,000",343.0,68.0,47721.0,12.0,5.0,878.0,122088.0


In [30]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 925402 entries, 0 to 925401
Data columns (total 18 columns):
incident_id               925402 non-null float64
year                      925402 non-null int64
age_num                   911407 non-null float64
victim_sex                925402 non-null object
crime_against             925402 non-null object
offense                   925402 non-null object
offense_category          925402 non-null object
location_id               925402 non-null float64
location_name             925402 non-null object
county                    925402 non-null object
population_description    925402 non-null object
officers                  925402 non-null float64
civilians                 925402 non-null float64
crime_cnt                 925402 non-null float64
fire_cnt                  925402 non-null float64
hosp_cnt                  922934 non-null float64
bed_cnt                   922934 non-null float64
population                924614 non-null float6

In [31]:
test_df =pd.merge(test_df, pop_df, how = 'outer', on=['county','year'])
test_df

Unnamed: 0,incident_id,year,age_num,victim_sex,crime_against,offense,offense_category,location_id,location_name,county,population_description,officers,civilians,crime_cnt,fire_cnt,hosp_cnt,bed_cnt,population
0,87264241.0,2016,59.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
1,87264241.0,2016,59.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
2,87264244.0,2016,50.0,M,Property,Credit Card/Automated Teller Machine Fraud,Fraud Offenses,20.0,Residence/Home,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
3,87264244.0,2016,50.0,M,Property,Credit Card/Automated Teller Machine Fraud,Fraud Offenses,20.0,Residence/Home,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
4,87264248.0,2016,26.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
5,87264248.0,2016,26.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
6,87264268.0,2016,41.0,M,Person,Simple Assault,Assault Offenses,36.0,Industrial Site,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
7,87264268.0,2016,41.0,M,Person,Simple Assault,Assault Offenses,36.0,Industrial Site,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
8,87264307.0,2016,33.0,M,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property,13.0,Highway/Road/Alley/Street/Sidewalk,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0
9,87264307.0,2016,33.0,M,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property,13.0,Highway/Road/Alley/Street/Sidewalk,BURNET,"Cities from 2,500 thru 9,999",19.0,13.0,343.0,10.0,2.0,71.0,45914.0


In [32]:
test_df.dropna(axis=0, subset=['incident_id'], inplace=True)

In [33]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324535 entries, 0 to 324534
Data columns (total 18 columns):
incident_id               324535 non-null float64
year                      324535 non-null int64
age_num                   319574 non-null float64
victim_sex                324535 non-null object
crime_against             324535 non-null object
offense                   324535 non-null object
offense_category          324535 non-null object
location_id               324535 non-null float64
location_name             324535 non-null object
county                    324535 non-null object
population_description    324535 non-null object
officers                  324535 non-null float64
civilians                 324535 non-null float64
crime_cnt                 324535 non-null float64
fire_cnt                  324535 non-null float64
hosp_cnt                  322613 non-null float64
bed_cnt                   322613 non-null float64
population                323970 non-null float6

## Pass training and testing dataframes through feature engineering script

In [34]:
from Features_Script import feature_engineering

trn_df, tst_df = feature_engineering(train_df, test_df)

### Do the feature engineering manually in jupyter

In [35]:
# Drop NA in victim age column (<2%)
train_df.dropna(axis=0, subset=['age_num'], inplace=True)

# Drop NA in hospital_cnt column (<2%)
train_df.dropna(axis=0, subset=['hosp_cnt'], inplace=True)

# Drop NA in population column (<2%)
train_df.dropna(axis=0, subset=['population'], inplace=True)

# Create Ratio Columns
train_df['crime_pop_ratio'] = train_df['crime_cnt'] / train_df['population']
train_df['beds_pop_ratio'] = train_df['bed_cnt'] / train_df['population']
train_df['beds_crime_ratio'] = train_df['bed_cnt'] / train_df['crime_cnt']
train_df['fire_pop_ratio'] = train_df['fire_cnt'] / train_df['population']
train_df['fire_crime_ratio'] = train_df['fire_cnt'] / train_df['crime_cnt']

# Reduce df to only desired features to train/test model
train_df = train_df[['age_num', 'victim_sex', 'offense_category', 'location_id', 'population_description',
                     'officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
                     'fire_pop_ratio', 'fire_crime_ratio', 'county']]

# Dummize features
train_df = pd.get_dummies(train_df, columns=['victim_sex', 'offense_category', 'location_id', 'population_description'])

In [36]:
# Drop NA in victim age column (<2%)
test_df.dropna(axis=0, subset=['age_num'], inplace=True)

# Drop NA in hospital_cnt column (<2%)
test_df.dropna(axis=0, subset=['hosp_cnt'], inplace=True)

# Drop NA in population column (<2%)
test_df.dropna(axis=0, subset=['population'], inplace=True)

# Create Ratio Columns
test_df['crime_pop_ratio'] = test_df['crime_cnt'] / test_df['population']
test_df['beds_pop_ratio'] = test_df['bed_cnt'] / test_df['population']
test_df['beds_crime_ratio'] = test_df['bed_cnt'] / test_df['crime_cnt']
test_df['fire_pop_ratio'] = test_df['fire_cnt'] / test_df['population']
test_df['fire_crime_ratio'] = test_df['fire_cnt'] / test_df['crime_cnt']

# Reduce df to only desired features to train/test model
test_df = test_df[['age_num', 'victim_sex', 'offense_category', 'location_id', 'population_description',
                     'officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
                     'fire_pop_ratio', 'fire_crime_ratio', 'county']]

# Dummize features
test_df = pd.get_dummies(test_df, columns=['victim_sex', 'offense_category', 'location_id', 'population_description'])

### Confirm script and manual produce identical results

In [37]:
train_df.equals(trn_df)

True

In [38]:
test_df.equals(tst_df)

True

### Split data into training and testing sets

In [39]:
y_train = trn_df.pop('county').values
X_train = trn_df.values
y_test = tst_df.pop('county').values
X_test = tst_df.values

### Confirm shapes match

In [40]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (909046, 91)
Training Labels Shape: (909046,)
Testing Features Shape: (317694, 91)
Testing Labels Shape: (317694,)


## Random Forest Classifier

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

  from numpy.core.umath_tests import inner1d


### Train Model

In [42]:
clf = RandomForestClassifier(oob_score=True, n_estimators=100, max_depth=50, max_features='auto')
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

### Get Predictions

In [43]:
preds = clf.predict(X_test)
proba_preds = clf.predict_proba(X_test)

### Calculate Scores

In [44]:
# calculate scores
precision_micro = precision_score(y_test, preds, average = 'micro')
precision_weighted = precision_score(y_test, preds, average = 'weighted')

recall_micro = recall_score(y_test, preds, average = 'micro')
recall_weighted = recall_score(y_test, preds, average = 'weighted')

F_1_micro = 2 / ((1/precision_micro) + (1/recall_micro))
F_1_weighted = 2 / ((1/precision_weighted) + (1/recall_weighted))

score = clf.score(X_test, y_test)
oob=clf.oob_score_
acc = accuracy_score(y_test, preds)


print('Precision Scores:', precision_micro, precision_weighted)
print('Recall Scores:', recall_micro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_weighted)
print('Score:', score)
print('oob Score:', oob)
print('Accuracy Score:', acc)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Precision Scores: 0.8516685867532909 0.8618213509666313
Recall Scores: 0.8516685867532909 0.8516685867532909
F 1 Scores: 0.851668586753291 0.8567148902995616
oob Score: 1.0
Accuracy Score: 0.8516685867532909


### Feature Importances

In [45]:
import operator
feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(trn_df.columns, clf.feature_importances_):
    feats[feature] = importance
feats = sorted(feats.items(), key = operator.itemgetter(1), reverse=True)
feats

[('fire_crime_ratio', 0.1375249652157433),
 ('civilians', 0.1343632093567462),
 ('officers', 0.13435318685952097),
 ('crime_pop_ratio', 0.11329065761107428),
 ('beds_crime_ratio', 0.10906238943709269),
 ('beds_pop_ratio', 0.10070427286999693),
 ('fire_pop_ratio', 0.08993893872128997),
 ('population_description_Cities from 500,000 thru 999,999',
  0.06695150242822526),
 ('population_description_Cities from 100,000 thru 249,000',
  0.04295520602849134),
 ('population_description_Cities from 50,000 thru 99,000',
  0.02112187631675585),
 ('population_description_MSA counties from 25,000 thru 99,999',
  0.010426138628865),
 ('population_description_Cities from 25,000 thru 49,999',
  0.009682190672943658),
 ('population_description_Cities from 250,000 thru 499,999',
  0.007656964984387976),
 ('population_description_Cities from 10,000 thru 24,999',
  0.00559663075463441),
 ('population_description_Non-MSA counties from 25,000 thru 99,999',
  0.00529725536462732),
 ('population_description_No