# Intelligent Model: Random Forest Classifier

In [1]:
import pandas as pd

### Read in query outputs to dataframes

In [2]:
crime_df = pd.read_csv('crime_query.csv')
label_df = pd.read_csv('label_query.csv')
h_df = pd.read_csv('hosp_query.csv')
f_df = pd.read_csv('fire_query.csv')
pop_df = pd.read_csv('census_population.csv')

In [3]:
crime_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


### Get crime count by county and join to crime_df

In [4]:
tmp_df4 = crime_df.groupby(['county']).incident_id.agg('count')
tmp_df4 = tmp_df4.to_frame().reset_index()
tmp_df4 = tmp_df4.rename({'incident_id': 'crime_cnt'}, axis='columns')
tmp_df4

Unnamed: 0,county,crime_cnt
0,BELL,10240
1,BEXAR,35
2,BRAZORIA,9487
3,BREWSTER,61
4,BURNET,1303
5,CALHOUN,1634
6,CAMERON,3
7,CHEROKEE,396
8,COKE,2
9,COLLIN,55442


In [5]:
crime_df = crime_df.join(tmp_df4.set_index('county'), on='county')

In [6]:
crime_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


### Get hospital and bed counts by county and join to h_df

In [7]:
h_df.head()

Unnamed: 0,county,hosp_id,beds
0,HARRIS,25577030,1082.0
1,JONES,3379501,45.0
2,COLLIN,475013,73.0
3,POTTER,679106,451.0
4,BRAZORIA,1177515,64.0


In [8]:
h_df['beds'] = h_df['beds'].clip_lower(1)

In [9]:
tmp_df = h_df.groupby(['county']).hosp_id.agg('count')
tmp_df = tmp_df.to_frame().reset_index()
tmp_df = tmp_df.rename({'hosp_id': 'hosp_cnt'}, axis='columns')
tmp_df

Unnamed: 0,county,hosp_cnt
0,ANDERSON,1
1,ANDREWS,1
2,ANGELINA,2
3,ATASCOSA,1
4,AUSTIN,1
5,BAILEY,1
6,BASTROP,1
7,BAYLOR,1
8,BEE,1
9,BELL,7


In [10]:
tmp_df2 = h_df.groupby(['county']).beds.agg('sum')
tmp_df2 = tmp_df2.to_frame().reset_index()
tmp_df2 = tmp_df2.rename({'beds': 'bed_cnt'}, axis='columns')
tmp_df2

Unnamed: 0,county,bed_cnt
0,ANDERSON,86.0
1,ANDREWS,34.0
2,ANGELINA,420.0
3,ATASCOSA,67.0
4,AUSTIN,32.0
5,BAILEY,25.0
6,BASTROP,8.0
7,BAYLOR,49.0
8,BEE,69.0
9,BELL,896.0


In [11]:
h_df = tmp_df.join(tmp_df2.set_index('county'), on='county')
h_df

Unnamed: 0,county,hosp_cnt,bed_cnt
0,ANDERSON,1,86.0
1,ANDREWS,1,34.0
2,ANGELINA,2,420.0
3,ATASCOSA,1,67.0
4,AUSTIN,1,32.0
5,BAILEY,1,25.0
6,BASTROP,1,8.0
7,BAYLOR,1,49.0
8,BEE,1,69.0
9,BELL,7,896.0


### Get fire station count by county and join to f_df

In [12]:
f_df.head()

Unnamed: 0,county,fire_id
0,EL PASO,10410064
1,EL PASO,10410225
2,EL PASO,10139838
3,EL PASO,10139423
4,EL PASO,10410065


In [13]:
tmp_df3 = f_df.groupby(['county']).fire_id.agg('count')
tmp_df3 = tmp_df3.to_frame().reset_index()
tmp_df3 = tmp_df3.rename({'fire_id': 'fire_cnt'}, axis='columns')
tmp_df3

Unnamed: 0,county,fire_cnt
0,ANDERSON,21
1,ANDREWS,1
2,ANGELINA,18
3,ARANSAS,5
4,ARCHER,7
5,ARMSTRONG,2
6,ATASCOSA,2
7,AUSTIN,5
8,BAILEY,1
9,BANDERA,6


### Merge hospital dataframe and fire station dataframe

In [14]:
hf_df = tmp_df3.join(h_df.set_index('county'), on='county')
hf_df

Unnamed: 0,county,fire_cnt,hosp_cnt,bed_cnt
0,ANDERSON,21,1.0,86.0
1,ANDREWS,1,1.0,34.0
2,ANGELINA,18,2.0,420.0
3,ARANSAS,5,,
4,ARCHER,7,,
5,ARMSTRONG,2,,
6,ATASCOSA,2,1.0,67.0
7,AUSTIN,5,1.0,32.0
8,BAILEY,1,1.0,25.0
9,BANDERA,6,,


### Join to training dataframe

In [15]:
train_df = crime_df.join(hf_df.set_index('county'), on='county')

In [16]:
train_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463095 entries, 0 to 463094
Data columns (total 16 columns):
incident_id         463095 non-null int64
year                463095 non-null int64
age_num             456081 non-null float64
victim_sex          463095 non-null object
crime_against       463095 non-null object
offense             463095 non-null object
offense_category    463095 non-null object
location_id         463095 non-null int64
location_name       463095 non-null object
county              463095 non-null object
officers            463095 non-null int64
civilians           463095 non-null int64
crime_cnt           463095 non-null int64
fire_cnt            463095 non-null int64
hosp_cnt            461467 non-null float64
bed_cnt             461467 non-null float64
dtypes: float64(3), int64(7), object(6)
memory usage: 56.5+ MB


### Get crime count by county for label_df

In [18]:
tmp_df5 = label_df.groupby(['county']).incident_id.agg('count')
tmp_df5 = tmp_df5.to_frame().reset_index()
tmp_df5 = tmp_df5.rename({'incident_id': 'crime_cnt'}, axis='columns')
tmp_df5

Unnamed: 0,county,crime_cnt
0,BELL,3348
1,BEXAR,400
2,BRAZORIA,3043
3,BREWSTER,70
4,BURNET,343
5,CALHOUN,772
6,CAMERON,2
7,COLLIN,16937
8,CORYELL,115
9,DALLAS,4417


### Join to testing dataframe

In [19]:
label_df = label_df.join(tmp_df5.set_index('county'), on='county')

In [20]:
label_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,87264241,87264244,87264248,87264256,87264264
year,2016,2016,2016,2016,2016
age_num,59,50,26,25,26
victim_sex,F,M,F,F,M
crime_against,Property,Property,Property,Person,Property
offense,Theft From Motor Vehicle,Credit Card/Automated Teller Machine Fraud,All Other Larceny,Simple Assault,All Other Larceny
offense_category,Larceny/Theft Offenses,Fraud Offenses,Larceny/Theft Offenses,Assault Offenses,Larceny/Theft Offenses
location_id,18,20,20,20,20
location_name,Parking/Drop Lot/Garage,Residence/Home,Residence/Home,Residence/Home,Residence/Home
county,BURNET,BURNET,BURNET,CAMERON,CAMERON


### Join hospital and fire station counts to testing dataframe

In [21]:
test_df = label_df.join(hf_df.set_index('county'), on='county')

In [22]:
test_df.head(3).T

Unnamed: 0,0,1,2
incident_id,87264241,87264244,87264248
year,2016,2016,2016
age_num,59,50,26
victim_sex,F,M,F
crime_against,Property,Property,Property
offense,Theft From Motor Vehicle,Credit Card/Automated Teller Machine Fraud,All Other Larceny
offense_category,Larceny/Theft Offenses,Fraud Offenses,Larceny/Theft Offenses
location_id,18,20,20
location_name,Parking/Drop Lot/Garage,Residence/Home,Residence/Home
county,BURNET,BURNET,BURNET


In [23]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162550 entries, 0 to 162549
Data columns (total 16 columns):
incident_id         162550 non-null int64
year                162550 non-null int64
age_num             160057 non-null float64
victim_sex          162550 non-null object
crime_against       162550 non-null object
offense             162550 non-null object
offense_category    162550 non-null object
location_id         162550 non-null int64
location_name       162550 non-null object
county              162550 non-null object
officers            162550 non-null int64
civilians           162550 non-null int64
crime_cnt           162550 non-null int64
fire_cnt            162550 non-null int64
hosp_cnt            161323 non-null float64
bed_cnt             161323 non-null float64
dtypes: float64(3), int64(7), object(6)
memory usage: 19.8+ MB


### Bring in population feature

In [24]:
pop_df.head()

Unnamed: 0,county,year,population
0,ANDERSON,2010,58477
1,ANDERSON,2011,58379
2,ANDERSON,2012,58036
3,ANDERSON,2013,57960
4,ANDERSON,2014,57837


In [25]:
pop_df['county'] = pop_df['county'].str.upper()
pop_df.drop_duplicates(['county','year'], inplace=True)

In [26]:
pop_df.head()

Unnamed: 0,county,year,population
0,ANDERSON,2010,58477
1,ANDERSON,2011,58379
2,ANDERSON,2012,58036
3,ANDERSON,2013,57960
4,ANDERSON,2014,57837


In [27]:
pop_df['population_description'] = pd.cut(pop_df['population'], [0, 25000, 100000, 500000, 10000000],
                                labels=['Under 25,000', '25,000 - 99,999', '100,000 - 499,999','Over 500,000'])

In [28]:
pop_df.head(10)

Unnamed: 0,county,year,population,population_description
0,ANDERSON,2010,58477,"25,000 - 99,999"
1,ANDERSON,2011,58379,"25,000 - 99,999"
2,ANDERSON,2012,58036,"25,000 - 99,999"
3,ANDERSON,2013,57960,"25,000 - 99,999"
4,ANDERSON,2014,57837,"25,000 - 99,999"
5,ANDERSON,2015,57641,"25,000 - 99,999"
6,ANDERSON,2016,57558,"25,000 - 99,999"
7,ANDERSON,2017,57741,"25,000 - 99,999"
8,ANDREWS,2010,14817,"Under 25,000"
9,ANDREWS,2011,15386,"Under 25,000"


In [29]:
pop_df['population_description'].value_counts()

Under 25,000         1188
25,000 - 99,999       500
100,000 - 499,999     225
Over 500,000           87
Name: population_description, dtype: int64

In [30]:
train_df =pd.merge(train_df, pop_df, how = 'outer', on=['county','year'])
train_df

Unnamed: 0,incident_id,year,age_num,victim_sex,crime_against,offense,offense_category,location_id,location_name,county,officers,civilians,crime_cnt,fire_cnt,hosp_cnt,bed_cnt,population,population_description
0,67693343.0,2013,46.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
1,67693344.0,2013,57.0,M,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
2,67693345.0,2013,51.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
3,67693346.0,2013,24.0,M,Property,All Other Larceny,Larceny/Theft Offenses,5.0,Commercial/Office Building,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
4,67693348.0,2013,49.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
5,67693350.0,2013,26.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
6,67693351.0,2013,44.0,F,Property,All Other Larceny,Larceny/Theft Offenses,13.0,Highway/Road/Alley/Street/Sidewalk,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
7,67693352.0,2013,62.0,F,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
8,67693353.0,2013,18.0,F,Person,Simple Assault,Assault Offenses,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
9,67693353.0,2013,51.0,M,Person,Simple Assault,Assault Offenses,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"


In [31]:
train_df.dropna(axis=0, subset=['incident_id'], inplace=True)

In [32]:
train_df

Unnamed: 0,incident_id,year,age_num,victim_sex,crime_against,offense,offense_category,location_id,location_name,county,officers,civilians,crime_cnt,fire_cnt,hosp_cnt,bed_cnt,population,population_description
0,67693343.0,2013,46.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
1,67693344.0,2013,57.0,M,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
2,67693345.0,2013,51.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
3,67693346.0,2013,24.0,M,Property,All Other Larceny,Larceny/Theft Offenses,5.0,Commercial/Office Building,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
4,67693348.0,2013,49.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
5,67693350.0,2013,26.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
6,67693351.0,2013,44.0,F,Property,All Other Larceny,Larceny/Theft Offenses,13.0,Highway/Road/Alley/Street/Sidewalk,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
7,67693352.0,2013,62.0,F,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
8,67693353.0,2013,18.0,F,Person,Simple Assault,Assault Offenses,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
9,67693353.0,2013,51.0,M,Person,Simple Assault,Assault Offenses,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"


In [33]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 463095 entries, 0 to 463094
Data columns (total 18 columns):
incident_id               463095 non-null float64
year                      463095 non-null int64
age_num                   456081 non-null float64
victim_sex                463095 non-null object
crime_against             463095 non-null object
offense                   463095 non-null object
offense_category          463095 non-null object
location_id               463095 non-null float64
location_name             463095 non-null object
county                    463095 non-null object
officers                  463095 non-null float64
civilians                 463095 non-null float64
crime_cnt                 463095 non-null float64
fire_cnt                  463095 non-null float64
hosp_cnt                  461467 non-null float64
bed_cnt                   461467 non-null float64
population                462307 non-null float64
population_description    462307 non-null categ

In [34]:
test_df =pd.merge(test_df, pop_df, how = 'outer', on=['county','year'])
test_df

Unnamed: 0,incident_id,year,age_num,victim_sex,crime_against,offense,offense_category,location_id,location_name,county,officers,civilians,crime_cnt,fire_cnt,hosp_cnt,bed_cnt,population,population_description
0,87264241.0,2016,59.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
1,87264244.0,2016,50.0,M,Property,Credit Card/Automated Teller Machine Fraud,Fraud Offenses,20.0,Residence/Home,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
2,87264248.0,2016,26.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
3,87264268.0,2016,41.0,M,Person,Simple Assault,Assault Offenses,36.0,Industrial Site,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
4,87264307.0,2016,33.0,M,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property,13.0,Highway/Road/Alley/Street/Sidewalk,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
5,87264319.0,2016,23.0,F,Property,False Pretenses/Swindle/Confidence Game,Fraud Offenses,8.0,Department/Discount Store,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
6,87264341.0,2016,48.0,F,Property,Credit Card/Automated Teller Machine Fraud,Fraud Offenses,14.0,Hotel/Motel/Etc.,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
7,87264368.0,2016,26.0,M,Property,Credit Card/Automated Teller Machine Fraud,Fraud Offenses,8.0,Department/Discount Store,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
8,87264408.0,2016,16.0,F,Person,Simple Assault,Assault Offenses,41.0,School-Elementary/Secondary,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"
9,87264412.0,2016,55.0,F,Property,All Other Larceny,Larceny/Theft Offenses,8.0,Department/Discount Store,BURNET,19.0,13.0,343.0,10.0,2.0,71.0,45914.0,"25,000 - 99,999"


In [35]:
test_df.dropna(axis=0, subset=['incident_id'], inplace=True)

In [36]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162550 entries, 0 to 162549
Data columns (total 18 columns):
incident_id               162550 non-null float64
year                      162550 non-null int64
age_num                   160057 non-null float64
victim_sex                162550 non-null object
crime_against             162550 non-null object
offense                   162550 non-null object
offense_category          162550 non-null object
location_id               162550 non-null float64
location_name             162550 non-null object
county                    162550 non-null object
officers                  162550 non-null float64
civilians                 162550 non-null float64
crime_cnt                 162550 non-null float64
fire_cnt                  162550 non-null float64
hosp_cnt                  161323 non-null float64
bed_cnt                   161323 non-null float64
population                161985 non-null float64
population_description    161985 non-null categ

## Pass training and testing dataframes through feature engineering script

In [37]:
from Scripts.Features_Script_Train_and_Test import feature_engineering

trn_df, tst_df = feature_engineering(train_df, test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train_df['crime_pop_ratio'] = train_df['crime_cnt'] / train_df['population']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train_df['beds_pop_ratio'] = train_df['bed_cnt'] / train_df['population']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train_df['beds_crime_ratio'] = train_df['bed_cnt'] / train_df['crime_cnt']
A value is trying to be set on

In [38]:
train_df.offense_category.value_counts()

Larceny/Theft Offenses                      145426
Assault Offenses                            117980
Destruction/Damage/Vandalism of Property     55336
Burglary/Breaking & Entering                 52531
Fraud Offenses                               33826
Motor Vehicle Theft                          18010
Robbery                                       8766
Sex Offenses                                  8518
Counterfeiting/Forgery                        4900
Drug/Narcotic Offenses                        4682
Kidnapping/Abduction                          1878
Weapon Law Violations                          889
Arson                                          628
Homicide Offenses                              427
Stolen Property Offenses                       387
Embezzlement                                   188
Extortion/Blackmail                             49
Pornography/Obscene Material                    45
Prostitution Offenses                           21
Human Trafficking              

In [39]:
test_df.offense_category.value_counts()

Larceny/Theft Offenses                      50292
Assault Offenses                            43556
Destruction/Damage/Vandalism of Property    19031
Burglary/Breaking & Entering                15738
Fraud Offenses                              11834
Motor Vehicle Theft                          6774
Robbery                                      3195
Sex Offenses                                 2915
Counterfeiting/Forgery                       1858
Drug/Narcotic Offenses                       1685
Kidnapping/Abduction                          807
Weapon Law Violations                         420
Arson                                         218
Stolen Property Offenses                      192
Homicide Offenses                             163
Embezzlement                                   66
Extortion/Blackmail                            41
Pornography/Obscene Material                   36
Human Trafficking                              18
Prostitution Offenses                           5


### Do the feature engineering manually in jupyter

In [40]:
# Drop NA in victim age column (<2%)
train_df.dropna(axis=0, subset=['age_num'], inplace=True)

# Drop NA in hospital_cnt column (<2%)
train_df.dropna(axis=0, subset=['hosp_cnt'], inplace=True)

# Drop NA in population column (<2%)
train_df.dropna(axis=0, subset=['population'], inplace=True)

# Drop offense_category_Gambling Offenses
train_df = train_df[train_df.offense_category != 'Gambling Offenses']

# Create Ratio Columns
train_df['crime_pop_ratio'] = train_df['crime_cnt'] / train_df['population']
train_df['beds_pop_ratio'] = train_df['bed_cnt'] / train_df['population']
train_df['beds_crime_ratio'] = train_df['bed_cnt'] / train_df['crime_cnt']
train_df['fire_pop_ratio'] = train_df['fire_cnt'] / train_df['population']
train_df['fire_crime_ratio'] = train_df['fire_cnt'] / train_df['crime_cnt']

# Reduce df to only desired features to train/test model
train_df = train_df[['age_num', 'victim_sex', 'offense_category', 'population_description',
                     'officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
                     'fire_pop_ratio', 'fire_crime_ratio', 'county']]

# Dummize features
train_df = pd.get_dummies(train_df, columns=['victim_sex', 'offense_category', 'population_description'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pan

In [41]:
# Drop NA in victim age column (<2%)
test_df.dropna(axis=0, subset=['age_num'], inplace=True)

# Drop NA in hospital_cnt column (<2%)
test_df.dropna(axis=0, subset=['hosp_cnt'], inplace=True)

# Drop NA in population column (<2%)
test_df.dropna(axis=0, subset=['population'], inplace=True)

# Create Ratio Columns
test_df['crime_pop_ratio'] = test_df['crime_cnt'] / test_df['population']
test_df['beds_pop_ratio'] = test_df['bed_cnt'] / test_df['population']
test_df['beds_crime_ratio'] = test_df['bed_cnt'] / test_df['crime_cnt']
test_df['fire_pop_ratio'] = test_df['fire_cnt'] / test_df['population']
test_df['fire_crime_ratio'] = test_df['fire_cnt'] / test_df['crime_cnt']

# Reduce df to only desired features to train/test model
test_df = test_df[['age_num', 'victim_sex', 'offense_category', 'population_description',
                     'officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
                     'fire_pop_ratio', 'fire_crime_ratio', 'county']]

# Dummize features
test_df = pd.get_dummies(test_df, columns=['victim_sex', 'offense_category', 'population_description'])

### Confirm script and manual produce identical results

In [42]:
train_df.equals(trn_df)

True

In [43]:
test_df.equals(tst_df)

True

### Split data into training and testing sets

In [44]:
y_train = trn_df.pop('county').values
X_train = trn_df.values
y_test = tst_df.pop('county').values
X_test = tst_df.values

### Confirm shapes match

In [45]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (454522, 36)
Training Labels Shape: (454522,)
Testing Features Shape: (158847, 36)
Testing Labels Shape: (158847,)


## Random Forest Classifier

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

  from numpy.core.umath_tests import inner1d


### Train Model

In [47]:
clf = RandomForestClassifier(oob_score=True, n_estimators=50, max_depth=10, min_samples_split=2)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

### Get Predictions

In [48]:
preds = clf.predict(X_test)
proba_preds = clf.predict_proba(X_test)

### Calculate Scores

In [49]:
# calculate scores
precision_micro = precision_score(y_test, preds, average = 'micro')
precision_weighted = precision_score(y_test, preds, average = 'weighted')

recall_micro = recall_score(y_test, preds, average = 'micro')
recall_weighted = recall_score(y_test, preds, average = 'weighted')

F_1_micro = 2 / ((1/precision_micro) + (1/recall_micro))
F_1_weighted = 2 / ((1/precision_weighted) + (1/recall_weighted))

score = clf.score(X_test, y_test)
oob=clf.oob_score_
acc = accuracy_score(y_test, preds)


print('Precision Scores:', precision_micro, precision_weighted)
print('Recall Scores:', recall_micro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_weighted)
print('Score:', score)
print('oob Score:', oob)
print('Accuracy Score:', acc)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Precision Scores: 0.8654176660560162 0.8512331276868459
Recall Scores: 0.8654176660560162 0.8654176660560162
F 1 Scores: 0.8654176660560162 0.8582667940590596
Score: 0.8654176660560162
oob Score: 0.9997865889879918
Accuracy Score: 0.8654176660560162


### Feature Importances

In [50]:
import operator
feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(trn_df.columns, clf.feature_importances_):
    feats[feature] = importance
feats = sorted(feats.items(), key = operator.itemgetter(1), reverse=True)
feats

[('fire_crime_ratio', 0.1710038900924333),
 ('officers', 0.1617140981623154),
 ('crime_pop_ratio', 0.138223378531921),
 ('beds_pop_ratio', 0.11553927643158073),
 ('civilians', 0.10504486236968164),
 ('fire_pop_ratio', 0.10340217537894413),
 ('beds_crime_ratio', 0.10161207673931949),
 ('population_description_Over 500,000', 0.0505020332370935),
 ('population_description_100,000 - 499,999', 0.03392344830555656),
 ('population_description_25,000 - 99,999', 0.01600890499868405),
 ('population_description_Under 25,000', 0.0019378435432102026),
 ('offense_category_Fraud Offenses', 0.00028568421501627895),
 ('age_num', 0.00027914630668387575),
 ('offense_category_Burglary/Breaking & Entering', 0.00010109489291724609),
 ('offense_category_Assault Offenses', 8.794965718283851e-05),
 ('offense_category_Kidnapping/Abduction', 8.370520211592229e-05),
 ('offense_category_Destruction/Damage/Vandalism of Property',
  7.046665455738392e-05),
 ('offense_category_Larceny/Theft Offenses', 4.8902231627481

In [51]:
proba_preds[3]

array([8.07011965e-03, 1.66251039e-05, 0.00000000e+00, 1.10929575e-03,
       1.98678121e-01, 8.85964297e-03, 1.71323029e-05, 1.87488912e-01,
       9.06792120e-03, 8.20384935e-03, 0.00000000e+00, 3.12966856e-02,
       0.00000000e+00, 3.22062372e-04, 7.86303307e-02, 1.83771341e-03,
       1.01763098e-04, 4.92775969e-02, 1.24758319e-02, 1.26732775e-02,
       3.23479016e-03, 1.08951253e-02, 1.14549974e-02, 4.98526413e-03,
       2.83390062e-02, 4.27359844e-04, 3.40505130e-02, 6.62455349e-02,
       1.04547463e-02, 3.07117564e-02, 2.44981156e-02, 0.00000000e+00,
       2.77070541e-02, 5.37960224e-02, 1.55475749e-03, 0.00000000e+00,
       0.00000000e+00, 5.50164564e-03, 0.00000000e+00, 3.69817572e-02,
       9.85283792e-03, 3.11818354e-02])

In [52]:
preds[3]

'BURNET'

In [53]:
y_test[3]

'BURNET'

In [54]:
print(preds[0])
print(proba_preds[0])
print(clf.classes_)

BURNET
[8.07011965e-03 1.66251039e-05 0.00000000e+00 1.18079853e-03
 2.04985221e-01 6.67652756e-03 1.71323029e-05 1.25961256e-01
 9.06792120e-03 1.45804808e-02 1.69670603e-03 5.10687218e-02
 0.00000000e+00 4.43154234e-04 7.73467592e-02 3.43187127e-03
 1.11085659e-04 4.09236182e-02 1.17884347e-02 1.26732775e-02
 9.75221208e-03 1.26961214e-02 1.34647438e-02 4.98526413e-03
 3.50758483e-02 2.22716847e-03 3.41029301e-02 2.98401775e-02
 9.45436643e-03 3.92810225e-02 3.37720764e-02 0.00000000e+00
 3.47639527e-02 6.00968682e-02 1.55475749e-03 0.00000000e+00
 0.00000000e+00 1.12071195e-02 1.35770982e-02 2.73400434e-02
 1.36955641e-02 4.30729551e-02]
['BELL' 'BEXAR' 'BRAZORIA' 'BREWSTER' 'BURNET' 'CALHOUN' 'CAMERON'
 'CHEROKEE' 'COLLIN' 'DALLAS' 'DENTON' 'GRAYSON' 'GREGG' 'HARDEMAN'
 'HARDIN' 'HARRIS' 'HIDALGO' 'HUNT' 'JACKSON' 'JOHNSON' 'KAUFMAN' 'LAVACA'
 'LLANO' 'LUBBOCK' 'MCLENNAN' 'MILAM' 'MONTGOMERY' 'NACOGDOCHES' 'NOLAN'
 'PARKER' 'POLK' 'POTTER' 'ROCKWALL' 'RUSK' 'SMITH' 'TARRANT' 'TOM G

In [55]:
results_df = pd.DataFrame(proba_preds, columns = clf.classes_)

In [56]:
results_df.head()

Unnamed: 0,BELL,BEXAR,BRAZORIA,BREWSTER,BURNET,CALHOUN,CAMERON,CHEROKEE,COLLIN,DALLAS,...,ROCKWALL,RUSK,SMITH,TARRANT,TOM GREEN,TRAVIS,VICTORIA,WHARTON,WICHITA,WILLIAMSON
0,0.00807,1.7e-05,0.0,0.001181,0.204985,0.006677,1.7e-05,0.125961,0.009068,0.01458,...,0.034764,0.060097,0.001555,0.0,0.0,0.011207,0.013577,0.02734,0.013696,0.043073
1,0.017145,1.7e-05,0.011051,0.001573,0.1661,0.00228,1.4e-05,0.143329,0.009068,0.008204,...,0.031065,0.039556,0.003211,0.0,0.0,0.006897,0.013577,0.04034,0.012999,0.039532
2,0.00807,0.0,0.0,0.001229,0.205523,0.008264,1.5e-05,0.12529,0.009068,0.006377,...,0.034764,0.043858,0.002148,0.0,0.0,0.01085,0.013577,0.022654,0.012602,0.043796
3,0.00807,1.7e-05,0.0,0.001109,0.198678,0.00886,1.7e-05,0.187489,0.009068,0.008204,...,0.027707,0.053796,0.001555,0.0,0.0,0.005502,0.0,0.036982,0.009853,0.031182
4,0.00807,0.0,0.0,0.000986,0.182853,0.012019,1.1e-05,0.159833,0.00977,0.013135,...,0.036286,0.037557,0.000923,0.0,0.0,0.00737,0.013577,0.03549,0.01062,0.021749


In [57]:
results_df['Predictions'] = preds

In [58]:
results_df.head().T

Unnamed: 0,0,1,2,3,4
BELL,0.00807012,0.0171446,0.00807012,0.00807012,0.00807012
BEXAR,1.66251e-05,1.66251e-05,0,1.66251e-05,0
BRAZORIA,0,0.0110514,0,0,0
BREWSTER,0.0011808,0.0015731,0.00122871,0.0011093,0.000985543
BURNET,0.204985,0.1661,0.205523,0.198678,0.182853
CALHOUN,0.00667653,0.00227969,0.00826365,0.00885964,0.0120189
CAMERON,1.71323e-05,1.42363e-05,1.45746e-05,1.71323e-05,1.05626e-05
CHEROKEE,0.125961,0.143329,0.12529,0.187489,0.159833
COLLIN,0.00906792,0.00906792,0.00906792,0.00906792,0.00977018
DALLAS,0.0145805,0.00820385,0.00637663,0.00820385,0.0131351


In [59]:
results_df['Actual'] = y_test

In [60]:
results_df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
BELL,0.00807012,0.0171446,0.00807012,0.00807012,0.00807012,0.0185157,0.0185157,0.0171446,0.00807012,0.00807012
BEXAR,1.66251e-05,1.66251e-05,0,1.66251e-05,0,0,1.66251e-05,0,0.000132504,1.66251e-05
BRAZORIA,0,0.0110514,0,0,0,0.0110514,0.0110514,0.0110514,0,0
BREWSTER,0.0011808,0.0015731,0.00122871,0.0011093,0.000985543,0.00173221,0.00174812,0.00155719,0.00126841,0.0011808
BURNET,0.204985,0.1661,0.205523,0.198678,0.182853,0.185262,0.183188,0.168924,0.225463,0.204985
CALHOUN,0.00667653,0.00227969,0.00826365,0.00885964,0.0120189,0.00227969,0.00227969,0.00227969,0.00885964,0.00667653
CAMERON,1.71323e-05,1.42363e-05,1.45746e-05,1.71323e-05,1.05626e-05,1.16786e-05,1.42363e-05,1.16786e-05,1.45746e-05,1.71323e-05
CHEROKEE,0.125961,0.143329,0.12529,0.187489,0.159833,0.12167,0.12313,0.141869,0.153931,0.125961
COLLIN,0.00906792,0.00906792,0.00906792,0.00906792,0.00977018,0.00906792,0.00906792,0.00906792,0.00906792,0.00906792
DALLAS,0.0145805,0.00820385,0.00637663,0.00820385,0.0131351,0,0.00820385,0,0.00594258,0.0145805


In [61]:
results_df.to_csv('Intelligent_Model_Results.csv')

In [69]:
test = results_df.iloc[[0]]

In [74]:
test = test.pop('Predictions')
test = test.pop('Actual')

KeyError: 'Actual'

In [75]:
test = test.T.sort_values(test.index[-1], ascending=False).T