# User Inputs

Find the best way to provide the ratio features to predict on based on the user's other selections

In [1]:
import pandas as pd

### Read in query outputs to dataframes

In [2]:
crime_df = pd.read_csv('crime_query.csv')
h_df = pd.read_csv('hosp_query.csv')
f_df = pd.read_csv('fire_query.csv')
pop_df = pd.read_csv('census_population.csv')

### Get crime count by county and join to crime_df

In [3]:
tmp_df4 = crime_df.groupby(['county']).incident_id.agg('count')
tmp_df4 = tmp_df4.to_frame().reset_index()
tmp_df4 = tmp_df4.rename({'incident_id': 'crime_cnt'}, axis='columns')
tmp_df4

Unnamed: 0,county,crime_cnt
0,BELL,10240
1,BEXAR,35
2,BRAZORIA,9487
3,BREWSTER,61
4,BURNET,1303
5,CALHOUN,1634
6,CAMERON,3
7,CHEROKEE,396
8,COKE,2
9,COLLIN,55442


In [4]:
crime_df = crime_df.join(tmp_df4.set_index('county'), on='county')

### Get hospital and bed counts by county and join to h_df

In [5]:
h_df.head()

Unnamed: 0,county,hosp_id,beds
0,HARRIS,25577030,1082.0
1,JONES,3379501,45.0
2,COLLIN,475013,73.0
3,POTTER,679106,451.0
4,BRAZORIA,1177515,64.0


In [6]:
h_df['beds'] = h_df['beds'].clip_lower(1)

In [7]:
tmp_df = h_df.groupby(['county']).hosp_id.agg('count')
tmp_df = tmp_df.to_frame().reset_index()
tmp_df = tmp_df.rename({'hosp_id': 'hosp_cnt'}, axis='columns')

In [8]:
tmp_df2 = h_df.groupby(['county']).beds.agg('sum')
tmp_df2 = tmp_df2.to_frame().reset_index()
tmp_df2 = tmp_df2.rename({'beds': 'bed_cnt'}, axis='columns')

In [9]:
h_df = tmp_df.join(tmp_df2.set_index('county'), on='county')

### Get fire station count by county and join to f_df

In [10]:
f_df.head()

Unnamed: 0,county,fire_id
0,EL PASO,10410064
1,EL PASO,10410225
2,EL PASO,10139838
3,EL PASO,10139423
4,EL PASO,10410065


In [11]:
tmp_df3 = f_df.groupby(['county']).fire_id.agg('count')
tmp_df3 = tmp_df3.to_frame().reset_index()
tmp_df3 = tmp_df3.rename({'fire_id': 'fire_cnt'}, axis='columns')

### Merge hospital dataframe and fire station dataframe

In [12]:
hf_df = tmp_df3.join(h_df.set_index('county'), on='county')

### Join to training dataframe

In [13]:
train_df = crime_df.join(hf_df.set_index('county'), on='county')

In [14]:
train_df.head(3).T

Unnamed: 0,0,1,2
incident_id,67693343,67693344,67693345
year,2013,2013,2013
age_num,46,57,51
victim_sex,F,M,F
crime_against,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses
location_id,20,18,18
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage
county,POTTER,POTTER,POTTER


In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463095 entries, 0 to 463094
Data columns (total 16 columns):
incident_id         463095 non-null int64
year                463095 non-null int64
age_num             456081 non-null float64
victim_sex          463095 non-null object
crime_against       463095 non-null object
offense             463095 non-null object
offense_category    463095 non-null object
location_id         463095 non-null int64
location_name       463095 non-null object
county              463095 non-null object
officers            463095 non-null int64
civilians           463095 non-null int64
crime_cnt           463095 non-null int64
fire_cnt            463095 non-null int64
hosp_cnt            461467 non-null float64
bed_cnt             461467 non-null float64
dtypes: float64(3), int64(7), object(6)
memory usage: 56.5+ MB


### Bring in population feature

In [16]:
pop_df['population_description'] = pd.cut(pop_df['population'],
                                          [0, 10000, 50000, 100000, 500000, 1000000, 10000000],
                                          labels=['Under 10,000', '10,000 - 49,999', '50,000 - 99,999', 
                                                  '100,000 - 499,999', '500,000 - 1,000,000', 'Over 1 million'])


In [17]:
pop_df.head(10)

Unnamed: 0,county,year,population,population_description
0,ANDERSON,2010,58477,"50,000 - 99,999"
1,ANDERSON,2011,58379,"50,000 - 99,999"
2,ANDERSON,2012,58036,"50,000 - 99,999"
3,ANDERSON,2013,57960,"50,000 - 99,999"
4,ANDERSON,2014,57837,"50,000 - 99,999"
5,ANDERSON,2015,57641,"50,000 - 99,999"
6,ANDERSON,2016,57558,"50,000 - 99,999"
7,ANDERSON,2017,57741,"50,000 - 99,999"
8,ANDREWS,2010,14817,"10,000 - 49,999"
9,ANDREWS,2011,15386,"10,000 - 49,999"


In [18]:
pop_df['population_description'].value_counts()

10,000 - 49,999        820
Under 10,000           677
100,000 - 499,999      225
50,000 - 99,999        191
500,000 - 1,000,000     47
Over 1 million          40
Name: population_description, dtype: int64

In [19]:
train_df =pd.merge(train_df, pop_df, how = 'outer', on=['county','year'])
train_df.head()

Unnamed: 0,incident_id,year,age_num,victim_sex,crime_against,offense,offense_category,location_id,location_name,county,officers,civilians,crime_cnt,fire_cnt,hosp_cnt,bed_cnt,population,population_description
0,67693343.0,2013,46.0,F,Property,All Other Larceny,Larceny/Theft Offenses,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
1,67693344.0,2013,57.0,M,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
2,67693345.0,2013,51.0,F,Property,Theft From Motor Vehicle,Larceny/Theft Offenses,18.0,Parking/Drop Lot/Garage,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
3,67693346.0,2013,24.0,M,Property,All Other Larceny,Larceny/Theft Offenses,5.0,Commercial/Office Building,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"
4,67693348.0,2013,49.0,M,Property,Burglary/Breaking & Entering,Burglary/Breaking & Entering,20.0,Residence/Home,POTTER,343.0,68.0,47721.0,12.0,5.0,878.0,122088.0,"100,000 - 499,999"


### Drop rows without an incident_id

In [None]:
train_df.dropna(axis=0, subset=['incident_id'], inplace=True)
train_df.head()

In [None]:
train_df.info()

### Duplicate feature engineering for model training except dummyization

In [None]:
# Drop NA in victim age column (<2%)
train_df.dropna(axis=0, subset=['age_num'], inplace=True)

# Drop NA in hospital_cnt column (<2%)
train_df.dropna(axis=0, subset=['hosp_cnt'], inplace=True)

# Drop NA in population column (<2%)
train_df.dropna(axis=0, subset=['population'], inplace=True)

# Drop offense_category_Gambling Offenses
train_df = train_df[train_df.offense_category != 'Gambling Offenses']

# Create Ratio Columns
train_df['crime_pop_ratio'] = train_df['crime_cnt'] / train_df['population']
train_df['beds_pop_ratio'] = train_df['bed_cnt'] / train_df['population']
train_df['beds_crime_ratio'] = train_df['bed_cnt'] / train_df['crime_cnt']
train_df['fire_pop_ratio'] = train_df['fire_cnt'] / train_df['population']
train_df['fire_crime_ratio'] = train_df['fire_cnt'] / train_df['crime_cnt']

# Reduce df to only desired features to train/test model
train_df = train_df[['age_num', 'victim_sex', 'offense_category', 'population_description',
                     'officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
                     'fire_pop_ratio', 'fire_crime_ratio', 'county']]

In [None]:
train_df['population_description'].value_counts()

In [None]:
train_df['crime_pop_ratio'].describe()

In [None]:
officers_avg = train_df.groupby(['population_description']).officers.agg('mean')
officers_avg

In [None]:
civilians_avg = train_df.groupby(['population_description']).civilians.agg('mean')
civilians_avg

In [None]:
rural = train_df[train_df.population_description == 'Under 10,000']
suburban = train_df[train_df.population_description == '10,000 - 24,999']
urban = train_df[train_df.population_description == '25,000 - 99,999']
metro = train_df[train_df.population_description == 'Over 100,000']

In [None]:
rural_cp_rat = rural['crime_pop_ratio'].quantile(.2)
rural_bp_rat = rural['beds_pop_ratio'].quantile(.8)
rural_bc_rat = rural['beds_crime_ratio'].quantile(.8)
rural_fp_rat = rural['fire_pop_ratio'].quantile(.8)
rural_fc_rat = rural['fire_crime_ratio'].quantile(.8)

In [None]:
print(rural_cp_rat, rural_bp_rat, rural_bc_rat, rural_fp_rat, rural_fc_rat)

In [None]:
suburban_cp_rat = suburban['crime_pop_ratio'].quantile(.2)
suburban_bp_rat = suburban['beds_pop_ratio'].quantile(.8)
suburban_bc_rat = suburban['beds_crime_ratio'].quantile(.8)
suburban_fp_rat = suburban['fire_pop_ratio'].quantile(.8)
suburban_fc_rat = suburban['fire_crime_ratio'].quantile(.8)

In [None]:
print(suburban_cp_rat, suburban_bp_rat, suburban_bc_rat, suburban_fp_rat, suburban_fc_rat)

In [None]:
urban_cp_rat = urban['crime_pop_ratio'].quantile(.2)
urban_bp_rat = urban['beds_pop_ratio'].quantile(.8)
urban_bc_rat = urban['beds_crime_ratio'].quantile(.8)
urban_fp_rat = urban['fire_pop_ratio'].quantile(.8)
urban_fc_rat = urban['fire_crime_ratio'].quantile(.8)

In [None]:
print(urban_cp_rat, urban_bp_rat, urban_bc_rat, urban_fp_rat, urban_fc_rat)

In [None]:
metro_cp_rat = metro['crime_pop_ratio'].quantile(.2)
metro_bp_rat = metro['beds_pop_ratio'].quantile(.8)
metro_bc_rat = metro['beds_crime_ratio'].quantile(.8)
metro_fp_rat = metro['fire_pop_ratio'].quantile(.8)
metro_fc_rat = metro['fire_crime_ratio'].quantile(.8)

In [None]:
print(metro_cp_rat, metro_bp_rat, metro_bc_rat, metro_fp_rat, metro_fc_rat)

In [None]:
import numpy as np

user_age = 30
user_sex = 'F'
user_offense = 'Arson'
user_population = 'Under 10,000'

rural_dict = {'beds_crime': 1.0465116279069768, 'beds_pop': 0.011503067484662576, 'crime_pop': 0.003339903635567236,
              'fire_crime': 0.04918032786885246, 'fire_pop': 0.0005112474437627812, 'officers': 11.158416,
              'civilians': 2.217822}

suburban_dict = {'beds_crime': 0.7780734560797852, 'beds_pop': 0.005645964795748921, 'crime_pop': 0.0288608435735636,
                 'fire_crime': 0.17011891062523973, 'fire_pop': 0.0004335933966854009, 'officers': 16.167896,
                 'civilians': 7.837446}

urban_dict = {'beds_crime': 0.36263914090106647, 'beds_pop': 0.007158590625022616, 'crime_pop': 0.023604314269714897,
              'fire_crime': 0.026509040747660376, 'fire_pop': 0.00031802378474501665, 'officers': 69.365510,
              'civilians': 36.503884}

metro_dict = {'beds_crime': 208.19382486575793, 'beds_pop': 0.00722454368291119, 'crime_pop': 0.01528558395906832,
              'fire_crime': 3.3349446614583336, 'fire_pop': 0.0001755805639960661, 'officers': 207.098791,
              'civilians': 59.745162}

predict_dict = {'age_num': 0, 'officers': 1, 'civilians': 2, 'crime_pop_ratio': 3, 'beds_pop_ratio': 4,
    'beds_crime_ratio': 5, 'fire_pop_ratio': 6, 'fire_crime_ratio': 7, 'victim_sex_F': 8, 'victim_sex_M': 9,
    'victim_sex_U': 10, 'Arson': 11, 'Assault Offenses': 12, 'Bribery': 13, 'Burglary/Breaking & Entering': 14,
    'Counterfeiting/Forgery': 15, 'Destruction/Damage/Vandalism of Property': 16, 'Drug/Narcotic Offenses': 17,
    'Embezzlement': 18, 'Extortion/Blackmail': 19, 'Fraud Offenses': 20, 'Homicide Offenses': 21,
    'Human Trafficking': 22, 'Kidnapping/Abduction': 23, 'Larceny/Theft Offenses':24, 'Motor Vehicle Theft': 25,
    'Pornography/Obscene Material': 26, 'Prostitution Offenses': 27, 'Robbery': 28, 'Sex Offenses': 29,
    'Stolen Property Offenses': 30, 'Weapon Law Violations': 31, 'Under 10,000': 32, '10,000 - 24,999': 33,
    '25,000 - 99,999': 34, 'Over 100,000': 35}

user_test = np.zeros(36)

In [None]:
## Create user specific test array
user_test[0] = user_age
if user_sex == 'F':
    user_test[8] = 1
else:
    user_test[9] = 1
    
user_test[predict_dict[user_offense]] = 1

if user_population == 'Under 10,000':
    user_test[1] = rural_dict['officers']
    user_test[2] = rural_dict['civilians']
    user_test[3] = rural_dict['crime_pop']
    user_test[4] = rural_dict['beds_pop']
    user_test[5] = rural_dict['beds_crime']
    user_test[6] = rural_dict['fire_pop']
    user_test[7] = rural_dict['fire_crime']
    user_test[32] = 1
elif user_population == '10,000 - 24,999':
    user_test[1] = suburban_dict['officers']
    user_test[2] = suburban_dict['civilians']
    user_test[3] = suburban_dict['crime_pop']
    user_test[4] = suburban_dict['beds_pop']
    user_test[5] = suburban_dict['beds_crime']
    user_test[6] = suburban_dict['fire_pop']
    user_test[7] = suburban_dict['fire_crime']
    user_test[33] = 1
elif user_population == '25,000 - 99,999':
    user_test[1] = urban_dict['officers']
    user_test[2] = urban_dict['civilians']
    user_test[3] = urban_dict['crime_pop']
    user_test[4] = urban_dict['beds_pop']
    user_test[5] = urban_dict['beds_crime']
    user_test[6] = urban_dict['fire_pop']
    user_test[7] = urban_dict['fire_crime']
    user_test[34] = 1
else:
    user_test[1] = metro_dict['officers']
    user_test[2] = metro_dict['civilians']
    user_test[3] = metro_dict['crime_pop']
    user_test[4] = metro_dict['beds_pop']
    user_test[5] = metro_dict['beds_crime']
    user_test[6] = metro_dict['fire_pop']
    user_test[7] = metro_dict['fire_crime']
    user_test[35] = 1


In [None]:
user_test

In [None]:
metro['fire_pop_ratio'].value_counts()

In [None]:
153415