In [1]:
import pandas as pd

In [2]:
# Collect metadata
train_df = pd.read_csv('crime_query.csv')
h_df = pd.read_csv('hosp_query.csv')
f_df = pd.read_csv('fire_query.csv')
pop_df = pd.read_csv('census_population.csv')

## Add metadata to train_df

In [3]:
# Crime Count by county
tmp_df = train_df.groupby(['county']).incident_id.agg('count')
tmp_df = tmp_df.to_frame().reset_index()
tmp_df = tmp_df.rename({'incident_id': 'crime_cnt'}, axis='columns')
train_df = train_df.join(tmp_df.set_index('county'), on='county')

In [4]:
train_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


In [5]:
# Hospital Count by county
tmp_df2 = h_df.groupby(['county']).hosp_id.agg('count')
tmp_df2 = tmp_df2.to_frame().reset_index()
tmp_df2 = tmp_df2.rename({'hosp_id': 'hosp_cnt'}, axis='columns')

In [6]:
tmp_df2.head()

Unnamed: 0,county,hosp_cnt
0,ANDERSON,1
1,ANDREWS,1
2,ANGELINA,2
3,ATASCOSA,1
4,AUSTIN,1


In [7]:
# Total Bed Count by county
h_df['beds'] = h_df['beds'].clip_lower(1)
tmp_df3 = h_df.groupby(['county']).beds.agg('sum')
tmp_df3 = tmp_df3.to_frame().reset_index()
tmp_df3 = tmp_df3.rename({'beds': 'bed_cnt'}, axis='columns')
h_df = tmp_df2.join(tmp_df3.set_index('county'), on='county')

In [8]:
h_df.head()

Unnamed: 0,county,hosp_cnt,bed_cnt
0,ANDERSON,1,86.0
1,ANDREWS,1,34.0
2,ANGELINA,2,420.0
3,ATASCOSA,1,67.0
4,AUSTIN,1,32.0


In [9]:
# Fire Station Count by county
tmp_df4 = f_df.groupby(['county']).fire_id.agg('count')
tmp_df4 = tmp_df4.to_frame().reset_index()
tmp_df4 = tmp_df4.rename({'fire_id': 'fire_cnt'}, axis='columns')
hf_df = tmp_df4.join(h_df.set_index('county'), on='county')

In [10]:
hf_df.head()

Unnamed: 0,county,fire_cnt,hosp_cnt,bed_cnt
0,ANDERSON,21,1.0,86.0
1,ANDREWS,1,1.0,34.0
2,ANGELINA,18,2.0,420.0
3,ARANSAS,5,,
4,ARCHER,7,,


In [11]:
# Join to train_df
train_df = train_df.join(hf_df.set_index('county'), on='county')

In [12]:
train_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


In [13]:
# Bring in census populations
pop_df['county'] = pop_df['county'].str.upper()
train_df = pd.merge(train_df, pop_df, how='outer', on=['county', 'year'])
train_df['population_description'] = pd.cut(train_df['population'], [0, 25000, 100000, 500000, 10000000],
                                            labels=['Under 25,000', '25,000 - 99,999', '100,000 - 499,999',
                                                    'Over 500,000'])

In [14]:
train_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,6.76933e+07,6.76933e+07,6.76933e+07,6.76933e+07,6.76933e+07
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 464978 entries, 0 to 464977
Data columns (total 18 columns):
incident_id               463095 non-null float64
year                      464978 non-null int64
age_num                   456081 non-null float64
victim_sex                463095 non-null object
crime_against             463095 non-null object
offense                   463095 non-null object
offense_category          463095 non-null object
location_id               463095 non-null float64
location_name             463095 non-null object
county                    464978 non-null object
officers                  463095 non-null float64
civilians                 463095 non-null float64
crime_cnt                 463095 non-null float64
fire_cnt                  463095 non-null float64
hosp_cnt                  461467 non-null float64
bed_cnt                   461467 non-null float64
population                464190 non-null float64
population_description    464190 non-null categ

In [16]:
train_df['population_description'].value_counts()

Over 500,000         262781
100,000 - 499,999    155742
25,000 - 99,999       39421
Under 25,000           6246
Name: population_description, dtype: int64