# Exploration of Crime data

### Import Dependencies

In [1]:
# Import dependencies
import os 
import pandas as pd
import matplotlib as plt
import regex as re


In [2]:
# Import CSV file
df = pd.read_csv('Resources/Crime_Index_Greater_Houston_Area_2015_2020.csv')
df.head()

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
0,BELLVILLE PD,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,SEALY ISD PD,0,0,0,1,1,5,0,7,0,2015,Austin County
2,AUSTIN CO SO,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,WALLIS PD,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,SEALY PD,0,2,0,33,52,162,5,254,6336,2015,Austin County


### Data Types

In [3]:
# Data types
df.dtypes

AgencyName    object
Murder         int64
Rape           int64
Robbery        int64
Assault        int64
Burglary       int64
Larceny        int64
Auto Theft     int64
Total          int64
Population     int64
Year           int64
County        object
dtype: object

In [4]:
# Assigning 'Year' column as 'object' datatype tp prevent .sum() adding years together
df = df.astype({"Year":'object'})
df.dtypes

AgencyName    object
Murder         int64
Rape           int64
Robbery        int64
Assault        int64
Burglary       int64
Larceny        int64
Auto Theft     int64
Total          int64
Population     int64
Year          object
County        object
dtype: object

### Grouping data by year and making year based DataFrames

In [5]:
# .groupby Year
year_df = df.groupby('Year')
year_df.head(10)

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
0,BELLVILLE PD,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,SEALY ISD PD,0,0,0,1,1,5,0,7,0,2015,Austin County
2,AUSTIN CO SO,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,WALLIS PD,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,SEALY PD,0,2,0,33,52,162,5,254,6336,2015,Austin County
5,DANBURY PD (NR),0,1,0,0,0,2,0,3,1767,2015,Brazoria County
6,ALVIN COMM COLLEGE PD,0,0,0,0,0,13,1,14,0,2015,Brazoria County
7,SURFSIDE BEACH PD,0,0,0,2,4,11,4,21,544,2015,Brazoria County
8,ANGLETON ISD PD,0,1,0,1,0,16,0,18,0,2015,Brazoria County
9,SWEENY PD,0,0,1,7,23,53,1,85,3780,2015,Brazoria County


In [6]:
# 2015 df
df_2015 = df.groupby('Year').get_group(2015)
df_2015.head()

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
0,BELLVILLE PD,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,SEALY ISD PD,0,0,0,1,1,5,0,7,0,2015,Austin County
2,AUSTIN CO SO,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,WALLIS PD,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,SEALY PD,0,2,0,33,52,162,5,254,6336,2015,Austin County


In [7]:
# 2016 df
df_2016 = df.groupby('Year').get_group(2016)
df_2016.head()

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
143,BELLVILLE PD,0,3,2,4,10,59,2,80,4296,2016,Austin County
144,SEALY ISD PD,0,0,0,0,0,4,0,4,0,2016,Austin County
145,AUSTIN CO SO,0,3,3,20,52,65,22,165,17650,2016,Austin County
146,WALLIS PD,0,0,0,4,6,4,1,15,1303,2016,Austin County
147,SEALY PD,0,0,6,15,68,107,6,202,6469,2016,Austin County


In [8]:
# 2017 df
df_2017 = df.groupby('Year').get_group(2017)
df_2017.head()

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
290,BELLVILLE PD,0,8,0,5,16,43,0,72,4304,2017,Austin County
291,SEALY ISD PD,0,0,0,0,1,3,1,5,0,2017,Austin County
292,AUSTIN CO SO,0,3,1,9,45,64,29,151,17798,2017,Austin County
293,WALLIS PD,0,0,0,2,1,4,0,7,1306,2017,Austin County
294,SEALY PD,0,0,4,18,35,69,15,141,6555,2017,Austin County


In [9]:
# 2018 df
df_2018 = df.groupby('Year').get_group(2018)
df_2018.head()

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
436,BELLVILLE PD,0,3,0,10,5,43,5,66,4257,2018,Austin County
437,SEALY ISD PD,0,0,0,0,0,0,0,0,0,2018,Austin County
438,AUSTIN CO SO,0,6,0,8,48,36,16,114,17807,2018,Austin County
439,WALLIS PD,0,0,0,5,1,4,0,10,1304,2018,Austin County
440,SEALY PD,0,2,3,9,28,55,8,105,6544,2018,Austin County


In [10]:
# 2019 df
df_2019 = df.groupby('Year').get_group(2019)
df_2019.head()

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
582,BELLVILLE PD,0,3,1,8,2,38,0,52,4288,2019,Austin County
583,SEALY ISD PD,0,0,0,0,0,1,0,1,0,2019,Austin County
584,AUSTIN CO SO,0,2,0,11,40,29,18,100,17806,2019,Austin County
585,SEALY PD,0,0,2,9,36,73,11,131,6593,2019,Austin County
586,WALLIS PD,0,0,0,10,2,6,0,18,1322,2019,Austin County


In [11]:
# 2020 df
df_2020 = df.groupby('Year').get_group(2020)
df_2020.head()

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
730,BELLVILLE PD,1,0,1,15,12,30,5,64,4233,2020,Austin County
731,SEALY ISD PD,0,0,0,0,1,0,0,1,0,2020,Austin County
732,AUSTIN CO SO,1,3,1,9,18,44,17,93,18095,2020,Austin County
733,WALLIS PD,1,2,0,4,7,4,3,21,1309,2020,Austin County
734,SEALY PD,0,1,1,5,34,55,6,102,6484,2020,Austin County


### Finding how many agencies reported data per year

In [12]:
#Count the number of agencies reporting per county per year
agencies_count_per_year = df.groupby(["County", "Year"]).count()
agencies_count_per_year.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Austin County,2015,5,5,5,5,5,5,5,5,5,5
Austin County,2016,5,5,5,5,5,5,5,5,5,5
Austin County,2017,5,5,5,5,5,5,5,5,5,5
Austin County,2018,5,5,5,5,5,5,5,5,5,5
Austin County,2019,5,5,5,5,5,5,5,5,5,5


In [13]:
#Create new df to hold count
agencies_count_per_year_df = pd.DataFrame(agencies_count_per_year).reset_index()
agencies_count_per_year_df.head()

Unnamed: 0,County,Year,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population
0,Austin County,2015,5,5,5,5,5,5,5,5,5,5
1,Austin County,2016,5,5,5,5,5,5,5,5,5,5
2,Austin County,2017,5,5,5,5,5,5,5,5,5,5
3,Austin County,2018,5,5,5,5,5,5,5,5,5,5
4,Austin County,2019,5,5,5,5,5,5,5,5,5,5


In [14]:
# Column names
agencies_count_per_year_df.columns

Index(['County', 'Year', 'AgencyName', 'Murder', 'Rape', 'Robbery', 'Assault',
       'Burglary', 'Larceny', 'Auto Theft', 'Total', 'Population'],
      dtype='object')

In [15]:
# Rename AgencyName coulumn to Agency count
agencies_count_per_year_df.rename(columns = {"AgencyName":"AgencyCount"}, inplace=True)
agencies_count_per_year_df.head()

Unnamed: 0,County,Year,AgencyCount,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population
0,Austin County,2015,5,5,5,5,5,5,5,5,5,5
1,Austin County,2016,5,5,5,5,5,5,5,5,5,5
2,Austin County,2017,5,5,5,5,5,5,5,5,5,5
3,Austin County,2018,5,5,5,5,5,5,5,5,5,5
4,Austin County,2019,5,5,5,5,5,5,5,5,5,5


In [16]:
# Drop unnecessary columns
agencies_count_per_year_df = agencies_count_per_year_df.drop(columns=['Murder', 'Rape', 'Robbery', 'Assault',
       'Burglary', 'Larceny', 'Auto Theft', 'Total', 'Population'])
agencies_count_per_year_df.head()

Unnamed: 0,County,Year,AgencyCount
0,Austin County,2015,5
1,Austin County,2016,5
2,Austin County,2017,5
3,Austin County,2018,5
4,Austin County,2019,5


### Crime Data by County

In [17]:
# groupby Counties
counties = df.groupby('County')
counties.head()

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
0,BELLVILLE PD,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,SEALY ISD PD,0,0,0,1,1,5,0,7,0,2015,Austin County
2,AUSTIN CO SO,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,WALLIS PD,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,SEALY PD,0,2,0,33,52,162,5,254,6336,2015,Austin County
...,...,...,...,...,...,...,...,...,...,...,...,...
422,SAN JACINTO CO SO,0,27,3,26,195,198,73,522,27895,2017,San Jacinto County
423,HUNTSVILLE PD,1,24,24,113,102,490,86,840,41634,2017,Walker County
431,BRENHAM PD,1,7,3,115,61,200,23,410,17187,2017,Washington County
568,SAN JACINTO CO SO,0,16,2,24,144,168,48,402,28457,2018,San Jacinto County


In [18]:
# Dropping AgencyName Column so only counties are listed
counties_df = df.drop(columns = ['AgencyName'])
counties_df

Unnamed: 0,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
0,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,0,0,0,1,1,5,0,7,0,2015,Austin County
2,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,0,2,0,33,52,162,5,254,6336,2015,Austin County
...,...,...,...,...,...,...,...,...,...,...,...
874,1,8,3,31,26,65,10,144,17904,2020,Washington County
875,0,1,3,54,77,174,36,345,21310,2020,Wharton County
876,0,1,5,18,38,109,13,184,8612,2020,Wharton County
877,0,0,0,0,0,0,0,0,0,2020,Wharton County


In [19]:
# Reorder columns
new_column_order = ["County","Murder","Rape","Assault","Burglary","Larceny","Auto Theft","Total","Population","Year"]
counties_df = counties_df[new_column_order]
counties_df

Unnamed: 0,County,Murder,Rape,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year
0,Austin County,0,3,6,17,45,0,73,4235,2015
1,Austin County,0,0,1,1,5,0,7,0,2015
2,Austin County,0,3,13,48,61,8,135,17499,2015
3,Austin County,0,0,0,3,10,0,13,1284,2015
4,Austin County,0,2,33,52,162,5,254,6336,2015
...,...,...,...,...,...,...,...,...,...,...
874,Washington County,1,8,31,26,65,10,144,17904,2020
875,Wharton County,0,1,54,77,174,36,345,21310,2020
876,Wharton County,0,1,18,38,109,13,184,8612,2020
877,Wharton County,0,0,0,0,0,0,0,0,2020


In [20]:
county_summary = counties_df.groupby(["County", "Year"]).sum()
county_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Murder,Rape,Assault,Burglary,Larceny,Auto Theft,Total,Population
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austin County,2015,0,8,53,121,283,13,482,29354
Austin County,2016,0,6,43,136,239,31,466,29718
Austin County,2017,0,11,34,98,183,45,376,29963
Austin County,2018,0,11,32,82,138,29,295,29912
Austin County,2019,0,5,38,80,147,29,302,30009
...,...,...,...,...,...,...,...,...,...
Wharton County,2016,1,23,132,217,815,63,1268,41398
Wharton County,2017,2,16,127,210,646,62,1086,41762
Wharton County,2018,3,17,111,163,527,39,868,41950
Wharton County,2019,1,18,103,177,559,61,944,41400


In [21]:
county_summary_df = pd.DataFrame(county_summary).reset_index()
county_summary_df.head(6)

Unnamed: 0,County,Year,Murder,Rape,Assault,Burglary,Larceny,Auto Theft,Total,Population
0,Austin County,2015,0,8,53,121,283,13,482,29354
1,Austin County,2016,0,6,43,136,239,31,466,29718
2,Austin County,2017,0,11,34,98,183,45,376,29963
3,Austin County,2018,0,11,32,82,138,29,295,29912
4,Austin County,2019,0,5,38,80,147,29,302,30009
5,Austin County,2020,3,6,33,72,133,31,281,30121


In [22]:
county_summary_df.dtypes

County        object
Year           int64
Murder         int64
Rape           int64
Assault        int64
Burglary       int64
Larceny        int64
Auto Theft     int64
Total          int64
Population     int64
dtype: object

### Export to CSV file

In [23]:
# Export County Summary to CSV
county_summary_df.to_csv('Resources/county_summaries_by_year.csv', index=False)

In [24]:
# Export Agencies Summay to CSV
agencies_count_per_year_df.to_csv('Resources/reporting_agencies_per_year.csv', index=False)