# Analysis of Crime data in the Greater Houston Area

### Import Dependencies

In [1]:
# Import dependencies
import os 
import pandas as pd
import matplotlib as plt
import regex as re
import numpy as np
from sqlalchemy import create_engine

In [2]:
# SQL dependencies
from config import db_password
db_string = f"postgresql://bhmcd:{db_password}@crime-analysis.cnoedyl0m22c.us-east-2.rds.amazonaws.com:5432/Crime_AnalysisDB"
engine = create_engine(db_string)

In [3]:
# Import CSV file
# Using the Crime_Index_Greater_Houston_Area_2015_2020.csv 
# to have a look at just the local greater houston area if
# we want to compare local to the whole of TX
df = pd.read_csv('Resources/CSV/Crime_Index_Greater_Houston_Area_2015_2020.csv')
df.head(10)

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
0,BELLVILLE PD,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,SEALY ISD PD,0,0,0,1,1,5,0,7,0,2015,Austin County
2,AUSTIN CO SO,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,WALLIS PD,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,SEALY PD,0,2,0,33,52,162,5,254,6336,2015,Austin County
5,DANBURY PD (NR),0,1,0,0,0,2,0,3,1767,2015,Brazoria County
6,ALVIN COMM COLLEGE PD,0,0,0,0,0,13,1,14,0,2015,Brazoria County
7,SURFSIDE BEACH PD,0,0,0,2,4,11,4,21,544,2015,Brazoria County
8,ANGLETON ISD PD,0,1,0,1,0,16,0,18,0,2015,Brazoria County
9,SWEENY PD,0,0,1,7,23,53,1,85,3780,2015,Brazoria County


### Data Types

In [4]:
# Data types
df.dtypes

AgencyName    object
Murder         int64
Rape           int64
Robbery        int64
Assault        int64
Burglary       int64
Larceny        int64
Auto Theft     int64
Total          int64
Population     int64
Year           int64
County        object
dtype: object

In [5]:
# Assigning 'Year' column as 'object' datatype tp prevent .sum() adding years together
df = df.astype({"Year":'object'})
df.dtypes

AgencyName    object
Murder         int64
Rape           int64
Robbery        int64
Assault        int64
Burglary       int64
Larceny        int64
Auto Theft     int64
Total          int64
Population     int64
Year          object
County        object
dtype: object

### Re-naming Columns

In [6]:
df.rename(columns = {"AgencyName":"Agency_Name", "Auto Theft":"Auto_Theft", "Total":"Total_Crime"}, inplace=True)

In [7]:
df.columns

Index(['Agency_Name', 'Murder', 'Rape', 'Robbery', 'Assault', 'Burglary',
       'Larceny', 'Auto_Theft', 'Total_Crime', 'Population', 'Year', 'County'],
      dtype='object')

### Finding how many agencies reported data per year

In [8]:
#Count the number of agencies reporting per county per year
agencies_count_per_year = df.groupby(["County", "Year"]).count()
agencies_count_per_year.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,Agency_Name,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Austin County,2015,5,5,5,5,5,5,5,5,5,5
Austin County,2016,5,5,5,5,5,5,5,5,5,5
Austin County,2017,5,5,5,5,5,5,5,5,5,5
Austin County,2018,5,5,5,5,5,5,5,5,5,5
Austin County,2019,5,5,5,5,5,5,5,5,5,5
Austin County,2020,5,5,5,5,5,5,5,5,5,5


In [9]:
#Create new df to hold count
agencies_count_per_year_df = pd.DataFrame(agencies_count_per_year).reset_index()
agencies_count_per_year_df.head(6)

Unnamed: 0,County,Year,Agency_Name,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population
0,Austin County,2015,5,5,5,5,5,5,5,5,5,5
1,Austin County,2016,5,5,5,5,5,5,5,5,5,5
2,Austin County,2017,5,5,5,5,5,5,5,5,5,5
3,Austin County,2018,5,5,5,5,5,5,5,5,5,5
4,Austin County,2019,5,5,5,5,5,5,5,5,5,5
5,Austin County,2020,5,5,5,5,5,5,5,5,5,5


In [10]:
# Column names
agencies_count_per_year_df.columns

Index(['County', 'Year', 'Agency_Name', 'Murder', 'Rape', 'Robbery', 'Assault',
       'Burglary', 'Larceny', 'Auto_Theft', 'Total_Crime', 'Population'],
      dtype='object')

In [11]:
# Rename AgencyName coulumn to Agency count
agencies_count_per_year_df.rename(columns = {"Agency_Name":"Agency_Count"}, inplace=True)
agencies_count_per_year_df.head()

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population
0,Austin County,2015,5,5,5,5,5,5,5,5,5,5
1,Austin County,2016,5,5,5,5,5,5,5,5,5,5
2,Austin County,2017,5,5,5,5,5,5,5,5,5,5
3,Austin County,2018,5,5,5,5,5,5,5,5,5,5
4,Austin County,2019,5,5,5,5,5,5,5,5,5,5


In [12]:
# Drop unnecessary columns
agencies_count_per_year_df = agencies_count_per_year_df.drop(columns=['Murder', 'Rape', 'Robbery', 'Assault',
       'Burglary', 'Larceny', 'Auto_Theft', 'Total_Crime', 'Population'])
agencies_count_per_year_df.head()

Unnamed: 0,County,Year,Agency_Count
0,Austin County,2015,5
1,Austin County,2016,5
2,Austin County,2017,5
3,Austin County,2018,5
4,Austin County,2019,5


### Crime Data .groupby(county)

In [13]:
# groupby Counties
counties = df.groupby('County')
counties.head(5)

Unnamed: 0,Agency_Name,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population,Year,County
0,BELLVILLE PD,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,SEALY ISD PD,0,0,0,1,1,5,0,7,0,2015,Austin County
2,AUSTIN CO SO,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,WALLIS PD,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,SEALY PD,0,2,0,33,52,162,5,254,6336,2015,Austin County
...,...,...,...,...,...,...,...,...,...,...,...,...
422,SAN JACINTO CO SO,0,27,3,26,195,198,73,522,27895,2017,San Jacinto County
423,HUNTSVILLE PD,1,24,24,113,102,490,86,840,41634,2017,Walker County
431,BRENHAM PD,1,7,3,115,61,200,23,410,17187,2017,Washington County
568,SAN JACINTO CO SO,0,16,2,24,144,168,48,402,28457,2018,San Jacinto County


In [14]:
# Dropping AgencyName Column so only counties are listed
counties_df = df.drop(columns = ['Agency_Name'])
counties_df.head(5)

Unnamed: 0,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population,Year,County
0,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,0,0,0,1,1,5,0,7,0,2015,Austin County
2,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,0,2,0,33,52,162,5,254,6336,2015,Austin County


In [15]:
# Reorder columns
new_column_order = ["County","Murder","Rape","Assault","Burglary","Larceny","Auto_Theft","Total_Crime","Population","Year"]
counties_df = counties_df[new_column_order]
counties_df.head(5)

Unnamed: 0,County,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population,Year
0,Austin County,0,3,6,17,45,0,73,4235,2015
1,Austin County,0,0,1,1,5,0,7,0,2015
2,Austin County,0,3,13,48,61,8,135,17499,2015
3,Austin County,0,0,0,3,10,0,13,1284,2015
4,Austin County,0,2,33,52,162,5,254,6336,2015


In [16]:
county_summary = counties_df.groupby(["County", "Year"]).sum()
county_summary.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austin County,2015,0,8,53,121,283,13,482,29354
Austin County,2016,0,6,43,136,239,31,466,29718
Austin County,2017,0,11,34,98,183,45,376,29963
Austin County,2018,0,11,32,82,138,29,295,29912
Austin County,2019,0,5,38,80,147,29,302,30009


In [17]:
county_summary_df = pd.DataFrame(county_summary).reset_index()
county_summary_df.head(6)

Unnamed: 0,County,Year,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population
0,Austin County,2015,0,8,53,121,283,13,482,29354
1,Austin County,2016,0,6,43,136,239,31,466,29718
2,Austin County,2017,0,11,34,98,183,45,376,29963
3,Austin County,2018,0,11,32,82,138,29,295,29912
4,Austin County,2019,0,5,38,80,147,29,302,30009
5,Austin County,2020,3,6,33,72,133,31,281,30121


In [18]:
county_summary_df.dtypes

County         object
Year            int64
Murder          int64
Rape            int64
Assault         int64
Burglary        int64
Larceny         int64
Auto_Theft      int64
Total_Crime     int64
Population      int64
dtype: object

### Violent and NonViolent Crimes

In [19]:
# Dataframe being used
county_summary_df.head()

Unnamed: 0,County,Year,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population
0,Austin County,2015,0,8,53,121,283,13,482,29354
1,Austin County,2016,0,6,43,136,239,31,466,29718
2,Austin County,2017,0,11,34,98,183,45,376,29963
3,Austin County,2018,0,11,32,82,138,29,295,29912
4,Austin County,2019,0,5,38,80,147,29,302,30009


In [20]:
# Classifing nonviolent vs violent offenses
nonviolent_offense = county_summary_df[["Auto_Theft","Larceny","Burglary"]]
violent_offense = county_summary_df[["Rape", "Murder","Assault"]]

In [21]:
# View nonviolent_offenses
nonviolent_offense.head()

Unnamed: 0,Auto_Theft,Larceny,Burglary
0,13,283,121
1,31,239,136
2,45,183,98
3,29,138,82
4,29,147,80


In [22]:
# View violent_offenses
violent_offense.head()

Unnamed: 0,Rape,Murder,Assault
0,8,0,53
1,6,0,43
2,11,0,34
3,11,0,32
4,5,0,38


In [23]:
# Sum of nonviolent_offenses
nonviolent_offenses = nonviolent_offense.sum(axis=1)
nonviolent_offenses

0       417
1       406
2       326
3       249
4       256
       ... 
97     1095
98      918
99      729
100     797
101     804
Length: 102, dtype: int64

In [24]:
# Sum of violent_offenses
violent_offenses = violent_offense.sum(axis=1)
violent_offenses

0       61
1       49
2       45
3       43
4       43
      ... 
97     156
98     145
99     131
100    122
101    130
Length: 102, dtype: int64

In [25]:
# creating summary df
summary_df= pd.DataFrame(county_summary_df)
summary_df.head()

Unnamed: 0,County,Year,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population
0,Austin County,2015,0,8,53,121,283,13,482,29354
1,Austin County,2016,0,6,43,136,239,31,466,29718
2,Austin County,2017,0,11,34,98,183,45,376,29963
3,Austin County,2018,0,11,32,82,138,29,295,29912
4,Austin County,2019,0,5,38,80,147,29,302,30009


In [26]:
# adding column Violent and Non-Viloent offenses
summary_df['Violent_Offenses'] = violent_offenses
summary_df['NonViolent_Offenses'] = nonviolent_offenses

In [27]:
#visualizing the dataframe
summary_df.head(10)

Unnamed: 0,County,Year,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Total_Crime,Population,Violent_Offenses,NonViolent_Offenses
0,Austin County,2015,0,8,53,121,283,13,482,29354,61,417
1,Austin County,2016,0,6,43,136,239,31,466,29718,49,406
2,Austin County,2017,0,11,34,98,183,45,376,29963,45,326
3,Austin County,2018,0,11,32,82,138,29,295,29912,43,249
4,Austin County,2019,0,5,38,80,147,29,302,30009,43,256
5,Austin County,2020,3,6,33,72,133,31,281,30121,42,236
6,Brazoria County,2015,4,114,352,1149,4475,359,6555,350739,470,5983
7,Brazoria County,2016,8,110,395,996,4443,404,6511,358003,513,5843
8,Brazoria County,2017,10,113,437,947,4111,370,6079,367132,560,5428
9,Brazoria County,2018,4,131,377,800,4241,390,6050,373587,512,5431


In [28]:
# Reordering the columns
summary_df = summary_df.reindex(columns = ["County", "Year", "Murder", "Rape", "Assault", "Burglary", "Larceny",
       "Auto_Theft", "Violent_Offenses", "NonViolent_Offenses", "Total_Crime", "Population"])
summary_df.head(10)

Unnamed: 0,County,Year,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Austin County,2015,0,8,53,121,283,13,61,417,482,29354
1,Austin County,2016,0,6,43,136,239,31,49,406,466,29718
2,Austin County,2017,0,11,34,98,183,45,45,326,376,29963
3,Austin County,2018,0,11,32,82,138,29,43,249,295,29912
4,Austin County,2019,0,5,38,80,147,29,43,256,302,30009
5,Austin County,2020,3,6,33,72,133,31,42,236,281,30121
6,Brazoria County,2015,4,114,352,1149,4475,359,470,5983,6555,350739
7,Brazoria County,2016,8,110,395,996,4443,404,513,5843,6511,358003
8,Brazoria County,2017,10,113,437,947,4111,370,560,5428,6079,367132
9,Brazoria County,2018,4,131,377,800,4241,390,512,5431,6050,373587


### Create main dataframe, crime_data_df

In [29]:
houston_area_crime_data_df = pd.merge(agencies_count_per_year_df,summary_df)
houston_area_crime_data_df.head()

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Austin County,2015,5,0,8,53,121,283,13,61,417,482,29354
1,Austin County,2016,5,0,6,43,136,239,31,49,406,466,29718
2,Austin County,2017,5,0,11,34,98,183,45,45,326,376,29963
3,Austin County,2018,5,0,11,32,82,138,29,43,249,295,29912
4,Austin County,2019,5,0,5,38,80,147,29,43,256,302,30009


### Looking at just Harris County

In [30]:
# Assigning 'Year' column as 'object' datatype tp prevent .sum() adding years together
houston_area_crime_data_df = houston_area_crime_data_df.astype({"Year":'object'},{"County":'object'})
houston_area_crime_data_df.dtypes

County                 object
Year                   object
Agency_Count            int64
Murder                  int64
Rape                    int64
Assault                 int64
Burglary                int64
Larceny                 int64
Auto_Theft              int64
Violent_Offenses        int64
NonViolent_Offenses     int64
Total_Crime             int64
Population              int64
dtype: object

In [31]:
# selecting just harris county
harris_county = houston_area_crime_data_df.groupby(houston_area_crime_data_df["County"]).get_group('Harris County')

In [32]:
harris_county_df = pd.DataFrame(harris_county)
harris_county_df

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
42,Harris County,2015,42,404,1681,16599,32350,108284,21777,18684,162411,195386,4564664
43,Harris County,2016,44,409,2011,18706,30221,112200,20484,21126,162905,197686,4646508
44,Harris County,2017,44,376,2186,20660,27348,108968,18496,23222,154812,191319,4702468
45,Harris County,2018,45,383,2294,21032,25594,105155,19563,23709,150312,185574,4753437
46,Harris County,2019,45,398,2618,20005,26073,113162,20849,23021,160084,195047,4776485
47,Harris County,2020,45,566,2571,26095,24197,109103,23768,29232,157068,197590,4799254


### Removing Harris County 

### Outter Houston counties

In [33]:
outer_hou_df = houston_area_crime_data_df[houston_area_crime_data_df["County"].str.contains("Harris County")==False]
outer_hou_df.head(5)

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Austin County,2015,5,0,8,53,121,283,13,61,417,482,29354
1,Austin County,2016,5,0,6,43,136,239,31,49,406,466,29718
2,Austin County,2017,5,0,11,34,98,183,45,45,326,376,29963
3,Austin County,2018,5,0,11,32,82,138,29,43,249,295,29912
4,Austin County,2019,5,0,5,38,80,147,29,43,256,302,30009


In [34]:
outer_hou_df = pd.DataFrame(outer_hou_df)
outer_hou_df.head(5)

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Austin County,2015,5,0,8,53,121,283,13,61,417,482,29354
1,Austin County,2016,5,0,6,43,136,239,31,49,406,466,29718
2,Austin County,2017,5,0,11,34,98,183,45,45,326,376,29963
3,Austin County,2018,5,0,11,32,82,138,29,43,249,295,29912
4,Austin County,2019,5,0,5,38,80,147,29,43,256,302,30009


### Export to CSV files

In [35]:
#Export Houston Crime Data to CSV
houston_area_crime_data_df.to_csv('Resources/CSV/houston_area_crime_data.csv', index=False)

In [36]:
# Export Harris County Data to CSV
harris_county_df.to_csv('Resources/CSV/harris_crime_data.csv', index=False)

In [37]:
# Exporting outer houston county data to CSV
outer_hou_df.to_csv('Resources/CSV/outer_hou_counties.csv', index=False)

### Grabbing sample data for provisional ML model

In [38]:
sample_houston_area_crime_df = houston_area_crime_data_df.sample(frac = 0.25)
sample_houston_area_crime_df.head()

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
65,Montgomery County,2020,12,17,116,792,1109,5409,762,925,7280,8361,616505
97,Wharton County,2016,3,1,23,132,217,815,63,156,1095,1268,41398
30,Galveston County,2015,17,14,192,404,1521,6242,572,610,8335,9177,333684
6,Brazoria County,2015,21,4,114,352,1149,4475,359,470,5983,6555,350739
86,Waller County,2017,6,2,33,108,210,404,53,143,667,826,50195


In [39]:
sample_harris_df = harris_county_df.sample(frac = 0.5)
sample_harris_df.head()

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
46,Harris County,2019,45,398,2618,20005,26073,113162,20849,23021,160084,195047,4776485
42,Harris County,2015,42,404,1681,16599,32350,108284,21777,18684,162411,195386,4564664
47,Harris County,2020,45,566,2571,26095,24197,109103,23768,29232,157068,197590,4799254


In [40]:
sample_outer_hou_df = outer_hou_df.sample(frac = 0.25)
sample_outer_hou_df.head()

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
70,Polk County,2019,4,6,43,55,260,667,133,104,1060,1173,50320
19,Chambers County,2016,3,3,19,109,149,621,76,131,846,995,35244
21,Chambers County,2018,3,3,6,44,126,513,85,53,724,787,37983
28,Fort Bend County,2019,11,15,224,1007,1252,7186,569,1246,9007,10484,772362
7,Brazoria County,2016,22,8,110,395,996,4443,404,513,5843,6511,358003


### Export Sample data to SQL

In [41]:
#Export to SQL
sample_houston_area_crime_df.to_sql(name='houston_area_yearly_crime', con=engine, index=False, if_exists='replace')

In [42]:
#Export to SQL
sample_harris_df.to_sql(name='harris_crime', con=engine, index=False, if_exists='replace')

In [43]:
#Export to SQL
sample_outer_hou_df.to_sql(name='outer_houston_county_crime', con=engine, index=False, if_exists='replace')

### Export Houston DFs to SQL

In [44]:
houston_area_crime_data_df.to_sql(name='houston_area_crime_data', con=engine, index=False, if_exists='replace')

In [45]:
harris_county_df.to_sql(name='harris_county', con=engine, index=False, if_exists='replace')

In [46]:
outer_hou_df.to_sql(name='outer_houston_counties', con=engine, index=False, if_exists='replace')