# Grouping and Summing counties to get totals

In [1]:
# Import dependencies
import os 
import pandas as pd
import matplotlib as plt
import regex as re
import numpy as np
from sqlalchemy import create_engine
import psycopg2
from scipy.stats import poisson

In [2]:
# SQL dependencies
from config import db_password
db_string = f"postgresql://bhmcd:{db_password}@crime-analysis.cnoedyl0m22c.us-east-2.rds.amazonaws.com:5432/Crime_AnalysisDB"
engine = create_engine(db_string)

In [3]:
# Import CSV file
df = pd.read_csv('Resources/CSV/houston_area_crime_data.csv')
df.head(10)

Unnamed: 0,County,Year,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Austin County,2015,5,0,8,53,121,283,13,61,417,482,29354
1,Austin County,2016,5,0,6,43,136,239,31,49,406,466,29718
2,Austin County,2017,5,0,11,34,98,183,45,45,326,376,29963
3,Austin County,2018,5,0,11,32,82,138,29,43,249,295,29912
4,Austin County,2019,5,0,5,38,80,147,29,43,256,302,30009
5,Austin County,2020,5,3,6,33,72,133,31,42,236,281,30121
6,Brazoria County,2015,21,4,114,352,1149,4475,359,470,5983,6555,350739
7,Brazoria County,2016,22,8,110,395,996,4443,404,513,5843,6511,358003
8,Brazoria County,2017,22,10,113,437,947,4111,370,560,5428,6079,367132
9,Brazoria County,2018,21,4,131,377,800,4241,390,512,5431,6050,373587


In [4]:
# Setting column names to lowercase
df.columns= df.columns.str.lower()

In [5]:
# Get number of value counts to divide population
df.county.value_counts()

Austin County         6
Matagorda County      6
Washington County     6
Waller County         6
Walker County         6
San Jacinto County    6
Polk County           6
Montgomery County     6
Liberty County        6
Brazoria County       6
Harris County         6
Grimes County         6
Galveston County      6
Fort Bend County      6
Chambers County       6
Brazos County         6
Wharton County        6
Name: county, dtype: int64

In [6]:
# Check types
df.dtypes

county                 object
year                    int64
agency_count            int64
murder                  int64
rape                    int64
assault                 int64
burglary                int64
larceny                 int64
auto_theft              int64
violent_offenses        int64
nonviolent_offenses     int64
total_crime             int64
population              int64
dtype: object

## Grouping data by county and summing

In [7]:
# Group crimes by county and sum
houston_county_sum_df = df.groupby('county').sum()
houston_county_sum_df['population'] = houston_county_sum_df['population'].div(6).map('{:.0f}'.format)
houston_county_sum_df.head(10)

Unnamed: 0_level_0,year,agency_count,murder,rape,assault,burglary,larceny,auto_theft,violent_offenses,nonviolent_offenses,total_crime,population
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Austin County,12105,30,3,47,233,589,1123,178,283,1890,2202,29846
Brazoria County,12105,131,49,709,2417,5602,26331,2500,3175,34433,38304,369691
Brazos County,12105,24,38,895,2556,4993,24075,1886,3489,30954,35089,224101
Chambers County,12105,18,14,92,484,828,3314,493,590,4635,5300,37318
Fort Bend County,12105,66,107,1149,5471,8776,41467,3279,6727,53522,61945,739020
Galveston County,12105,101,117,1373,3026,8005,35329,4266,4516,47600,53399,347699
Grimes County,12105,12,10,55,279,743,1218,227,344,2188,2574,28081
Harris County,12105,265,2536,13361,123097,165783,656872,124937,138994,947592,1162602,4707136
Liberty County,12105,23,29,277,1160,1878,5812,1043,1466,8733,10330,74284
Matagorda County,12105,26,20,148,638,1601,4798,240,806,6639,7567,36670


In [8]:
# Rename total_crime column
houston_county_sum_df.rename(columns={'total_crime':'total_crimes'}, inplace=True)

In [9]:
# Drop the year column
houston_county_sum_df.drop(['year'], axis = 1, inplace = True)
houston_county_sum_df.head()

Unnamed: 0_level_0,agency_count,murder,rape,assault,burglary,larceny,auto_theft,violent_offenses,nonviolent_offenses,total_crimes,population
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Austin County,30,3,47,233,589,1123,178,283,1890,2202,29846
Brazoria County,131,49,709,2417,5602,26331,2500,3175,34433,38304,369691
Brazos County,24,38,895,2556,4993,24075,1886,3489,30954,35089,224101
Chambers County,18,14,92,484,828,3314,493,590,4635,5300,37318
Fort Bend County,66,107,1149,5471,8776,41467,3279,6727,53522,61945,739020


In [10]:
# Remove index column name
houston_county_sum_df.index.name = None

In [11]:
# Check final DF
houston_county_sum_df.head(10)

Unnamed: 0,agency_count,murder,rape,assault,burglary,larceny,auto_theft,violent_offenses,nonviolent_offenses,total_crimes,population
Austin County,30,3,47,233,589,1123,178,283,1890,2202,29846
Brazoria County,131,49,709,2417,5602,26331,2500,3175,34433,38304,369691
Brazos County,24,38,895,2556,4993,24075,1886,3489,30954,35089,224101
Chambers County,18,14,92,484,828,3314,493,590,4635,5300,37318
Fort Bend County,66,107,1149,5471,8776,41467,3279,6727,53522,61945,739020
Galveston County,101,117,1373,3026,8005,35329,4266,4516,47600,53399,347699
Grimes County,12,10,55,279,743,1218,227,344,2188,2574,28081
Harris County,265,2536,13361,123097,165783,656872,124937,138994,947592,1162602,4707136
Liberty County,23,29,277,1160,1878,5812,1043,1466,8733,10330,74284
Matagorda County,26,20,148,638,1601,4798,240,806,6639,7567,36670


In [12]:
# Export County Summary to CSV
houston_county_sum_df.to_csv('Resources/CSV/houston_county_totals.csv', index=True)

In [13]:
# Export to SQL DB
houston_county_sum_df.to_sql(name='houston_county_totals', con=engine, index=True, if_exists='replace')

17

## Group and sum all texas counties

In [14]:
# Import CSV file
texas_county_sum_df = pd.read_csv('Resources/CSV/crime_summary.csv')
texas_county_sum_df

Unnamed: 0,County,Year,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Anderson County,2015,9,38,195,319,749,75,242,1143,1407,57560
1,Anderson County,2016,2,44,237,259,485,94,283,838,1145,57250
2,Anderson County,2017,2,25,153,278,536,64,180,878,1078,57569
3,Anderson County,2018,1,8,96,198,531,62,105,791,907,57491
4,Anderson County,2019,2,23,100,249,514,82,125,845,984,57657
...,...,...,...,...,...,...,...,...,...,...,...,...
1507,Zavala County,2016,0,2,31,67,92,5,33,164,199,12306
1508,Zavala County,2017,0,3,52,163,203,18,55,384,444,26443
1509,Zavala County,2018,0,6,13,145,143,11,19,299,321,26263
1510,Zavala County,2019,0,4,17,41,27,5,21,73,94,11940


In [15]:
# Setting column names to lowercase
texas_county_sum_df.columns= texas_county_sum_df.columns.str.lower()

# Drop year column
texas_county_sum_df.drop('year', axis = 1, inplace = True)

In [16]:
# Sum all texas counties in one DF
texas_county_sum_df = texas_county_sum_df.groupby('county').sum()
texas_county_sum_df['population'] = texas_county_sum_df['population'].div(6).map('{:.0f}'.format)
texas_county_sum_df

Unnamed: 0_level_0,murder,rape,assault,burglary,larceny,auto_theft,violent_offenses,nonviolent_offenses,total_crime,population
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Anderson County,20,153,890,1511,3271,447,1063,5229,6390,57500
Andrews County,3,84,359,370,1491,200,446,2061,2528,18508
Angelina County,18,237,1211,3702,9420,937,1466,14059,15764,87485
Aransas County,12,130,734,2088,4826,366,876,7280,8213,32674
Archer County,0,7,31,34,95,18,38,147,186,3472
...,...,...,...,...,...,...,...,...,...,...
Wood County,4,137,281,856,1765,230,422,2851,3292,45169
Yoakum County,0,19,31,131,336,40,50,507,557,8625
Young County,5,46,120,356,761,84,171,1201,1391,18033
Zapata County,1,2,71,161,311,12,74,484,567,7112


In [17]:
# Export County Summary to CSV
texas_county_sum_df.to_csv('Resources/CSV/texas_county_totals.csv', index=True)

In [18]:
# Export to SQL DB
texas_county_sum_df.to_sql(name='texas_county_totals', con=engine, index=True, if_exists='replace')

254