# Grouping and Summing counties to get totals

In [18]:
# Import dependencies
import os 
import pandas as pd
import matplotlib as plt
import regex as re
import numpy as np
from sqlalchemy import create_engine
import psycopg2
from scipy.stats import poisson

In [19]:
# SQL dependencies
from config import db_password
db_string = f"postgresql://bhmcd:{db_password}@crime-analysis.cnoedyl0m22c.us-east-2.rds.amazonaws.com:5432/Crime_AnalysisDB"
engine = create_engine(db_string)

In [20]:
# Import CSV file
df = pd.read_csv('Resources/CSV/Crime_Index_Greater_Houston_Area_2015_2020.csv')
df.head(10)

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
0,BELLVILLE PD,0,3,2,6,17,45,0,73,4235,2015,Austin County
1,SEALY ISD PD,0,0,0,1,1,5,0,7,0,2015,Austin County
2,AUSTIN CO SO,0,3,2,13,48,61,8,135,17499,2015,Austin County
3,WALLIS PD,0,0,0,0,3,10,0,13,1284,2015,Austin County
4,SEALY PD,0,2,0,33,52,162,5,254,6336,2015,Austin County
5,DANBURY PD (NR),0,1,0,0,0,2,0,3,1767,2015,Brazoria County
6,ALVIN COMM COLLEGE PD,0,0,0,0,0,13,1,14,0,2015,Brazoria County
7,SURFSIDE BEACH PD,0,0,0,2,4,11,4,21,544,2015,Brazoria County
8,ANGLETON ISD PD,0,1,0,1,0,16,0,18,0,2015,Brazoria County
9,SWEENY PD,0,0,1,7,23,53,1,85,3780,2015,Brazoria County


In [21]:
# Setting column names to lowercase
df.columns= df.columns.str.lower()

In [22]:
# Get number of value counts to divide population
df.county.value_counts()

Harris County         265
Brazoria County       131
Galveston County      101
Montgomery County      74
Fort Bend County       66
Waller County          35
Austin County          30
Matagorda County       26
Brazos County          24
Polk County            24
Liberty County         23
Wharton County         20
Chambers County        18
Grimes County          12
Walker County          12
Washington County      12
San Jacinto County      6
Name: county, dtype: int64

In [23]:
# Check types
df.dtypes

agencyname    object
murder         int64
rape           int64
robbery        int64
assault        int64
burglary       int64
larceny        int64
auto theft     int64
total          int64
population     int64
year           int64
county        object
dtype: object

## Grouping data by county and summing

In [24]:
# Group crimes by county and sum
county_sum_df = df.groupby('county').sum()
county_sum_df['population'] = county_sum_df['population'].div(6).map('{:.0f}'.format)
county_sum_df.head(10)

Unnamed: 0_level_0,murder,rape,robbery,assault,burglary,larceny,auto theft,total,population,year
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austin County,3,47,29,233,589,1123,178,2202,29846,60525
Brazoria County,49,709,696,2417,5602,26331,2500,38304,369691,264296
Brazos County,38,895,646,2556,4993,24075,1886,35089,224101,48420
Chambers County,14,92,75,484,828,3314,493,5300,37318,36315
Fort Bend County,107,1149,1696,5471,8776,41467,3279,61945,739020,133155
Galveston County,117,1373,1283,3026,8005,35329,4266,53399,347699,203766
Grimes County,10,55,42,279,743,1218,227,2574,28081,24210
Harris County,2536,13361,76016,123097,165783,656872,124937,1162602,4707136,534647
Liberty County,29,277,131,1160,1878,5812,1043,10330,74284,46403
Matagorda County,20,148,122,638,1601,4798,240,7567,36670,52459


In [25]:
# Drop the year column
county_sum_df.drop(['year'], axis = 1, inplace = True)
county_sum_df.head()

Unnamed: 0_level_0,murder,rape,robbery,assault,burglary,larceny,auto theft,total,population
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austin County,3,47,29,233,589,1123,178,2202,29846
Brazoria County,49,709,696,2417,5602,26331,2500,38304,369691
Brazos County,38,895,646,2556,4993,24075,1886,35089,224101
Chambers County,14,92,75,484,828,3314,493,5300,37318
Fort Bend County,107,1149,1696,5471,8776,41467,3279,61945,739020


In [26]:
# Check final DF
county_sum_df.head()

Unnamed: 0_level_0,murder,rape,robbery,assault,burglary,larceny,auto theft,total,population
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austin County,3,47,29,233,589,1123,178,2202,29846
Brazoria County,49,709,696,2417,5602,26331,2500,38304,369691
Brazos County,38,895,646,2556,4993,24075,1886,35089,224101
Chambers County,14,92,75,484,828,3314,493,5300,37318
Fort Bend County,107,1149,1696,5471,8776,41467,3279,61945,739020


In [27]:
# Export County Summary to CSV
county_sum_df.to_csv('Resources/CSV/county_totals.csv', index=True)

In [28]:
# Export to SQL DB
county_sum_df.to_sql(name='county_totals', con=engine, index=True, if_exists='replace')

17

## Group and sum all texas counties

In [29]:
# Import CSV file
texas_county_sum_df = pd.read_csv('Resources/CSV/Texas-Crime-Index-2015-2020.csv')
texas_county_sum_df

Unnamed: 0,AgencyName,Murder,Rape,Robbery,Assault,Burglary,Larceny,Auto Theft,Total,Population,Year,County
0,FRANKSTON PD,0,2,1,6,14,31,2,56,1174,2015,Anderson County
1,ANDERSON CO SO,7,11,3,52,174,184,25,456,38087,2015,Anderson County
2,PALESTINE PD,2,25,18,137,131,534,48,895,18299,2015,Anderson County
3,ANDREWS CO SO,0,6,2,8,12,68,20,116,4413,2015,Andrews County
4,ANDREWS PD,0,12,10,56,48,245,31,402,13835,2015,Andrews County
...,...,...,...,...,...,...,...,...,...,...,...,...
6207,OLNEY PD,0,2,0,11,61,21,4,99,3080,2020,Young County
6208,YOUNG CO SO,0,4,0,4,7,14,2,31,6220,2020,Young County
6209,ZAPATA CO SO,0,0,3,13,36,72,0,124,14142,2020,Zapata County
6210,ZAVALA CO SO,1,1,0,9,12,24,8,55,4655,2020,Zavala County


In [30]:
# Setting column names to lowercase
texas_county_sum_df.columns= texas_county_sum_df.columns.str.lower()

In [31]:
# Sum all texas counties in one DF
texas_county_sum_df = texas_county_sum_df.groupby('county').sum()
texas_county_sum_df['population'] = texas_county_sum_df['population'].div(6).map('{:.0f}'.format)
texas_county_sum_df

Unnamed: 0_level_0,murder,rape,robbery,assault,burglary,larceny,auto theft,total,population,year
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Anderson County,20,153,98,890,1511,3271,447,6390,57500,36315
Andrews County,3,84,21,359,370,1491,200,2528,18508,24210
Angelina County,18,237,239,1211,3702,9420,937,15764,87485,66582
Aransas County,12,130,57,734,2088,4826,366,8213,32674,48420
Archer County,0,7,1,31,34,95,18,186,3472,12105
...,...,...,...,...,...,...,...,...,...,...
Wood County,4,137,19,281,856,1765,230,3292,45169,64564
Yoakum County,0,19,0,31,131,336,40,557,8625,24210
Young County,5,46,19,120,356,761,84,1391,18033,36315
Zapata County,1,2,9,71,161,311,12,567,7112,6054


In [32]:
# Export County Summary to CSV
texas_county_sum_df.to_csv('Resources/CSV/texas_county_totals.csv', index=True)

In [33]:
# Export to SQL DB
texas_county_sum_df.to_sql(name='texas_county_totals', con=engine, index=True, if_exists='replace')

254

Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "C:\Users\donmc\anaconda3\lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\donmc\anaconda3\lib\asyncio\selector_events.py", line 115, in _read_from_self
    data = self._ssock.recv(4096)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "C:\Users\donmc\anaconda3\lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\donmc\anaconda3\lib\asyncio\selector_events.py", line 115, in _read_from_self
    data = self._ssock.recv(4096)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed b