Houston crime data is published by the Houston Police Department and can be downloaded here:
https://www.houstontx.gov/police/cs/Monthly_Crime_Data_by_Street_and_Police_Beat.htm

#### Import Libraries

In [1]:
import pandas as pd
import csv
import warnings
import re
import math
import os

# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [2]:
# Relative path of the Excel workbook holding the raw crime records
input_file = "./resources/Houston_TX_Crime_Stats_2022.xlsx"

# Read the workbook into the working DataFrame
df = pd.read_excel(io=input_file)

In [3]:
# Drop rows whose ZIPCode is missing; they cannot be attributed to a ZIP code.
# `subset` is passed as a list: ('ZIPCode') is only a parenthesised string,
# not a tuple, and older pandas releases reject a bare string here.
df = df.dropna(subset=['ZIPCode'])

In [4]:
# Keep only the four columns used by the analysis below
df = df.loc[:, ['NIBRSClass', 'NIBRSDescription', 'Premise', 'ZIPCode']]

#### Analyze the data

In [5]:
# Number of distinct ZIPCode values (missing values are not counted by default)
zip_count = df.loc[:, 'ZIPCode'].nunique()
print(f'There are {zip_count} zip codes in this table')

There are 185 zip codes in this table


In [6]:
# Number of distinct values in each remaining column (computed column-wise,
# ignoring missing values)
counts = df.nunique(axis=0, dropna=True)
counts

NIBRSClass           60
NIBRSDescription     61
Premise              45
ZIPCode             185
dtype: int64

#### Calculate the crime rate for every Zip Code in Houston

In [20]:
# Build one row per ZIP code holding its crime count and its share (in percent)
# of all crime records.
#
# The raw ZIPCode column can mix integers, floats and ZIP+4 strings, so every
# value is first normalised to a plain 5-digit integer. Counting happens AFTER
# normalisation so that e.g. 77002, 77002.0 and '77002-1234' are reported as a
# single ZIP code instead of several rows with split counts.
cnames = ['zip', 'crimeCount', 'crimePercentage']

# Total number of crime records; denominator of the percentage column
length = len(df)

# Raw (un-normalised) distinct ZIP values, kept for inspection
crimeZip = df['ZIPCode'].unique().tolist()

# 5-digit part of a ZIP+4 value such as '77002-1234'; the separator may be a
# hyphen, an em dash or an en dash
zip_plus4_re = re.compile(r'(\d{5})[-—–]')


def _normalize_zip(raw_zip):
    """Return ``raw_zip`` as a plain integer ZIP code.

    Parameters
    ----------
    raw_zip : str, int or float
        A value taken from the ``ZIPCode`` column.

    Returns
    -------
    int
        For strings containing a ZIP+4 value, the first 5-digit prefix found;
        for other strings, the integer they spell out; for numbers, the value
        rounded up to an integer (a no-op for whole numbers such as 77002.0).

    Raises
    ------
    ValueError
        If a string contains neither a ZIP+4 value nor a plain integer.
    """
    if isinstance(raw_zip, str):
        text = raw_zip.strip()
        plus4 = zip_plus4_re.search(text)
        if plus4:
            # Use the first match only; joining several matches would
            # concatenate unrelated digit groups into one bogus number.
            return int(plus4.group(1))
        try:
            return int(text)
        except ValueError as exc:
            raise ValueError(f'Unrecognised ZIP code value: {raw_zip!r}') from exc
    return math.ceil(raw_zip)


# Normalise every record, then count records per ZIP code in a single pass.
# sort=False keeps the ZIP codes in order of first appearance in the data.
zip_codes = df['ZIPCode'].map(_normalize_zip)
crime_counts = zip_codes.groupby(zip_codes, sort=False).size()

# Assemble the result in one go; DataFrame.append was removed in pandas 2.0
# and growing a frame row by row is quadratic.
df_crimeByZip = pd.DataFrame(
    {
        'zip': crime_counts.index.to_numpy(),
        'crimeCount': crime_counts.to_numpy(),
        'crimePercentage': crime_counts.to_numpy() / length * 100,
    },
    columns=cnames,
)

df_crimeByZip['zip'] = df_crimeByZip['zip'].astype('int')
df_crimeByZip['crimeCount'] = df_crimeByZip['crimeCount'].astype('int')

#### Save to a csv file

In [21]:
# Destination folder and file name of the per-ZIP summary
ou_path = './data/'
ou_name = 'Houston_TX_Crime_Stats_per_ZipCode_2022.csv'

# Make sure the destination folder is present before writing
os.makedirs(ou_path, exist_ok=True)

# Write the summary, leaving out the DataFrame index
df_crimeByZip.to_csv(ou_path + ou_name, index=False)