<a href="https://colab.research.google.com/github/IsaacFigNewton/Analyzing-Hate-Crime-Data/blob/main/Hate_Crime_Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import Stuff

###Import all libraries

In [1]:
import pandas as pd
import sklearn as sk
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import re

###Import datasets

In [2]:
crime_df = pd.read_csv("https://raw.githubusercontent.com/IsaacFigNewton/Analyzing-Hate-Crime-Data/main/hate_crime/hate_crime.csv", on_bad_lines='skip')
city_demo_df = pd.read_csv("https://raw.githubusercontent.com/IsaacFigNewton/Analyzing-Hate-Crime-Data/main/demographics/city/ACSST1Y2022.S0101-Data.csv", on_bad_lines='skip')
county_demo_df = pd.read_csv("https://raw.githubusercontent.com/IsaacFigNewton/Analyzing-Hate-Crime-Data/main/demographics/county/ACSDP1Y2022.DP05-Data.csv", on_bad_lines='skip')

#Data Cleaning

In [3]:
# use the entries of the first row as the column headers for easier management
def fixHeaders(df):
  new_headers = df.iloc[0]
  new_df = df[1:]
  new_df.columns = new_headers
  return new_df

In [4]:
#fix the headers
city_demo_df = fixHeaders(city_demo_df)
county_demo_df = fixHeaders(county_demo_df)

In [5]:
def split_area_name(area):
    result = [np.nan, np.nan, np.nan]
    if ", " in area:
        result = area.split(", ") + [np.nan]
    if " city" in result[0].lower():
        result[0] = result[0][0:-5]
        result[2] = result[1]
        result[1] = "City"
    elif " county" in result[0].lower():
        result[0] = result[0][0:-7]
        result[2] = result[1]
        result[1] = "County"

    return result + [np.nan] * (3 - len(result))

def splitArea(df):
    df[["pug_agency_name", "agency_type_name", "state_name"]] = df["Geographic Area Name"].map(split_area_name).apply(pd.Series)

In [6]:
splitArea(city_demo_df)
splitArea(county_demo_df)

In [7]:
#only consider 2022 crime data from cities and counties
crime_df = crime_df[(crime_df['data_year'] == 2022) & ((crime_df['agency_type_name'] == "City") | (crime_df['agency_type_name'] == "County"))]

###Combine the datasets

In [8]:
city_demo_df.head()

Unnamed: 0,Geography,Geographic Area Name,Total population,Margin of Error!!Total population,Under 5 years,Margin of Error!!Under 5 years,5 to 9 years,Margin of Error!!5 to 9 years,10 to 14 years,Margin of Error!!10 to 14 years,...,Percent Female!!Child dependency ratio,Margin of Error!!Percent Female!!Child dependency ratio,Percent Female!!PERCENT ALLOCATED!!Sex,Margin of Error!!Percent Female!!PERCENT ALLOCATED!!Sex,Percent Female!!PERCENT ALLOCATED!!Age,Margin of Error!!Percent Female!!PERCENT ALLOCATED!!Age,NaN,pug_agency_name,agency_type_name,state_name
1,1600000US0103076,"Auburn city, Alabama",80009,20,3657,1311,3809,1134,4348,1136,...,(X),(X),(X),(X),(X),(X),,Auburn,City,Alabama
2,1600000US0107000,"Birmingham city, Alabama",196353,868,11495,2020,11631,2173,9404,2476,...,(X),(X),(X),(X),(X),(X),,Birmingham,City,Alabama
3,1600000US0121184,"Dothan city, Alabama",70524,678,4549,468,5059,662,4146,592,...,(X),(X),(X),(X),(X),(X),,Dothan,City,Alabama
4,1600000US0135896,"Hoover city, Alabama",92427,45,5778,1406,5805,1516,7377,2051,...,(X),(X),(X),(X),(X),(X),,Hoover,City,Alabama
5,1600000US0137000,"Huntsville city, Alabama",222363,2301,12180,1698,13883,1837,10494,1917,...,(X),(X),(X),(X),(X),(X),,Huntsville,City,Alabama


In [9]:
county_demo_df.head()

Unnamed: 0,Geography,Geographic Area Name,Total population,Margin of Error!!Total population,Male,Margin of Error!!Male,Female,Margin of Error!!Female,Sex ratio (males per 100 females),Margin of Error!!Sex ratio (males per 100 females),...,"Percent!!CITIZEN, VOTING AGE POPULATION!!Citizen, 18 and over population","Percent Margin of Error!!CITIZEN, VOTING AGE POPULATION!!Citizen, 18 and over population","Percent!!CITIZEN, VOTING AGE POPULATION!!Citizen, 18 and over population!!Male","Percent Margin of Error!!CITIZEN, VOTING AGE POPULATION!!Citizen, 18 and over population!!Male","Percent!!CITIZEN, VOTING AGE POPULATION!!Citizen, 18 and over population!!Female","Percent Margin of Error!!CITIZEN, VOTING AGE POPULATION!!Citizen, 18 and over population!!Female",NaN,pug_agency_name,agency_type_name,state_name
1,0500000US01003,"Baldwin County, Alabama",246435,*****,120664,1277,125771,1277,95.9,2.0,...,189876,(X),48.1,0.4,51.9,0.4,,Baldwin,County,Alabama
2,0500000US01015,"Calhoun County, Alabama",115788,*****,56554,791,59234,791,95.5,2.6,...,90642,(X),47.8,0.3,52.2,0.3,,Calhoun,County,Alabama
3,0500000US01043,"Cullman County, Alabama",90665,*****,45051,622,45614,622,98.8,2.7,...,69096,(X),49.0,0.6,51.0,0.6,,Cullman,County,Alabama
4,0500000US01049,"DeKalb County, Alabama",71998,*****,36024,566,35974,566,100.1,3.2,...,52110,(X),49.6,0.9,50.4,0.9,,DeKalb,County,Alabama
5,0500000US01051,"Elmore County, Alabama",89563,*****,43448,541,46115,541,94.2,2.3,...,69952,(X),47.8,0.5,52.2,0.5,,Elmore,County,Alabama


In [10]:
# Prompts used:
# Please write a Python mapping function to convert relevant pandas dataframe column headers from the S0101 dataset to equivalent ones from the DP05 dataset.
# Please redo it, using the natural language codes for DP05 and S0101 instead of their alphanumeric codes in the map
# Thank you, please fill out the map with the remaining headers and include code for combining columns as needed

# Map the S0101 dataset's columns to those of DP05def map_dp05_to_s0101(column_name)
def preprocess_df(df_s0101):
    formatted_df = df_s0101.copy(deep=True)

    formatted_df['25 to 34 years'] = df_s0101['25 to 29 years'] + df_s0101['30 to 34 years']
    formatted_df['35 to 44 years'] = df_s0101['35 to 39 years'] + df_s0101['40 to 44 years']
    formatted_df['45 to 54 years'] = df_s0101['45 to 49 years'] + df_s0101['50 to 54 years']
    formatted_df['65 to 74 years'] = df_s0101['65 to 69 years'] + df_s0101['70 to 74 years']
    formatted_df['75 to 84 years'] = df_s0101['75 to 79 years'] + df_s0101['80 to 84 years']

    return formatted_df

city_demo_df = preprocess_df(city_demo_df)
city_demo_df

Unnamed: 0,Geography,Geographic Area Name,Total population,Margin of Error!!Total population,Under 5 years,Margin of Error!!Under 5 years,5 to 9 years,Margin of Error!!5 to 9 years,10 to 14 years,Margin of Error!!10 to 14 years,...,Margin of Error!!Percent Female!!PERCENT ALLOCATED!!Age,NaN,pug_agency_name,agency_type_name,state_name,25 to 34 years,35 to 44 years,45 to 54 years,65 to 74 years,75 to 84 years
1,1600000US0103076,"Auburn city, Alabama",80009,20,3657,1311,3809,1134,4348,1136,...,(X),,Auburn,City,Alabama,59603991,46094276,40784186,16702139,1612639
2,1600000US0107000,"Birmingham city, Alabama",196353,868,11495,2020,11631,2173,9404,2476,...,(X),,Birmingham,City,Alabama,1948315973,961512416,1099810140,119177921,51592289
3,1600000US0121184,"Dothan city, Alabama",70524,678,4549,468,5059,662,4146,592,...,(X),,Dothan,City,Alabama,48975128,42864013,30574909,46243535,23431651
4,1600000US0135896,"Hoover city, Alabama",92427,45,5778,1406,5805,1516,7377,2051,...,(X),,Hoover,City,Alabama,31505578,68735639,69805218,54203718,29162315
5,1600000US0137000,"Huntsville city, Alabama",222363,2301,12180,1698,13883,1837,10494,1917,...,(X),,Huntsville,City,Alabama,1694717466,1437114448,1309712656,116167971,77075137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
642,1600000US7210334,"Caguas zona urbana, Puerto Rico",68451,4611,2358,1088,2821,911,3158,1093,...,(X),,Caguas zona urbana,Puerto Rico,,45264659,48372873,37104384,47344547,51272382
643,1600000US7214290,"Carolina zona urbana, Puerto Rico",134842,3835,4403,276,6892,1150,5470,1144,...,(X),,Carolina zona urbana,Puerto Rico,,96279229,69768465,80338527,70388511,81476675
644,1600000US7232522,"Guaynabo zona urbana, Puerto Rico",70174,2891,2524,123,2583,776,2906,910,...,(X),,Guaynabo zona urbana,Puerto Rico,,54864915,32314987,42334670,50474655,29773154
645,1600000US7263820,"Ponce zona urbana, Puerto Rico",105979,3615,3528,581,5458,1052,5163,1123,...,(X),,Ponce zona urbana,Puerto Rico,,71326259,46036388,59085569,62667530,58704917


In [23]:
# Create new columns that combine data shared between all the city and county entries
overlapping_columns = set(city_demo_df.columns).intersection(set(county_demo_df.columns))
demo_df = pd.concat([city_demo_df[list(overlapping_columns)],county_demo_df[list(overlapping_columns)]])

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
merged_df = pd.merge(crime_df, demo_df, on=["pug_agency_name", "agency_type_name", "state_name"])

In [None]:
demo_df.head()

In [None]:
pd.options.display.max_columns = None
merged_df

In [None]:
merged_df.groupby('region_name').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
merged_df.groupby('agency_type_name').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
merged_df['total_offender_count'].plot(kind='hist', bins=20, title='total_offender_count')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
merged_df.groupby('offender_race').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
merged_df['adult_victim_count'].plot(kind='hist', bins=20, title='adult_victim_count')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
merged_df['juvenile_victim_count'].plot(kind='hist', bins=20, title='juvenile_victim_count')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
plt.subplots(figsize=(8, 8))
df_2dhist = pd.DataFrame({
    x_label: grp['offender_race'].value_counts()
    for x_label, grp in merged_df.groupby('region_name')
})
sns.heatmap(df_2dhist, cmap='viridis')
plt.xlabel('region_name')
_ = plt.ylabel('offender_race')