# Crime and Housing

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Read the two inputs: the 2015 crime/housing records and the per-Zip-Code population table
crime = pd.read_csv('crime-housing-austin-2015.csv')
population = pd.read_csv('AustinZipCodes.csv')

# These columns arrive as comma-formatted text; strip the commas and cast to float
for comma_col in ('Population', 'People / Sq. Mile'):
    population[comma_col] = population[comma_col].str.replace(",", "").astype('float')

# 'Location' is one text field; split it on the first comma into latitude and longitude
lat_lon = population['Location'].str.split(',', n=1, expand=True)
population[['Latitude', 'Longitude']] = lat_lon.astype('float')

# Remove rows where no Zip Code is given and convert the Zip Codes to integers.
# Done as one chained expression so the cast is not a column assignment into the
# frame returned by dropna (a pattern that can raise SettingWithCopyWarning);
# `subset` is passed as a list, the form every pandas version accepts.
crime = (
    crime
    .dropna(subset=['Zip_Code_Crime'])
    .astype({'Zip_Code_Crime': np.int64})
)

# Per Zip Code: count the crime records (via the 'Key' column) and carry along
# the 'first' value of each income/housing column
per_zip_aggregations = {
    'Key': 'count',
    'Medianhouseholdincome': 'first',
    'Medianrent': 'first',
    'Medianhomevalue': 'first',
}
crime_by_zip = crime.groupby('Zip_Code_Crime', as_index=False)
crime_counts = crime_by_zip.agg(per_zip_aggregations).rename(columns={'Key': 'Count'})

# Attach population figures to the per-Zip-Code crime counts; the inner join
# keeps only Zip Codes that appear in both tables
crime_rate = crime_counts.merge(
    population,
    how='inner',
    left_on="Zip_Code_Crime",
    right_on='Zip Code',
)

# Crimes per 100,000 residents
crime_rate['Crime_Rate'] = crime_rate['Count'] / crime_rate['Population'] * 100000

# Clean the dollar-formatted columns and set them to float data types.
# regex=False makes pandas remove the literal "$" character; read as a regular
# expression, "$" is the end-of-string anchor, so without the flag the result
# depends on the installed pandas version's default for `regex`.
for dollar_col in ('Medianhomevalue', 'Medianrent', 'Medianhouseholdincome'):
    crime_rate[dollar_col] = (
        crime_rate[dollar_col]
        .str.replace("$", "", regex=False)
        .astype('float')
    )
crime_rate.head(10)

# Overall Crime Rate Correlated with Financial Health

# T-Test Comparisons of Low vs High Income Zip Codes

To divide Zip Codes into lower or higher income areas, we used the Department of Housing and Urban Development's (HUD) definition of lower income, which is below 80% of the median income for the metropolitan area. The median household income for Austin, TX in 2015 was \\$57,689, which sets our threshold at \\$46,151.20. The median household income of each Zip Code was then classified as either above or below this threshold.

In [None]:
# HUD lower-income cutoff: 80% of Austin's 2015 median household income ($57,689)
HUD_LOW_INCOME_THRESHOLD = 46151.2

median_income = crime_rate['Medianhouseholdincome']
income_label = pd.Series(
    np.where(median_income > HUD_LOW_INCOME_THRESHOLD, 'above', 'below'),
    index=crime_rate.index,
)
# A missing income is not "below" the threshold: NaN > x evaluates to False, so
# without this mask such Zip Codes would silently be labelled lower income.
# They are left as missing instead.
crime_rate['relativeIncome'] = income_label.where(median_income.notna())
crime_rate.value_counts('relativeIncome')

## Where are the above- and below-threshold income Zip Codes located?

In [None]:
# Plot each Zip Code at its coordinates, coloured by income group.
# The title and legend title follow the crime-rate density plot so the figure
# can be read on its own; the trailing semicolon hides the text repr.
income_map = sns.scatterplot(data=crime_rate, x='Longitude', y='Latitude', hue='relativeIncome')
income_map.set(title='Zip Code Locations by Median Household Income')
income_map.get_legend().set_title('Above/Below HUD Threshold');

In [None]:
# Density of crime rate for each income group.
# NOTE(review): Zip Code 78701 is left out of this plot, presumably as an outlier; confirm the reason.
zip_kept = crime_rate['Zip_Code_Crime'] != 78701
relIncome_crime_dis = sns.kdeplot(
    data=crime_rate[zip_kept],
    x='Crime_Rate',
    hue='relativeIncome',
)
relIncome_crime_dis.set_title('Distribution of Crime Rate by Median Household Income')
relIncome_crime_dis.set_xlabel('Crime Rate')
relIncome_crime_dis.get_legend().set_title('Above/Below HUD Threshold')

# Write the figure to disk with a tight bounding box
plt.savefig("relativeIncome.png", bbox_inches='tight')

In [None]:
# Row selectors shared by both samples.
# NOTE(review): Zip Code 78701 is excluded from the comparison, presumably as an outlier; confirm the reason.
zip_kept = crime_rate['Zip_Code_Crime'] != 78701
is_above = crime_rate['relativeIncome'] == 'above'
is_below = crime_rate['relativeIncome'] == 'below'

# Crime rates of the Zip Codes above and below the income threshold
high_income = crime_rate.loc[zip_kept & is_above, 'Crime_Rate']
low_income = crime_rate.loc[zip_kept & is_below, 'Crime_Rate']

# Independent two-sample t-test on the two groups' crime rates
stats.ttest_ind(high_income, low_income)

We have some evidence that the crime rate for Zip Codes with lower incomes is higher than for Zip Codes with higher incomes.

## How big is the difference?

In [None]:
# Summary statistics of crime rate within each income group (Zip Code 78701 excluded)
(
    crime_rate[crime_rate['Zip_Code_Crime'] != 78701]
    .groupby('relativeIncome')
    .agg({'Crime_Rate': ['min', 'max', 'mean', 'median', 'std']})
)

For 2015, the difference measured is about 1808 crimes per 100,000 people.