# Crime and Housing

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# load data sets
crime = pd.read_csv('crime-housing-austin-2015.csv')
population = pd.read_csv('AustinZipCodes.csv')

# strip thousands separators so the population figures can be cast to float
population['Population'] = population['Population'].str.replace(",", "", regex=False).astype('float')
population['People / Sq. Mile'] = population['People / Sq. Mile'].str.replace(",", "", regex=False).astype('float')
# 'Location' is split on its first comma into two float columns
population[['Latitude', 'Longitude']] = population['Location'].str.split(',', n=1, expand=True).astype('float')

# remove rows where no Zip Code is given and convert to integers
# (chained so the cast happens on the new frame instead of assigning a column
# onto the result of dropna)
crime = (
    crime
    .dropna(subset=['Zip_Code_Crime'])
    .astype({'Zip_Code_Crime': np.int64})
)

# count the number of crimes per zip code; the housing metrics are taken from
# the first non-null value found within each zip code group
crime_counts = (
    crime
    .groupby('Zip_Code_Crime', as_index=False)
    .agg({
        'Key': 'count',
        'Medianhouseholdincome': 'first',
        'Medianrent': 'first',
        'Medianhomevalue': 'first',
    })
    .rename(columns={'Key': 'Count'})
)

# merge crime and population with demographics
# (inner join: only zip codes present in both tables are kept)
crime_rate = pd.merge(crime_counts, population, left_on="Zip_Code_Crime", right_on='Zip Code', how='inner')
# crimes reported per 100,000 residents
crime_rate['Crime_Rate'] = crime_rate.Count / crime_rate.Population * 100000

# clean currency columns and set to float data types
# regex=False is required: interpreted as a regular expression, "$" is the
# end-of-string anchor and would strip nothing, and the default for `regex`
# differs between pandas versions
for money_col in ('Medianhomevalue', 'Medianrent', 'Medianhouseholdincome'):
    crime_rate[money_col] = crime_rate[money_col].str.replace("$", "", regex=False).astype('float')

crime_rate.head(10)

# Distribution of Crime Rate

Here is a brief visual overview of our data.

In [None]:
# Histogram of the per-ZIP-code crime rate (outlier still included)
hist_crimeRate = sns.histplot(x="Crime_Rate", data=crime_rate)
hist_crimeRate.set(
    title="Histogram of Zip Code's Crime Rate",
    xlabel="Crimes Reported Per 100,000 People",
    ylabel="Number of Zip Codes",
)
# uncomment to export the figure:
# hist_crimeRate.get_figure().savefig('histogram_crimeRate.png')

In [None]:
# Boxplot of the per-ZIP-code crime rate (outlier still included)
bp_crimeRate = sns.boxplot(x="Crime_Rate", data=crime_rate)
bp_crimeRate.set(
    title="Boxplot of Zip Code's Crime Rate",
    xlabel="Crimes Reported Per 100,000 People",
)
# uncomment to export the figure:
# bp_crimeRate.get_figure().savefig('bp_crimeRate.png')

In [None]:
# One scatter panel per financial metric, all sharing the crime-rate y-axis.
# Each entry: (column in crime_rate, x-axis label, artist label)
financial_metrics = [
    ('Medianhouseholdincome', 'Median Household Income', 'Crime Rate vs. Median Household Income'),
    ('Medianrent', 'Median Rent', 'Crime Rate vs. Median Rent'),
    ('Medianhomevalue', 'Median Home Value', 'Crime Rate vs. Median Home Value'),
]

# Draw on the Axes that plt.subplots returns instead of re-requesting each
# panel through the pyplot state machine
fig, axes = plt.subplots(1, len(financial_metrics), figsize=(12, 2), sharey=True)

# y-axis is shared, so only the left-most panel carries the label
axes[0].set_ylabel('Crime Rate')

for ax, (column, x_label, artist_label) in zip(axes, financial_metrics):
    ax.scatter(x=crime_rate[column], y=crime_rate['Crime_Rate'], label=artist_label)
    ax.set_xlabel(x_label)

In [None]:
# Rank ZIP codes from highest to lowest crime rate and show the first five
(
    crime_rate
    .sort_values(by='Crime_Rate', ascending=False)
    .head(n=5)
)

Zip Code 78701 is the very center of the city and is a major outlier in the normalized crime rates as seen by the boxplot and histogram. Several reasons could be contributing to this Zip Code's very high crime rate, including:
* Relatively lower population in the city center
* Higher percentage of businesses and shops as locations for theft and other crime
* Higher presence of tourists, which can make easy targets for theft and other crimes
* Possible presence of gangs or other organized crime

Due to the nature of this Zip Code and its outlying crime rate, we decided to exclude it from further analysis and from the trends reported below.

In [None]:
# remove ZIP Code 78701 as outlier
# .copy() makes the filtered result an independent frame, so the column that a
# later cell adds to crime_rate does not trigger pandas' SettingWithCopyWarning
crime_rate = crime_rate.loc[crime_rate.Zip_Code_Crime != 78701, :].copy()

# Distribution after removing the outlier

In [None]:
# Same histogram as before, now drawn from the frame without ZIP code 78701
hist_crimeRate_dropoutlier = sns.histplot(x="Crime_Rate", data=crime_rate)
hist_crimeRate_dropoutlier.set(
    title="Histogram of Zip Code's Crime Rate (Excluding 78701)",
    xlabel="Crimes Reported Per 100,000 People",
    ylabel="Number of Zip Codes",
)
# uncomment to export the figure:
# hist_crimeRate_dropoutlier.get_figure().savefig('histogram_crimeRate_dropoutlier.png')

In [None]:
# Same boxplot as before, now drawn from the frame without ZIP code 78701
bp_crimeRate_dropoutlier = sns.boxplot(x="Crime_Rate", data=crime_rate)
bp_crimeRate_dropoutlier.set(
    title="Boxplot of Zip Code's Crime Rate (Excluding 78701)",
    xlabel="Crimes Reported Per 100,000 People",
)
# uncomment to export the figure:
# bp_crimeRate_dropoutlier.get_figure().savefig('bp_crimeRate_dropoutlier.png')

# Overall Crime Rate Correlated with Financial Health

We decided to compare Crime Rate to three financial well-being metrics:
* Median Household Income
* Median Rent
* Median Home Value

Each metric was plotted against Crime Rate with a linear regression line, and we calculated the Pearson r correlation coefficient and its p-value to determine the strength of the relationship.

In [None]:
# keep only the ZIP codes that report all three financial health metrics
# (a row is dropped if any one of the listed columns is NaN)
crime_rate_dropna = crime_rate.dropna(
    subset=['Medianhouseholdincome', 'Medianrent', 'Medianhomevalue']
)
crime_rate_dropna.head()

In [None]:
# these are the ZIP codes that were dropped, in case you're curious:
# rows missing at least one of the three financial metrics. The check is limited
# to those columns so a null in any other column does not list a ZIP code here.
crime_rate[crime_rate[['Medianhouseholdincome', 'Medianrent', 'Medianhomevalue']].isnull().any(axis=1)]

## Crime Rate vs. Median Household Income

In [None]:
# Scatter with fitted regression line, then Pearson r and its p-value
crime_vs_income = sns.regplot(x='Medianhouseholdincome', y='Crime_Rate', data=crime_rate_dropna)
crime_vs_income.set(
    title='Crime Rate vs. Median Household Income',
    xlabel='Median Household Income',
    ylabel='Crime Rate (per 100,000 people)',
)
stats.pearsonr(crime_rate_dropna['Medianhouseholdincome'], crime_rate_dropna['Crime_Rate'])

## Crime Rate vs. Median Rent

In [None]:
# Scatter with fitted regression line, then Pearson r and its p-value
crime_vs_rent = sns.regplot(x='Medianrent', y='Crime_Rate', data=crime_rate_dropna)
crime_vs_rent.set(
    title='Crime Rate vs. Median Rent',
    xlabel='Median Rent',
    ylabel='Crime Rate (per 100,000 people)',
)
stats.pearsonr(crime_rate_dropna['Medianrent'], crime_rate_dropna['Crime_Rate'])

## Crime Rate vs. Median Home Value

In [None]:
# Scatter with fitted regression line, then Pearson r and its p-value
crime_vs_home_value = sns.regplot(x='Medianhomevalue', y='Crime_Rate', data=crime_rate_dropna)
crime_vs_home_value.set(
    title='Crime Rate vs. Median Home Value',
    xlabel='Median Home Value',
    ylabel='Crime Rate (per 100,000 people)',
)
stats.pearsonr(crime_rate_dropna['Medianhomevalue'], crime_rate_dropna['Crime_Rate'])

# T-Test Comparisons of Low vs High Income Zip Codes

To divide Zip Codes into lower or higher income areas, we used the Department of Housing and Urban Development's (HUD) definition of lower income, which is below 80% of the median income for the metropolitan area. The median household income for Austin, TX in 2015 was \\$57,689, which sets our threshold at \\$46,151.20. Each Zip Code was then classified as above or below this threshold according to its median household income.

In [None]:
# HUD "lower income" cutoff: 80% of Austin's 2015 median household income ($57,689)
HUD_LOW_INCOME_THRESHOLD = 46151.2

zip_income = crime_rate['Medianhouseholdincome']

# NaN > threshold evaluates to False, so a bare np.where would file ZIP codes
# that have no income data under 'below'. .where(notna) leaves those rows
# unlabeled (NaN) so they stay out of both income groups.
relative_income = pd.Series(
    np.where(zip_income > HUD_LOW_INCOME_THRESHOLD, 'above', 'below'),
    index=crime_rate.index,
).where(zip_income.notna())

# assign() returns a new frame instead of writing a column into a filtered one
crime_rate = crime_rate.assign(relativeIncome=relative_income)

# dropna=False also reports how many ZIP codes could not be classified
crime_rate['relativeIncome'].value_counts(dropna=False)

## Where are the approximate locations of Zip Codes above or below the HUD threshold?

In [None]:
# Plot each ZIP code at its longitude/latitude, colored by income group
geoLoc_crimeRate = sns.scatterplot(
    x='Longitude',
    y='Latitude',
    hue='relativeIncome',
    data=crime_rate,
)
geoLoc_crimeRate.get_legend().set_title('Above/Below HUD Threshold')
# uncomment to export the figure:
# geoLoc_crimeRate.get_figure().savefig('geoLocation_zips.png')

## Distribution of Crime Rate by Relation to HUD Threshold

In [None]:
# Kernel density estimate of crime rate, one curve per income group
relIncome_crime_dis = sns.kdeplot(x='Crime_Rate', hue='relativeIncome', data=crime_rate)
relIncome_crime_dis.set(
    title='Distribution of Crime Rate by Median Household Income',
    xlabel='Crime Rate',
)
relIncome_crime_dis.get_legend().set_title('Above/Below HUD Threshold')
# uncomment to export the figure:
# relIncome_crime_dis.get_figure().savefig("relativeIncome.png", bbox_inches='tight')

## Is the difference statistically significant?

In [None]:
# Crime rates of the two income groups, compared with a two-sample t-test
# (scipy's ttest_ind with its default settings)
high_income = crime_rate.loc[crime_rate['relativeIncome'] == 'above', 'Crime_Rate']
low_income = crime_rate.loc[crime_rate['relativeIncome'] == 'below', 'Crime_Rate']
stats.ttest_ind(high_income, low_income)

We have some evidence that the crime rate for Zip Codes with lower incomes is higher than for Zip Codes with higher incomes.

## How big is the difference?

In [None]:
# Summary statistics of crime rate for each income group
avg_crimeRate_income = (
    crime_rate
    .groupby('relativeIncome')[['Crime_Rate']]
    .agg(['min', 'max', 'mean', 'median', 'std'])
)
# Boxplot of the same split, shown together with the summary table
sns.boxplot(x='Crime_Rate', y='relativeIncome', data=crime_rate)
display(avg_crimeRate_income)

In 2015, the lower income Zip Codes averaged about 1808 more crimes per 100,000 people than the higher income Zip Codes.