In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [None]:
# Load the crime/housing CSV into `crime`. The path is relative, so it is
# resolved against the notebook's working directory.
crime = pd.read_csv('crime-housing-austin-2015.csv')

In [None]:
# Report_Date is read from the CSV as strings. The dtype lookup below is a
# sanity check only; it is not the cell's last expression, so it is not shown.
crime['Report_Date'].dtype
# Parse text in day-abbreviated month-two digit year form (format '%d-%b-%y').
crime['Report_Date'] = pd.to_datetime(crime['Report_Date'], format='%d-%b-%y')

In [None]:
# Season boundaries. Only the month and day of each boundary are used by
# get_season; YEAR just gives the Timestamps a concrete year.
YEAR = 2015
spring_start = pd.Timestamp(month=3, day=21, year=YEAR)
summer_start = pd.Timestamp(month=6, day=21, year=YEAR)
fall_start = pd.Timestamp(month=9, day=23, year=YEAR)
winter_start = pd.Timestamp(month=12, day=21, year=YEAR)


def _month_day(timestamp):
    """Return (month, day) of `timestamp` so dates can be compared ignoring the year."""
    return (timestamp.month, timestamp.day)


def get_season(date):
    """Return the season name for a date.

    Parameters
    ----------
    date : pandas.Timestamp (or anything with ``month``/``day`` attributes), or NaT/NaN

    Returns
    -------
    'Spring', 'Summer', 'Fall' or 'Winter'; None when `date` is missing.

    The comparison uses month and day only. For dates in YEAR the result is the
    same as comparing against the boundary Timestamps directly, but a date from
    any other year is classified by its position in the calendar instead of
    falling through to 'Winter'. Missing dates used to be labelled 'Winter' as
    well, which inflated the winter totals; they now yield None.
    """
    if pd.isna(date):
        return None

    month_day = _month_day(date)
    if _month_day(spring_start) <= month_day < _month_day(summer_start):
        return 'Spring'
    if _month_day(summer_start) <= month_day < _month_day(fall_start):
        return 'Summer'
    if _month_day(fall_start) <= month_day < _month_day(winter_start):
        return 'Fall'
    # Everything else: from winter_start to year end and from January 1 up to spring_start.
    return 'Winter'

# Label every row with the season that get_season assigns to its Report_Date.
crime['Season'] = crime['Report_Date'].apply(get_season)

In [None]:
# Horizontal chart of crimes by season: one group of bars per offense
# description, one bar per season within each group.
season_fig, season_ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=crime, y='Highest_NIBRS_UCR_Offense_Description', hue='Season', ax=season_ax)

In [None]:
# Rows per season. The aggregated column keeps the name 'Report_Date' even
# though it now holds a row count, not a date.
season_counts = crime.groupby('Season').agg({'Report_Date':'size'}).reset_index()
display(season_counts)
# Rows per (day, season) pair, i.e. the number of crimes reported on each day.
day_counts = crime.groupby(['Report_Date', 'Season']).size().reset_index(name='count')

season_bar_ax = sns.barplot(data=season_counts, x='Season', y='Report_Date')
# Without this the y-axis would be labelled 'Report_Date', which misdescribes a count.
season_bar_ax.set_ylabel('Number of crimes')
plt.title('Total crimes by season')

# New figure so the scatter plot does not draw on top of the bar chart.
plt.figure()
sns.scatterplot(data=day_counts, x='Report_Date', y='count')
plt.title('Number of crimes each day')

In [None]:
# Kernel density estimate of crimes per day, one curve per season.
sns.displot(data=day_counts, x='count', hue='Season', kind='kde')

ANOVA assumptions:
+ Data are normally distributed: the density plots above look roughly normal, and n is large.
+ Groups have the same variance: this could be tested more rigorously, but it looks reasonable.
+ Data are independent: the scatterplot above is evidence of this.


In [None]:
import scipy.stats as stats
#Significance threshold: .05
# Null hypothesis: all groups have the same population mean
anova_seasons = ('Winter', 'Spring', 'Summer', 'Fall')
# One sample of daily crime counts per season, in the order listed above.
daily_counts_by_season = [
    day_counts.loc[day_counts['Season'] == season_name, 'count']
    for season_name in anova_seasons
]
display(stats.f_oneway(*daily_counts_by_season))


In [None]:
#We can do this since the anova gave a significant result.

# Samples are passed to tukey_hsd in this order, so row i / column j of the
# p-value matrix compares season_order[i] with season_order[j].
season_order = ['Winter', 'Spring', 'Summer', 'Fall']
season_samples = [
    day_counts.loc[day_counts['Season'] == season_name, 'count']
    for season_name in season_order
]
tukey_pvalue = stats.tukey_hsd(*season_samples).pvalue

# Label rows as well as columns; with only the columns named, the rows showed
# as 0-3 and the reader had to remember which season each number stood for.
tukey = pd.DataFrame(tukey_pvalue, index=season_order, columns=season_order)
display(tukey)
print('Winter avg crimes per day: ', 
      season_samples[season_order.index('Winter')].mean(), 
      '\nSummer avg crimes per day: ', 
      season_samples[season_order.index('Summer')].mean())

This shows that winter and summer are significantly different, but no other pair of seasons is. Why?

+ Tourists might be easy targets; or they might bring extra crime
+ It's easier to go out and do things when it's warm vs when it's cold.

## Crime Type

In [None]:
# Number of distinct 'Key' values per offense description, in column 'crime_sum'.
# presumably 'Key' identifies one incident, so this is a crime count per type — TODO confirm
crime_types = crime.groupby('Highest_NIBRS_UCR_Offense_Description')['Key'].nunique().reset_index(name='crime_sum')

In [None]:
# One bar per offense description; bar height is the 'crime_sum' column.
bar_fig, bar = plt.subplots(figsize=(8, 4))
sns.barplot(data=crime_types, x='Highest_NIBRS_UCR_Offense_Description', y='crime_sum', ax=bar)
# Re-apply the existing tick labels with an explicit horizontal rotation.
bar.set_xticklabels(bar.get_xticklabels(), rotation=0)
bar.set(
    title='Crime Type vs. Number of Crimes',
    xlabel='Crime Type',
    ylabel='Number of Crimes',
)
plt.show()

## Crimes by Season

In [None]:
def _offense_counts(season_rows):
    """Count distinct 'Key' values per offense description in `season_rows`.

    Returns a frame with the offense description and a 'crime_sum' column.
    """
    per_offense = season_rows.groupby('Highest_NIBRS_UCR_Offense_Description')
    return per_offense['Key'].nunique().reset_index(name='crime_sum')


# Rows of `crime` belonging to each season.
fall_crime = crime.loc[crime['Season'] == 'Fall']
spring_crime = crime.loc[crime['Season'] == 'Spring']
winter_crime = crime.loc[crime['Season'] == 'Winter']
summer_crime = crime.loc[crime['Season'] == 'Summer']

# Per-offense counts within each season.
crime_types_fall = _offense_counts(fall_crime)
crime_types_spring = _offense_counts(spring_crime)
crime_types_winter = _offense_counts(winter_crime)
crime_types_summer = _offense_counts(summer_crime)

## Crime and Median Household Income

In [None]:
# Convert 'Medianhouseholdincome' from text to float by removing '$', spaces
# and thousands separators.
#
# regex=False makes every pattern a literal string. '$' is the end-of-string
# anchor in a regular expression, so relying on the default `regex` mode makes
# the result depend on the installed pandas version.
#
# The dtype guard keeps the cell re-runnable: once the column is numeric the
# `.str` accessor raises AttributeError, so the stripping is skipped.
median_income = crime['Medianhouseholdincome']
if not pd.api.types.is_numeric_dtype(median_income):
    for unwanted in ('$', ' ', ','):
        median_income = median_income.str.replace(unwanted, '', regex=False)
crime['Medianhouseholdincome'] = median_income.astype('float')


In [None]:
# Summary statistics of the income column.
crime['Medianhouseholdincome'].describe()

In [None]:
def get_poor(median_income):
    """Return 1 if `median_income` is at or below 41869, otherwise 0.

    A NaN income compares False against the cutoff and therefore yields 0.
    NOTE(review): the origin of the 41869 cutoff is not shown here — confirm.
    """
    return int(median_income <= 41869)

# 1 where the row's median household income is at or below get_poor's cutoff, else 0.
crime['poor_indicator'] = crime['Medianhouseholdincome'].apply(get_poor)

In [None]:
def get_severity(type):
    """Return the severity rank (1 = lowest, 7 = highest) of an offense description.

    Any value that is not one of the seven listed descriptions yields None.
    NOTE(review): the "Years" figures look like the sentence lengths used to
    order the ranks — confirm their source.
    """
    ranked_offenses = (
        ('Theft', 1),          # 3.7 Years
        ('Auto Theft', 2),     # 4.0 Years
        ('Agg Assault', 3),    # 5.6 Years
        ('Burglary', 4),       # 5.8 Years
        ('Robbery', 5),        # 9.0 Years
        ('Rape', 6),           # 12.2 Years
        ('Murder Crimes', 7),  # 40.6 Years
    )
    for offense, rank in ranked_offenses:
        if type == offense:
            return rank
    return None

# Map each offense description to its severity rank. get_severity returns None
# for descriptions it does not list, so those rows end up with a missing value.
crime['severity_indicator'] = crime['Highest_NIBRS_UCR_Offense_Description'].apply(get_severity)

In [None]:
# Kernel density of median household income, one curve per severity rank.
sns.displot(data=crime, x='Medianhouseholdincome', kind='kde', hue='severity_indicator')

Distributions are far from normal and the sample sizes are not close to being even.

## Median Income by Number of Crimes

In [None]:
# Zip-code table; the following cells read its 'Zip Code' and 'Population' columns.
pop_density = pd.read_csv('AustinZipCodes.csv')

In [None]:
# Number of crime rows for every (median household income, crime zip code) pair.
zip_crime_counts = crime.groupby(['Medianhouseholdincome', 'Zip_Code_Crime']).size().reset_index(name='count')


In [None]:
# Attach the per-zip crime counts to the zip-code table; the inner join keeps
# only zip codes present in both frames.
zip_density = pop_density.merge(
    zip_crime_counts,
    how='inner',
    left_on='Zip Code',
    right_on='Zip_Code_Crime',
)
# Population is text with thousands separators; remove the commas and convert to float.
population_text = zip_density['Population']
zip_density['Population'] = population_text.str.replace(',', '').astype(float)

In [None]:
# Crimes divided by residents for each row of zip_density.
zip_density['crime_per_population'] = zip_density['count'] / zip_density['Population'].astype(int)

# Rows with a rate of .5 or more are treated as outliers and left out of the second plot.
below_outlier_cutoff = zip_density['crime_per_population'] < .5
zip_density_no_outlier = zip_density.loc[below_outlier_cutoff]

all_zips_ax = sns.scatterplot(x='Medianhouseholdincome', y='crime_per_population', data=zip_density)
all_zips_ax.set_title('Scatter Plot of Median Household Income by Crime Per Population')

# Separate figure for the filtered data.
plt.figure()
plt.title('Scatter Plot of Median Household Income by Crime Per Population (no outlier)')
sns.scatterplot(x='Medianhouseholdincome', y='crime_per_population', data=zip_density_no_outlier)

In [None]:
# Pearson correlation between median household income and crime per population,
# computed on the frame with the outlier removed.
stats.pearsonr(zip_density_no_outlier['Medianhouseholdincome'], zip_density_no_outlier['crime_per_population'])

In [None]:
# Column names, dtypes and non-null counts, used to find the affordability columns below.
crime.info()

# Is there a relationship between affordable housing and different types of crime?

In [None]:
# Give the two affordability columns short names.
crime = crime.rename(
    columns={'Homesaffordabletopeopleearninglessthan$50000':'affordability_50k_pct',
             'Rentalsaffordabletopeopleearninglessthan$25000': 'rent_affordability_25k_pct'}
)

# Strip the '%' sign and convert to float. Columns that are already numeric are
# skipped: `.str` raises AttributeError on a float column, which previously made
# this cell fail when it was run a second time.
for pct_column in ('affordability_50k_pct', 'rent_affordability_25k_pct'):
    if not pd.api.types.is_numeric_dtype(crime[pct_column]):
        crime[pct_column] = crime[pct_column].str.replace('%', '', regex=False).astype(float)

In [None]:
# Distinct combinations of crime zip code, housing zip code and the two
# affordability percentages.
affordability_columns = [
    'Zip_Code_Crime',
    'Zip_Code_Housing',
    'affordability_50k_pct',
    'rent_affordability_25k_pct',
]
zip_affordable = crime.loc[:, affordability_columns].drop_duplicates()
zip_affordable.head(15)

Dropping duplicates here looks at all columns by default so we aren't losing any valuable information.

It looks like the housing and crime zip codes are the same in every row where neither is NaN. Let's keep only the Zip_Code_Crime column and drop Zip_Code_Housing, since the former seems to be more complete, and then drop the remaining NaN rows.

In [None]:
# Keep only the crime zip code, then discard rows that still contain NaN.
# NOTE(review): `zip_afforable` is a misspelling of "affordable", but a later
# cell reads the frame under this exact name, so a rename must change both places.
zip_afforable = zip_affordable.drop(columns=['Zip_Code_Housing']).dropna()
zip_afforable.head(10)

In [None]:
# Number of crime rows for every (zip code, offense description) pair.
crime_counts = (
    crime
    .groupby(['Zip_Code_Crime', 'Highest_NIBRS_UCR_Offense_Description'])
    .size()
    .reset_index(name='counts')
)

# Add the affordability percentages of each zip code to its crime counts.
zip_counts = pd.merge(zip_afforable, crime_counts, on="Zip_Code_Crime")
zip_counts

In [None]:
# We need to account for population: load zip code and population only,
# ordered by zip code with a fresh 0..n-1 index.
pops = (
    pd.read_csv('AustinZipCodes.csv')
    .loc[:, ['Zip Code', 'Population']]
    .sort_values(by='Zip Code')
    .reset_index(drop=True)
)
pops.head(10)

In [None]:
# Join populations onto the per-zip crime counts. 'Zip Code' from `pops` stays
# as the zip column; the duplicate 'Zip_Code_Crime' key is dropped.
zip_crime_p1000 = (
    zip_counts
    .merge(right=pops, left_on='Zip_Code_Crime', right_on='Zip Code')
    .drop(columns=['Zip_Code_Crime'])
)

# Population is text with thousands separators; convert it to integers.
residents = zip_crime_p1000['Population'].str.replace(',', '').astype(int)
zip_crime_p1000['Population'] = residents
# Crime count per 1000 people
zip_crime_p1000['count_p1000'] = zip_crime_p1000['counts'] / residents * 1000
zip_crime_p1000

In [None]:
# Sum the per-offense rates so that each (zip code, affordability) combination
# carries one total crime rate per 1000 people.
zip_level_keys = ['Zip Code', 'affordability_50k_pct', 'rent_affordability_25k_pct']
total_crime_p1000 = (
    zip_crime_p1000
    .groupby(zip_level_keys)['count_p1000']
    .sum()
    .reset_index()
)

total_housing_ax = sns.scatterplot(x='affordability_50k_pct', y='count_p1000', data=total_crime_p1000)
total_housing_ax.set_title("Total crime counts per 1000 people vs percent\naffordable housing in each zip code")

# Pearson correlation between affordability and the total crime rate.
pearsonr(total_crime_p1000['affordability_50k_pct'], total_crime_p1000['count_p1000'])

In [None]:
# Show the outlier: rows whose total crime rate exceeds 400 per 1000 people.
display(total_crime_p1000[total_crime_p1000['count_p1000'] > 400])

# Get rid of the outlier
total_crime_no_outlier = total_crime_p1000[total_crime_p1000['count_p1000'] < 400]
sns.scatterplot(data=total_crime_no_outlier, x='affordability_50k_pct', y='count_p1000')
# "(no outlier)" distinguishes this figure from the unfiltered plot, which
# otherwise carries an identical title.
plt.title("Total crime counts per 1000 people vs percent\naffordable housing in each zip code (no outlier)")

# Pearson correlation on the filtered data.
pearsonr(total_crime_no_outlier['affordability_50k_pct'], total_crime_no_outlier['count_p1000'])

## Different types of crime vs affordable housing

### Burglary

In [None]:
# Burglary rows only: burglaries per 1000 people alongside housing affordability.
is_burglary = zip_crime_p1000['Highest_NIBRS_UCR_Offense_Description'] == 'Burglary'
burglary_p1000_housing = zip_crime_p1000.loc[is_burglary]

burglary_housing_ax = sns.scatterplot(x='affordability_50k_pct', y='count_p1000', data=burglary_p1000_housing)
burglary_housing_ax.set_title("Burglary counts per 1000 people vs percent\naffordable housing in each zip code")

# Pearson correlation between affordability and the burglary rate.
pearsonr(burglary_p1000_housing['affordability_50k_pct'], burglary_p1000_housing['count_p1000'])

In [None]:
# Show the outliers: rows with more than 20 burglaries per 1000 people.
burglary_rate = burglary_p1000_housing['count_p1000']
display(burglary_p1000_housing[burglary_rate > 20])

# Keep only the rows below that level and plot again.
burglary_no_outliers = burglary_p1000_housing[burglary_rate < 20]
burglary_filtered_ax = sns.scatterplot(x='affordability_50k_pct', y='count_p1000', data=burglary_no_outliers)
burglary_filtered_ax.set_title("Burglary counts per 1000 people vs percent\naffordable housing in each zip code (no outlier)")

# Pearson correlation on the filtered data.
pearsonr(burglary_no_outliers['affordability_50k_pct'], burglary_no_outliers['count_p1000'])

### Theft

In [None]:
# Theft rows only: thefts per 1000 people alongside housing affordability.
is_theft = zip_crime_p1000['Highest_NIBRS_UCR_Offense_Description'] == 'Theft'
theft_p1000_housing = zip_crime_p1000.loc[is_theft]

theft_housing_ax = sns.scatterplot(x='affordability_50k_pct', y='count_p1000', data=theft_p1000_housing)
theft_housing_ax.set_title("Theft counts per 1000 people vs percent\naffordable housing in each zip code")

# Pearson correlation between affordability and the theft rate.
pearsonr(theft_p1000_housing['affordability_50k_pct'], theft_p1000_housing['count_p1000'])

In [None]:
# Show the outlier: rows with more than 100 thefts per 1000 people.
theft_rate = theft_p1000_housing['count_p1000']
display(theft_p1000_housing[theft_rate > 100])

# Keep only the rows below that level and plot again.
theft_no_outliers = theft_p1000_housing[theft_rate < 100]
theft_filtered_ax = sns.scatterplot(x='affordability_50k_pct', y='count_p1000', data=theft_no_outliers)
theft_filtered_ax.set_title("Theft counts per 1000 people vs percent\naffordable housing in each zip code (no outlier)")

# Pearson correlation on the filtered data.
pearsonr(theft_no_outliers['affordability_50k_pct'], theft_no_outliers['count_p1000'])

# What about rental housing?

In [None]:
# Total crime rate against the share of affordable rentals.
total_rental_ax = sns.scatterplot(x='rent_affordability_25k_pct', y='count_p1000', data=total_crime_p1000)
total_rental_ax.set_title("Total crime counts per 1000 people vs percent\naffordable rental housing in each zip code")

# Pearson correlation
rental_share = total_crime_p1000['rent_affordability_25k_pct']
total_rate = total_crime_p1000['count_p1000']
pearsonr(rental_share, total_rate)

In [None]:
# Show the outlier: rows whose total crime rate exceeds 400 per 1000 people.
display(total_crime_p1000[total_crime_p1000['count_p1000'] > 400])

# Get rid of the outlier
total_crime_no_outlier = total_crime_p1000[total_crime_p1000['count_p1000'] < 400]
sns.scatterplot(data=total_crime_no_outlier, x='rent_affordability_25k_pct', y='count_p1000')
# The plotted value is count_p1000, so the title says "per 1000 people"
# (it used to read "per capita", which is off by a factor of 1000).
plt.title("Total crime counts per 1000 people vs percent\naffordable rental housing in each zip code (no outlier)")

# Pearson correlation
pearsonr(total_crime_no_outlier['rent_affordability_25k_pct'], total_crime_no_outlier['count_p1000'])

### Burglary

In [None]:
# Get burglaries per 1000 people against rental affordability
burglary_p1000_rental = zip_crime_p1000[zip_crime_p1000['Highest_NIBRS_UCR_Offense_Description'] == 'Burglary']

sns.scatterplot(data=burglary_p1000_rental, x='rent_affordability_25k_pct', y='count_p1000')
# The x-axis is the rental affordability column, so the title names rental
# housing (it previously repeated the title of the home-affordability plot).
plt.title("Burglary counts per 1000 people vs percent\naffordable rental housing in each zip code")

# do a pearson correlation
pearsonr(burglary_p1000_rental['rent_affordability_25k_pct'], burglary_p1000_rental['count_p1000'])

In [None]:
# Show the outliers: rows with more than 15 burglaries per 1000 people.
rental_burglary_rate = burglary_p1000_rental['count_p1000']
display(burglary_p1000_rental[rental_burglary_rate > 15])

# Keep only the rows below that level and plot again.
burglary_no_outliers = burglary_p1000_rental[rental_burglary_rate < 15]
burglary_rental_ax = sns.scatterplot(x='rent_affordability_25k_pct', y='count_p1000', data=burglary_no_outliers)
burglary_rental_ax.set_title("Burglary counts per 1000 people vs percent\naffordable rental housing in each zip code(no outliers)")

# Pearson correlation on the filtered data.
pearsonr(burglary_no_outliers['rent_affordability_25k_pct'], burglary_no_outliers['count_p1000'])

### Theft

In [None]:
# Plot thefts per 1000 people against rental affordability
theft_p1000_rental = zip_crime_p1000[zip_crime_p1000['Highest_NIBRS_UCR_Offense_Description'] == 'Theft']

sns.scatterplot(data=theft_p1000_rental, x='rent_affordability_25k_pct', y='count_p1000')
# The x-axis is the rental affordability column, so the title names rental
# housing (it previously repeated the title of the home-affordability plot).
plt.title("Theft counts per 1000 people vs percent\naffordable rental housing in each zip code")

# do a pearson correlation
pearsonr(theft_p1000_rental['rent_affordability_25k_pct'], theft_p1000_rental['count_p1000'])

In [None]:
# Note the outliers above 100 thefts per 1000 people
display(theft_p1000_rental[theft_p1000_rental['count_p1000'] > 100])

# Remove the outliers (keep rows below 100) and plot again
theft_no_outliers = theft_p1000_rental[theft_p1000_rental['count_p1000'] < 100]
sns.scatterplot(data=theft_no_outliers, x='rent_affordability_25k_pct', y='count_p1000')
plt.title("Theft counts per 1000 people vs percent\naffordable rental housing in each zip code (no outlier)")

# do a pearson correlation on the filtered data
pearsonr(theft_no_outliers['rent_affordability_25k_pct'], theft_no_outliers['count_p1000'])

# Cool extra thing

In [None]:
# Plot crimes by x and y coordinates. Small, mostly transparent markers let
# areas with many overlapping points show up darker.
map_fig, map_ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=crime, x='X_Coordinate', y='Y_Coordinate', s=5, alpha=0.1, ax=map_ax)