In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import seaborn as sns

In [None]:
# Load the Austin 2015 crime/housing records; the path is relative to the
# notebook's working directory.
crime = pd.read_csv(filepath_or_buffer='crime-housing-austin-2015.csv')

In [None]:
# Report_Date is read from the CSV as strings in day-abbreviated month-two
# digit year form (format '%d-%b-%y'); parse it into datetimes so the dates
# can be compared and grouped.
crime['Report_Date'] = pd.to_datetime(crime['Report_Date'], format='%d-%b-%y')

In [None]:
# First day of each season within the single year covered by the analysis.
YEAR = 2015
spring_start = pd.Timestamp(year=YEAR, month=3, day=21)
summer_start = pd.Timestamp(year=YEAR, month=6, day=21)
fall_start = pd.Timestamp(year=YEAR, month=9, day=23)
winter_start = pd.Timestamp(year=YEAR, month=12, day=21)


def get_season(date):
    """Return the season name for a date, using the YEAR season boundaries.

    Parameters
    ----------
    date : pandas.Timestamp (or anything comparable with one)
        The date to classify.

    Returns
    -------
    str or None
        'Spring', 'Summer' or 'Fall' when the date lies in the matching
        half-open interval [season start, next season start); 'Winter' for
        every other date (before spring_start or on/after winter_start);
        None when the date is missing (NaT/NaN).
    """
    # A missing date fails every comparison below and would otherwise be
    # counted as 'Winter', inflating that season's totals.
    if pd.isna(date):
        return None
    if spring_start <= date < summer_start:
        return 'Spring'
    if summer_start <= date < fall_start:
        return 'Summer'
    if fall_start <= date < winter_start:
        return 'Fall'
    # Everything else: the start of the year up to spring_start, and from
    # winter_start onwards.
    return 'Winter'

# Tag every record with the season its report date falls in.
crime['Season'] = crime['Report_Date'].map(get_season)

In [None]:
# Total number of crime records per season. The count column is called
# 'Report_Date' because the group sizes are aggregated under that key.
season_counts = crime.groupby('Season').agg({'Report_Date': 'size'}).reset_index()
display(season_counts)

# Number of crime records per calendar day, tagged with that day's season.
day_counts = crime.groupby(['Report_Date', 'Season']).size().reset_index(name='count')

fig_season, ax_season = plt.subplots()
sns.barplot(data=season_counts, x='Season', y='Report_Date', ax=ax_season)
# The plotted column holds counts, not dates, so label the axis accordingly.
ax_season.set(ylabel='Number of crimes', title='Total crimes by season')

fig_daily, ax_daily = plt.subplots()
sns.scatterplot(data=day_counts, x='Report_Date', y='count', ax=ax_daily)
ax_daily.set(xlabel='Report date', ylabel='Number of crimes',
             title='Number of crimes each day');

In [None]:
# Kernel density estimate of the daily crime counts, one curve per season,
# used to eyeball the shape and spread of each group before the ANOVA.
sns.displot(data=day_counts, x='count', kind='kde', hue='Season')

ANOVA assumptions:
+ Data are normally distributed: the density plots above look approximately normal, and n is large.
+ Data have the same variance in every group: this could be tested more formally, but it is probably fine.
+ Data are independent: the scatterplot above is evidence of this.


In [None]:
# One-way ANOVA on the daily crime counts, with one sample per season.
# Significance threshold: .05
# Null hypothesis: all groups have the same population mean
season_samples = [
    day_counts.loc[day_counts['Season'] == season, 'count']
    for season in ['Winter', 'Spring', 'Summer', 'Fall']
]
display(stats.f_oneway(*season_samples))


In [None]:
# We can do this since the anova gave a significant result.
# Tukey's HSD compares every pair of seasons; .pvalue is a square matrix
# with one row and one column per sample, in the order the samples are passed.
season_labels = ['Winter', 'Spring', 'Summer', 'Fall']
tukey_pvalue = stats.tukey_hsd(
    *(day_counts.loc[day_counts['Season'] == label, 'count'] for label in season_labels)
).pvalue
# Label the rows as well as the columns so each cell reads as
# (row season vs. column season) instead of (0..3 vs. column season).
tukey = pd.DataFrame(tukey_pvalue, index=season_labels, columns=season_labels)
display(tukey)
print('Winter avg crimes per day: ',
      day_counts.loc[day_counts['Season'] == 'Winter', 'count'].mean(),
      '\nSummer avg crimes per day: ',
      day_counts.loc[day_counts['Season'] == 'Summer', 'count'].mean())

This shows that only winter and summer differ significantly; none of the other pairs of seasons do. Why?

+ Tourists might be easy targets; or they might bring extra crime
+ It's easier to go out and do things when it's warm vs when it's cold.

## Crime Type

In [None]:
# Number of distinct 'Key' values recorded under each offense description.
crime_types = (
    crime
    .groupby('Highest_NIBRS_UCR_Offense_Description')['Key']
    .nunique()
    .reset_index(name='crime_sum')
)

In [None]:
# Bar chart of the per-offense counts held in crime_types.
plt.figure(figsize=(8, 4))
bar = sns.barplot(
    data=crime_types,
    x='Highest_NIBRS_UCR_Offense_Description',
    y='crime_sum',
)
# Keep the category labels horizontal.
bar.set_xticklabels(bar.get_xticklabels(), rotation=0)
bar.set_xlabel('Crimes')
bar.set_ylabel('Number of Crimes')
bar.set_title('Crimes by Number of Crimes')
plt.show()

## Crimes by Season

In [None]:
def count_crime_types(records):
    """Count distinct 'Key' values per offense description.

    Parameters
    ----------
    records : pandas.DataFrame
        Crime records with 'Highest_NIBRS_UCR_Offense_Description' and 'Key'
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per offense description, with the count in 'crime_sum'.
    """
    return (
        records
        .groupby('Highest_NIBRS_UCR_Offense_Description')['Key']
        .nunique()
        .reset_index(name='crime_sum')
    )


# Split the records by season and compute the same per-offense breakdown
# for each subset.
fall_crime = crime[crime['Season'] == 'Fall']
crime_types_fall = count_crime_types(fall_crime)

spring_crime = crime[crime['Season'] == 'Spring']
crime_types_spring = count_crime_types(spring_crime)

winter_crime = crime[crime['Season'] == 'Winter']
crime_types_winter = count_crime_types(winter_crime)

summer_crime = crime[crime['Season'] == 'Summer']
crime_types_summer = count_crime_types(summer_crime)

## Crime and Median Household Income

In [None]:
# Medianhouseholdincome arrives as text containing '$', spaces and thousands
# separators; strip those characters and convert the column to float.
# The dtype guard makes the cell safe to re-run: once the column is numeric
# the `.str` accessor would raise, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(crime['Medianhouseholdincome']):
    crime['Medianhouseholdincome'] = (
        crime['Medianhouseholdincome']
        # regex=False: '$' must be removed as a literal character, not
        # interpreted as the end-of-string anchor.
        .str.replace('$', '', regex=False)
        .str.replace(' ', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype('float')
    )


In [None]:
# Summary statistics of the income column (displayed as the cell output).
crime.loc[:, 'Medianhouseholdincome'].describe()

In [None]:
def get_poor(median_income):
    """Flag a median household income as low.

    Returns 1 when `median_income` is less than or equal to 41869 and 0
    otherwise; a value for which the comparison is false (such as NaN)
    therefore yields 0.
    """
    # NOTE(review): 41869 is presumably read off the income summary
    # statistics (e.g. a quantile) - confirm where the cut-off comes from.
    return 1 if median_income <= 41869 else 0

# 1/0 flag per record, derived from the income of the record's area.
crime['poor_indicator'] = crime['Medianhouseholdincome'].map(get_poor)

In [None]:
def get_severity(type):
    """Map an offense description to an ordinal severity rank (1 to 7).

    Returns None for any description that is not listed below.
    """
    # Ranked from least to most severe. The year figures are the ones noted
    # alongside each offense (presumably a typical sentence length - confirm).
    ranking = (
        ('Theft', 1),          # 3.7 Years
        ('Auto Theft', 2),     # 4.0 Years
        ('Agg Assault', 3),    # 5.6 Years
        ('Burglary', 4),       # 5.8 Years
        ('Robbery', 5),        # 9.0 Years
        ('Rape', 6),           # 12.2 Years
        ('Murder Crimes', 7),  # 40.6 Years
    )
    for description, rank in ranking:
        if type == description:
            return rank
    return None

# Ordinal severity rank per record, looked up from its offense description.
crime['severity_indicator'] = crime['Highest_NIBRS_UCR_Offense_Description'].map(get_severity)

In [None]:
# Density of median household income, drawn separately for each severity rank.
sns.displot(
    crime,
    x='Medianhouseholdincome',
    hue='severity_indicator',
    kind='kde',
)

Distributions are far from normal and the sample sizes are not close to being even.

## Median Income by Number of Crimes

In [None]:
# Per-zip-code reference table; the path is relative to the notebook's
# working directory.
pop_density = pd.read_csv(filepath_or_buffer='AustinZipCodes.csv')

In [None]:
# Number of crime records for each (median income, zip code) combination.
zip_crime_counts = (
    crime
    .groupby(['Medianhouseholdincome', 'Zip_Code_Crime'])
    .size()
    .reset_index(name='count')
)


In [None]:
# Attach the crime counts to the zip-code table; the inner join keeps only
# zip codes that appear in both frames.
zip_density = pop_density.merge(
    zip_crime_counts,
    how='inner',
    left_on='Zip Code',
    right_on='Zip_Code_Crime',
)
# Population is text with thousands separators; strip the commas and convert.
population_text = zip_density['Population'].str.replace(',', '')
zip_density['Population'] = population_text.astype(float)

In [None]:
# Crimes per resident for each row. Population was already converted to float
# when zip_density was built, so no integer cast is needed; casting to int
# would raise if any population were missing, whereas float division just
# yields NaN for that row.
zip_density['crime_per_population'] = zip_density['count'] / zip_density['Population']
sns.scatterplot(data=zip_density, x='Medianhouseholdincome', y='crime_per_population')
plt.title('Scatter Plot of Median Household Income by Crime Per Population')

# Drop rows at or above 0.5 crimes per resident (treated as outliers) and plot
# again on a fresh figure. Rows with a NaN ratio fail the comparison and are
# dropped as well.
zip_density_no_outlier = zip_density[zip_density['crime_per_population'] < .5]
plt.figure()
plt.title('Scatter Plot of Median Household Income by Crime Per Population (no outlier)')
sns.scatterplot(data=zip_density_no_outlier, x='Medianhouseholdincome', y='crime_per_population')

In [None]:
# Pearson correlation between income and crimes per resident, computed on the
# outlier-free rows (the result is displayed as the cell output).
stats.pearsonr(
    x=zip_density_no_outlier['Medianhouseholdincome'],
    y=zip_density_no_outlier['crime_per_population'],
)