In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the crime/housing CSV; the relative path is resolved against the
# notebook's current working directory.
crime = pd.read_csv('crime-housing-austin-2015.csv')

In [None]:
# Report_Date is read from the CSV as strings; parse it with the
# day-abbreviated month-two digit year layout (presumably values like
# '01-Jan-15' -- confirm against the file) so it can be compared with the
# season boundary timestamps below.
crime['Report_Date'] = pd.to_datetime(crime['Report_Date'], format='%d-%b-%y')
# Show the resulting dtype as the cell output to confirm the conversion.
crime['Report_Date'].dtype

In [None]:
YEAR = 2015
# Season boundary dates. Only their month/day are used by get_season, so the
# classification works for dates in any year.
spring_start = pd.Timestamp(month=3, day=21, year=YEAR)
summer_start = pd.Timestamp(month=6, day=21, year=YEAR)
fall_start = pd.Timestamp(month=9, day=23, year=YEAR)
winter_start = pd.Timestamp(month=12, day=21, year=YEAR)

def get_season(date):
    """Return the season name for a date.

    Parameters
    ----------
    date : pandas.Timestamp or datetime-like with ``month``/``day`` attributes
        The date to classify. Missing values (``NaT``/``None``/``NaN``) are
        accepted.

    Returns
    -------
    str or None
        'Spring', 'Summer', 'Fall' or 'Winter'; ``None`` when ``date`` is
        missing, so that missing dates are not silently counted as winter.

    Notes
    -----
    The comparison uses (month, day) only. Each season is the half-open
    interval [start, next start); 'Winter' covers everything from
    ``winter_start`` through the day before ``spring_start``, wrapping over
    the new year.
    """
    if pd.isna(date):
        return None

    def month_day(ts):
        return (ts.month, ts.day)

    current = month_day(date)
    if month_day(spring_start) <= current < month_day(summer_start):
        return 'Spring'
    if month_day(summer_start) <= current < month_day(fall_start):
        return 'Summer'
    if month_day(fall_start) <= current < month_day(winter_start):
        return 'Fall'
    return 'Winter'

# Tag every report with its season by running get_season on each Report_Date.
report_dates = crime['Report_Date']
crime['Season'] = report_dates.map(get_season)

In [None]:
# Number of reports per season. The group size is stored in a column that
# keeps the name 'Report_Date', so it is relabelled on the plot below.
season_counts = crime.groupby('Season').agg({'Report_Date':'size'}).reset_index()
display(season_counts)
# Number of reports per calendar day, tagged with that day's season.
day_counts = crime.groupby(['Report_Date', 'Season']).size().reset_index(name='count')

# Show the seasons in calendar order instead of alphabetical order.
season_plot_order = ['Winter', 'Spring', 'Summer', 'Fall']

fig, ax = plt.subplots()
sns.barplot(data=season_counts, x='Season', y='Report_Date', order=season_plot_order, ax=ax)
ax.set(title='Total crimes by season', xlabel='Season', ylabel='Number of crimes')

fig, ax = plt.subplots()
sns.scatterplot(data=day_counts, x='Report_Date', y='count', ax=ax)
ax.set(title='Number of crimes each day', xlabel='Report date', ylabel='Number of crimes')
plt.show()

In [None]:
# Kernel density estimate of the daily crime counts, one curve per season,
# used to eyeball the shape and spread of each group.
density_grid = sns.displot(data=day_counts, x='count', kind='kde', hue='Season')
density_grid.set_axis_labels('Crimes per day', 'Density');

ANOVA assumptions:
+ The data are normally distributed: the density plots above look approximately normal, and n is large.
+ The groups have the same variance: this could be tested more formally, but it is probably satisfied.
+ The observations are independent: the scatterplot of daily counts above is evidence of this.


In [None]:
import scipy.stats as stats
# Significance threshold: .05
# One-way ANOVA on the daily crime counts, one sample per season.
anova_samples = [
    day_counts.loc[day_counts['Season'] == season_label, 'count']
    for season_label in ('Winter', 'Spring', 'Summer', 'Fall')
]
anova_result = stats.f_oneway(*anova_samples)
display(anova_result)


In [None]:
# Post-hoc pairwise comparison; we can do this since the ANOVA gave a
# significant result.
# The order of this list fixes the order of the samples passed to tukey_hsd,
# and therefore the row/column order of the resulting p-value matrix.
tukey_seasons = ['Winter', 'Spring', 'Summer', 'Fall']
tukey_pvalue = stats.tukey_hsd(
    *[day_counts.loc[day_counts['Season'] == season_label, 'count']
      for season_label in tukey_seasons]
).pvalue
# Label rows as well as columns so each cell reads as "row season vs column
# season" instead of an anonymous 0..3 index.
tukey = pd.DataFrame(tukey_pvalue, index=tukey_seasons, columns=tukey_seasons)
tukey

This shows that winter and summer are significantly different from each other, but none of the other pairs of seasons are.

## Crime Type

In [None]:
# Number of distinct Key values per offense description, over the whole dataset.
crime_types = (
    crime
    .groupby('Highest_NIBRS_UCR_Offense_Description')['Key']
    .nunique()
    .reset_index(name='crime_sum')
)

In [None]:
# Bar chart of the per-offense counts held in crime_types.
fig, ax = plt.subplots(figsize=(8, 4))
bar = sns.barplot(
    data=crime_types,
    x='Highest_NIBRS_UCR_Offense_Description',
    y='crime_sum',
    ax=ax,
)
# Keep the category labels horizontal.
bar.set_xticklabels(bar.get_xticklabels(), rotation=0)
bar.set_xlabel('Crimes')
bar.set_ylabel('Number of Crimes')
bar.set_title('Crimes by Number of Crimes')
plt.show()

## Crimes by Season

In [None]:
def _offense_counts(season_frame):
    """Count distinct `Key` values per offense description in `season_frame`.

    Returns a dataframe with the offense description column and a
    'crime_sum' column holding the count.
    """
    return (
        season_frame
        .groupby('Highest_NIBRS_UCR_Offense_Description')['Key']
        .nunique()
        .reset_index(name='crime_sum')
    )

# For each season: the rows of `crime` in that season, then its per-offense counts.
fall_crime = crime.loc[crime['Season'] == 'Fall']
crime_types_fall = _offense_counts(fall_crime)

spring_crime = crime.loc[crime['Season'] == 'Spring']
crime_types_spring = _offense_counts(spring_crime)

winter_crime = crime.loc[crime['Season'] == 'Winter']
crime_types_winter = _offense_counts(winter_crime)

summer_crime = crime.loc[crime['Season'] == 'Summer']
crime_types_summer = _offense_counts(summer_crime)

In [None]:
# Shared y-axis range so the four seasonal charts can be compared directly.
SEASON_PLOT_YLIM = (0, 8000)

def plot_season_crime_types(offense_counts, season_name):
    """Draw the per-offense bar chart for one season.

    Parameters
    ----------
    offense_counts : pandas.DataFrame
        Table with a 'Highest_NIBRS_UCR_Offense_Description' column and a
        'crime_sum' column.
    season_name : str
        Season label appended to the chart title.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the bars were drawn on.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # Fix the y-range before plotting so every season uses the same scale.
    ax.set_ylim(*SEASON_PLOT_YLIM)
    bar = sns.barplot(x='Highest_NIBRS_UCR_Offense_Description', y='crime_sum',
                      data=offense_counts, ax=ax)
    bar.set_xticklabels(bar.get_xticklabels(), rotation=0)
    bar.set(xlabel='Crimes', ylabel='Number of Crimes',
            title=f'Crimes by Number of Crimes in {season_name}')
    plt.show()
    return bar

# One figure per season, in the same order as before.
for season_table, season_name in [
    (crime_types_fall, 'Fall'),
    (crime_types_spring, 'Spring'),
    (crime_types_winter, 'Winter'),
    (crime_types_summer, 'Summer'),
]:
    bar = plot_season_crime_types(season_table, season_name)