In [None]:
#Shift+tab = documentation
#Tab = Auto-complete
#Start at Preparing Non-Uniform Time Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
#import tick customization tools
import matplotlib.ticker as mticks
import matplotlib.dates as mdates
## Timeseries-friendly figure defaults (wide, short canvas on a white background)
plt.rcParams.update({'figure.figsize': (12,4),
                     'figure.facecolor': 'white'})
sns.set_context("talk", font_scale=0.9)
# seed NumPy's global random number generator
SEED = 321
np.random.seed(SEED)
# let pandas display up to 50 columns
pd.set_option('display.max_columns',50)



In [6]:
df = pd.read_csv('Data/Part_1_Crime_Data.csv',low_memory=False)
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587673 entries, 0 to 587672
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   X                586689 non-null  float64
 1   Y                586689 non-null  float64
 2   RowID            587673 non-null  int64  
 3   CCNumber         587673 non-null  object 
 4   CrimeDateTime    587673 non-null  object 
 5   CrimeCode        587673 non-null  object 
 6   Description      587673 non-null  object 
 7   Inside_Outside   536412 non-null  object 
 8   Weapon           145090 non-null  object 
 9   Post             579693 non-null  object 
 10  Gender           493050 non-null  object 
 11  Age              473013 non-null  float64
 12  Race             567987 non-null  object 
 13  Ethnicity        61611 non-null   object 
 14  Location         584616 non-null  object 
 15  Old_District     563488 non-null  object 
 16  New_District     16201 non-null   obje

Unnamed: 0,X,Y,RowID,CCNumber,CrimeDateTime,CrimeCode,Description,Inside_Outside,Weapon,Post,Gender,Age,Race,Ethnicity,Location,Old_District,New_District,Neighborhood,Latitude,Longitude,GeoLocation,PremiseType,Total_Incidents
0,-76.584281,39.282869,1,12A01562,2011/12/30 09:37:00+00,6E,LARCENY,I,,214,M,42.0,UNKNOWN,,2300 BOSTON ST,SOUTHEAST,,CANTON,39.282869,-76.584281,"(39.282869,-76.584281)",GARAGE ON PRIV. PROP,1
1,-76.570626,39.367758,2,11L12669,2011/12/30 00:36:00+00,3JF,ROBBERY,I,FIREARM,423,M,22.0,BLACK_OR_AFRICAN_AMERICAN,,6600 MOONFLOWER CT,NORTHEAST,,HAMILTON HILLS,39.367758,-76.570626,"(39.367758,-76.570626)",ROW/TOWNHOUSE-OCC,1
2,-76.562701,39.323386,3,11L13076,2011/12/30 21:00:00+00,3AO,ROBBERY,O,OTHER,432,M,,WHITE,,3400 SHANNON DR,NORTHEAST,,BELAIR-EDISON,39.323386,-76.562701,"(39.323386,-76.562701)",ALLEY,1
3,-76.602909,39.232993,4,12A00628,2011/12/30 09:00:00+00,5A,BURGLARY,I,,913,F,28.0,WHITE,,600 E JEFFREY ST,SOUTHERN,,BROOKLYN,39.232993,-76.602909,"(39.232993,-76.602909)",ROW/TOWNHOUSE-OCC,1
4,-76.576114,39.292284,5,11L13140,2011/12/30 00:13:00+00,4C,AGG. ASSAULT,O,OTHER,222,M,21.0,BLACK_OR_AFRICAN_AMERICAN,,2900 E BALTIMORE ST,SOUTHEAST,,PATTERSON PARK NEIGHBORHOOD,39.292284,-76.576114,"(39.292284,-76.576114)",STREET,1


In [None]:
test_date = df.loc[0, 'CrimeDateTime']
test_date



In [None]:
test_datetime = pd.to_datetime(test_date)
test_datetime



In [None]:
#Will cause an error
#df['CrimeDateTime'] = pd.to_datetime(df['CrimeDateTime'])



In [None]:
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising
df['CrimeDateTime'] = pd.to_datetime(df['CrimeDateTime'],errors='coerce')



In [None]:
# how many null values did we create?
df['CrimeDateTime'].isna().sum()



In [None]:
# drop the single erroneous time
df = df.dropna(subset=['CrimeDateTime'])
df['CrimeDateTime'].isna().sum()



In [None]:
## now that we remove NaT's let's make our datetime index
df = df.set_index('CrimeDateTime')
df.head(3)


In [None]:
df = df.sort_index()
df.index



In [None]:
years = df.index.year
years.value_counts().sort_index().plot(kind='bar');


In [None]:
## keeping 2011 to present
df = df.loc['2011':]
df.index



In [None]:
## Inspect the value_counts for the different types of crimes
crime_counts = df['Description'].value_counts(normalize=True)
crime_counts.sort_values().plot(kind='barh',figsize=(5,8))


In [None]:
## Inspect the value_counts for the different types of crimes
crime_counts = df['Description'].value_counts().to_frame('Total # of Crimes')
crime_counts


In [None]:
# display with an inline-barplot inside your df
crime_counts.style.bar('Total # of Crimes')


In [None]:
## demoing groupby with .count() vs .size()
## .count() reports the number of non-null values in every column for each group
df.groupby("Description").count().head()



In [None]:
df.groupby("Description").size().head()



In [None]:
## making our dictionary 
CRIMES = {}



In [None]:
## Lets test 1 crime before we create our loop
crime ='ROBBERY'
crime



In [None]:
# I. Save a temp df of just the rows that match the crime
temp = df.loc[ df['Description']==crime]
temp.head(3)


In [None]:
# II. Resample the temp DataFrame as Daily data (crime counts)
# and keep ONLY the .size()
temp_res = temp.resample("D").size()
temp_res



In [None]:
#III. Save the temporary DataFrame in the dictionary, #using the crime description as the key.
CRIMES[crime] = temp_res.copy()
CRIMES.keys()



In [None]:
CRIMES['ROBBERY']



In [None]:
# 1. get list of  unique crime descriptions
crime_list = df['Description'].unique()
crime_list



In [None]:
# 2. Start from an empty dictionary
CRIMES = {}
# 3. Build one daily-count Series per crime description
for crime in crime_list:
    # I. Keep only the rows whose Description matches this crime
    is_crime = df['Description'] == crime
    temp = df.loc[is_crime].copy()
    # II. Resample to daily frequency; .size() counts the rows that fall in each day
    temp_res = temp.resample("D").size()
    # III. Store the daily counts under the crime description
    CRIMES[crime] = temp_res.copy()
CRIMES.keys()



In [None]:
CRIMES['SHOOTING']



In [None]:
crimes_df = pd.DataFrame(CRIMES)
crimes_df



In [None]:
## saving to disk for later
## to_csv does not create missing folders, so make sure the target directory exists
os.makedirs("Data/Baltimore", exist_ok=True)
crimes_df.to_csv("Data/Baltimore/baltimore_crime_counts_2023.csv")


# Timezones

In [None]:
## testing our saved data
df = pd.read_csv("Data/Baltimore/baltimore_crime_counts_2023.csv", parse_dates=['CrimeDateTime'], index_col=0)


In [None]:
## check our df.index
df.index



In [None]:
df = df.resample('D').asfreq()
df.head(3)


In [None]:
ts0 = df.index[0]
ts0



In [None]:
# checking the documentation for astimezone
ts0.astimezone?



In [None]:
import pytz
all_tzs = pytz.all_timezones
len(all_tzs)



In [None]:
## getting just US timezones
[tz for tz in all_tzs if tz.startswith('US')]



In [None]:
## let's try converting to US Eastern
ts0.astimezone('US/Eastern')


In [None]:
## Viewing original 
ts0



In [None]:
## let's try converting to US Pacific
ts0.astimezone('US/Pacific')



In [None]:
## remove time zone from the dt index
## tz_convert(None) converts the index to UTC and then drops the tz info (tz-naive result)
df = df.tz_convert(None)
df.head(3)



# Binning Time Series

In [None]:
# Set options
pd.set_option('display.max_columns',100)
# Customize figure style for stakeholder-facing visualizations
plt.style.use(('ggplot','fivethirtyeight'))
sns.set_context('notebook', font_scale=1.2)
plt.rcParams['figure.figsize'] = (12,4)
plt.rcParams['savefig.transparent'] = False
plt.rcParams['savefig.bbox'] = 'tight'



In [None]:
## Reload the raw incident file (the same CSV that is read at the top of the notebook)
df = pd.read_csv('Data/Part_1_Crime_Data.csv',
                 low_memory=False)
## Converting crimedatetime and coercing errors (unparseable values become NaT)
df['CrimeDateTime'] = pd.to_datetime(df['CrimeDateTime'], errors='coerce')
## drop the rows whose timestamp could not be parsed
df = df.dropna(subset=['CrimeDateTime'])
## setting and sorting the index
df = df.set_index("CrimeDateTime")
df = df.sort_index()
df.head(3)



In [None]:
## baltimore is in eastern time zone,but can just use tz-naive index by choosing None
df = df.tz_convert(None)
df.head(3)



In [None]:
## Drop extra columns not needed
## The loaded file names these columns 'CCNumber' and 'PremiseType'. The 'CCNO' and
## 'Premise' spellings are kept as well (presumably from a different export of the data).
drop_cols = ['RowID','CCNumber','CCNO','Post','GeoLocation', 'Location',
             'PremiseType','Premise']
## errors='ignore' skips labels that are not present instead of raising a KeyError
df = df.drop(columns=drop_cols, errors='ignore')
df.info()



In [None]:
## saving just the date (no time) as a column
df['Date'] = df.index.date
# could also use .strftime + pd.to_datetime
# df['Date'] = pd.to_datetime(df.index.strftime('%Y-%m-%d'))
df.head(3)


In [None]:
## Engineering Features using components of the date/time
df['Year']  = df.index.year



In [None]:
## Visualize years
ax = sns.countplot(data=df, x='Year')
fig = ax.get_figure()
fig.autofmt_xdate()



In [None]:
df['Year'].value_counts().sort_index(ascending = False).head(15)



In [None]:
## keeping data from yr 2011 to present
df = df.loc['2011':'2022'].copy()



In [None]:
## visualizing again with years removed
sns.countplot(data=df, x='Year',palette='dark');



In [None]:
## save year counts
year_counts = df['Year'].value_counts().sort_index()
year_counts.plot(style='o-',grid=True,ylabel='# of Crimes', xlabel='Year');



In [None]:
## Extracting the Month
df['Month'] = df.index.month
df['MonthName'] = df.index.month_name()
df.head(3)



In [None]:
# saving lookup for number vs name
unique_months = df.drop_duplicates(subset=['Month']).sort_values('Month')
month_lookup =dict(zip(unique_months['Month'],unique_months['MonthName']))
month_lookup



In [None]:
ax = sns.countplot(data=df, x='MonthName',palette='dark',order=month_lookup.values());
# Rotate xtick labels and align the text to the bar
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right');



In [None]:
## save month counts
month_counts = df['MonthName'].value_counts()[month_lookup.values()]
# year_counts = df.groupby("Year").size()
ax = month_counts.plot(style='o-',grid=True,
                       ylabel='# of Crimes',
                       xlabel='Month');



In [None]:
## adding day of week as both number & name
df['DayOfWeek'] = df.index.day_name()
df['DayNum'] = df.index.day_of_week
df.head(3)



In [None]:
# saving lookup for number vs named days
unique_days = df.drop_duplicates(subset=['DayNum']).sort_values('DayNum')
day_lookup =dict(zip(unique_days['DayNum'],unique_days['DayOfWeek']))
day_lookup



In [None]:
## plot DayNum, but use names from day_lookup for ticks
ax = sns.countplot(data=df, x='DayNum',palette='dark')
## day_lookup was built in DayNum order, so its values line up with the bars
ax.set_xticklabels(list(day_lookup.values()));



In [None]:
## save day counts
day_counts = df['DayNum'].value_counts().sort_index()
## now that its in order, rename using day_lookup
day_counts = day_counts.rename(day_lookup)
ax = day_counts.plot(style='o-',grid=True,ylabel='# of Crimes', xlabel='Day of Week')


In [None]:
df['Weekend'] = df['DayOfWeek'].isin(['Saturday','Sunday'])
plt.figure(figsize=(8,4))
sns.countplot(data=df, x='Weekend',palette='dark');



In [None]:
## calculating value counts and then dividing week days by 5 and weekend by 2
## cast to float first: the divisions below write fractional values back into the
## Series, which would otherwise force an implicit int -> float upcast on assignment
weekend_counts = df["Weekend"].value_counts().astype(float)
weekend_counts.loc[False] /= 5
weekend_counts.loc[True] /= 2
weekend_counts



In [None]:
ax = weekend_counts.plot(kind='bar',figsize=(8,4))
ax.set(ylabel='Crimes Per Day', xlabel='Is_Weekend');



In [None]:
# adding hour of the day (12am=0) 
df['HourOfDay'] = df.index.hour 
sns.countplot(data=df, x='HourOfDay',palette='dark');



In [None]:
## save hour counts
hour_counts = df['HourOfDay'].value_counts().sort_index()
# year_counts = df.groupby("Year").size()
ax = hour_counts.plot(style='o-',grid=True,ylabel='# of Crimes', xlabel='Hour of Day',xticks=hour_counts.index);



# Seasonality

In [None]:
crimes = ['LARCENY','LARCENY FROM AUTO', 'AUTO THEFT','SHOOTING','ROBBERY - CARJACKING']
## `df` is the incident-level table at this point (one row per crime), so it has no
## per-crime columns. Read the daily counts saved earlier into their own name and
## leave `df` untouched for the cells that still need the incident-level rows.
crime_counts_daily = pd.read_csv("Data/Baltimore/baltimore_crime_counts_2023.csv",
                                 parse_dates=['CrimeDateTime'], index_col=0)
## drop timezone info if the parsed index carries any
if crime_counts_daily.index.tz is not None:
    crime_counts_daily = crime_counts_daily.tz_convert(None)
crimes_ts = crime_counts_daily[crimes].copy()
crimes_ts.head()



In [None]:
crimes_ts.isna().sum()



In [None]:
## fill null values with 0
crimes_ts = crimes_ts.fillna(0)
crimes_ts



In [None]:
crimes_ts.plot(grid=True,alpha=0.7);



In [None]:
plot_df = crimes_ts.loc['2017':'2021']
plot_df.plot(grid=True);



In [None]:
plot_df.plot(subplots=True,figsize=(10,12),grid=True);



In [None]:
plot_df_W = crimes_ts.loc['2017':'2021'].resample('W').sum()
plot_df_W.plot(grid=True,subplots=True,figsize=(10,15));



In [None]:
plot_df_W.rolling(4).mean().dropna().plot(subplots=True,figsize=(10,12));



In [None]:
# import the statsmodels time series analysis api module
import statsmodels.tsa.api as tsa
tsa



In [None]:
crime_type = "LARCENY FROM AUTO"
ts = plot_df_W[crime_type]
ts.plot(title=crime_type)


In [None]:
# decompose the series selected above; the result exposes the .trend, .seasonal
# and .resid components (all other arguments are left at their defaults)
decomp = tsa.seasonal_decompose(ts)
decomp



In [None]:
fig = decomp.plot()



In [None]:
fig = decomp.plot()
fig.set_size_inches(10,8)
fig.tight_layout()



In [None]:
decomp.trend



In [None]:
decomp.trend.plot(title='Trend');



In [None]:
decomp.seasonal.head()



In [None]:
decomp.seasonal.plot(title='Seasonal');



In [None]:
decomp.resid.head()



In [None]:
decomp.resid.plot(title='Residual');



In [None]:
# Adding decomposition components together
ts_model = decomp.trend + decomp.seasonal + decomp.resid
# Visualize the result
ax = ts_model.plot(label='Recombined from Decomposition')
ax.legend()


In [None]:
ax = ts.plot(label='Original, Raw')
ts_model.plot(label='Recombined from Decomposition',ax=ax, ls='--', alpha=0.8)
ax.legend();



In [None]:
seasonal = decomp.seasonal
ax = seasonal.plot(title=f'Seasonal Component for {crime_type}');
ax.grid(which='both', axis='x')



In [None]:
import matplotlib.dates as mdates
minor_loc = mdates.MonthLocator()
fig, ax = plt.subplots()
ax.plot(seasonal)
ax.set(title=f'Seasonal Component for {crime_type}');
ax.xaxis.set_minor_locator(minor_loc)
ax.grid(which='major', axis='x',lw=1, color='k')
ax.grid(which='minor',axis='x',lw=0.5)
fig.autofmt_xdate(rotation=90, ha='center')



In [None]:
seasonal.max()



In [None]:
seasonal.min()



In [None]:
magnitude = seasonal.max() - seasonal.min()
magnitude



In [None]:
seasonal.idxmax()



In [None]:
seasonal.idxmin()



In [None]:
import matplotlib.dates as mdates
def plot_seasonal(seasonal, label=None):
    """Plot a seasonal component with month-level gridlines on the x-axis.

    Parameters
    ----------
    seasonal : pandas.Series
        Seasonal component to draw (presumably datetime-indexed, since a
        month locator is applied to the x-axis).
    label : str, optional
        Name inserted into the figure title. When omitted, the notebook-level
        variable `crime_type` is used, which keeps existing calls working.

    Returns
    -------
    (fig, ax)
        The matplotlib Figure and Axes, so callers can add further annotations.
    """
    # fall back to the notebook-level crime name only when no label was passed
    if label is None:
        label = crime_type
    minor_loc = mdates.MonthLocator()
    fig, ax = plt.subplots()
    ax.plot(seasonal)
    ax.set(title=f'Seasonal Component for {label}');
    # one minor tick per month; major ticks keep matplotlib's default date locator
    ax.xaxis.set_minor_locator(minor_loc)
    ax.grid(which='major', axis='x',lw=1, color='k')
    ax.grid(which='minor',axis='x',lw=0.5)
    fig.autofmt_xdate(rotation=90, ha='center')

    return fig, ax



In [None]:
fig, ax = plot_seasonal(seasonal)
ax.axvspan(seasonal.idxmin(),seasonal.idxmax(), color='lightgreen');



In [None]:
delta = seasonal.idxmax() - seasonal.idxmin()
delta



In [None]:
from scipy.signal import find_peaks
# height=seasonal.max() keeps only the peaks that reach the series maximum;
# `peaks` holds integer positions into `seasonal`, `props` holds the 'peak_heights'
peaks, props = find_peaks(seasonal, height=seasonal.max())


In [None]:
peaks



In [None]:
props



In [None]:
peak_dates = seasonal.index[peaks]
peak_dates



In [None]:
fig,ax = plot_seasonal(seasonal)
ax.scatter(peak_dates, props['peak_heights'], color='orange');



In [None]:
peak_dates



In [None]:
period = peak_dates[1] - peak_dates[0]
period



In [None]:
# time elapsed between each pair of consecutive peaks
periods = [later - earlier
           for earlier, later in zip(peak_dates[:-1], peak_dates[1:])]
periods



In [None]:
fig, ax = plot_seasonal(seasonal)
season_start = seasonal.idxmin()
ax.axvspan( season_start, season_start +period, color='lightgreen')
ax.axvline(season_start, color='green', ls='--',
           label=f"Start of Season {season_start.strftime('%B')}")
ax.legend();


In [None]:
seasonal.max() - seasonal.min()



# Feature Engineering - Holidays

In [None]:
!pip install holidays


In [None]:
import holidays
import datetime as dt
from holidays import country_holidays



In [None]:
## making a date range that covers full dataset
all_days = pd.date_range(df["Date"].min(), df["Date"].max())
all_days



In [None]:
## Create an instance of the US country holidays.
us_holidays = country_holidays('US')
us_holidays



In [None]:
## Testing first date
print(all_days[0])
us_holidays.get(all_days[0])



In [None]:
## Getting us holidays for all dates
holiday_list = [us_holidays.get(day) for day in all_days]
holiday_list[:5]



In [None]:
# For a specific subdivisions (e.g. state or province):
md_holidays = country_holidays('US', subdiv='MD')
md_holidays



In [None]:
## Saving both holiday types as columns
## .get(day) looks up each row's date in the holiday calendar; dates that are not
## holidays come back as missing values (a later cell relies on this via .notna())
df["US Holiday"] = [us_holidays.get(day) for day in df['Date']]
df['MD Holiday'] = [md_holidays.get(day) for day in df['Date']]
df.head()



In [None]:
## US Holidays
df['US Holiday'].value_counts()



In [None]:
## MD Holidays
df['MD Holiday'].value_counts()



In [None]:
ax = sns.barplot(data=df, x='US Holiday',y='Total_Incidents',estimator=np.sum)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45, ha='right');



In [None]:
## Saving a binary is holiday feature
df['Is_Holiday'] = df['US Holiday'].notna()
df['Is_Holiday'].value_counts()



# Stakeholder Questions

In [None]:
## Get # of crimes by Hour
crimes_by_hour = df.groupby('Description')['HourOfDay'].value_counts().sort_index()
crimes_by_hour



In [None]:
crimes_by_hour.plot();



In [None]:
## convert to dataframe and reset_index()
plot_df = crimes_by_hour.to_frame('# of Crimes').reset_index()
plot_df



In [None]:
ax = sns.lineplot(data=plot_df, x='HourOfDay',y='# of Crimes')



In [None]:
fig, ax = plt.subplots(figsize=(12,6))
sns.lineplot(data=plot_df, x='HourOfDay',y='# of Crimes',markers=True,
                  hue='Description',ax=ax,)
ax.legend(bbox_to_anchor=[1,1]);


In [None]:
# one point-plot facet per crime description, two facets per row
g = sns.catplot(data=plot_df, kind='point',
                x='HourOfDay', y='# of Crimes',
                hue='Description', col='Description', col_wrap=2,
                height=4, aspect=2,
                sharex=False, sharey=False)
# add vertical gridlines to every facet
for facet_ax in g.axes:
    facet_ax.grid(axis='x')



In [None]:
crime_perc_by_hour = df.groupby('Description')['HourOfDay'].value_counts(normalize=True).sort_index()
crime_perc_by_hour



In [None]:
## Convert to dataframe and reset index
plot_df_perc = crime_perc_by_hour.to_frame('# of Crimes').reset_index()
plot_df_perc



In [None]:
## one point-plot facet per crime description, using the normalized hourly values
g = sns.catplot(data=plot_df_perc, x='HourOfDay', y='# of Crimes',
                hue='Description', col='Description',col_wrap=2,
            aspect=2, height=4,
           kind='point',sharey=False,sharex=False);
## add vertical gridlines to every facet
[ax.grid(axis='x') for ax in g.axes];



In [None]:
# looping through the value counts series
val_counts_by_hour = df.groupby('Description')['HourOfDay'].value_counts()
val_counts_by_hour



In [None]:
## Getting all of the crimes from the first level of the index
crime_list = val_counts_by_hour.index.levels[0]
crime_list



In [None]:
##selecting a test crime and slicing the result
crime='HOMICIDE'
val_counts_by_hour.loc[crime]



In [None]:
## getting min and max values
val_counts_by_hour.loc[crime].agg(['min','max'])



In [None]:
##  Getting indices for min and max
val_counts_by_hour.loc[crime].agg(['idxmin','idxmax'])



In [None]:
g = sns.catplot(data=plot_df_perc, kind='point',
                x='HourOfDay', y='# of Crimes',
                hue='Description', col='Description', col_wrap=2,
                height=4, aspect=2,
                sharex=False, sharey=False);
## axes_dict maps each facet's Description value to its Axes
for col, ax in g.axes_dict.items():
    ## hours with the fewest / most recorded crimes for this description
    counts_for_crime = val_counts_by_hour.loc[col]
    min_ = counts_for_crime.idxmin()
    max_ = counts_for_crime.idxmax()

    ## draw both vertical lines, keeping the handles for a manual legend
    ln1 = ax.axvline(min_, color='darkgreen', ls='--')
    ln2 = ax.axvline(max_, color='darkred', ls='--')
    lab1 = f'Min(Hour={min_})'
    lab2 = f'Max(Hour={max_})'

    ## legend built from the saved line handles and their labels
    ax.legend([ln1, ln2], [lab1, lab2])
    ## vertical gridlines
    ax.grid(axis='x')

g.fig



In [None]:
crimes_by_month = df.groupby("Month")['Description'].value_counts(normalize=False)
crimes_by_month



In [None]:
## add the January and December counts per crime; fill_value=0 keeps a crime that
## is missing from one of the two months instead of turning its total into NaN
jan_dec_crimes = crimes_by_month.loc[1].add(crimes_by_month.loc[12], fill_value=0)
jan_dec_crimes



In [None]:
ax = jan_dec_crimes.sort_values().plot(kind='barh',figsize=(6,6))
ax.set(title='Most Common Crimes for Dec+Jan',xlabel='# of Crimes');



In [None]:
## First, resample the data as annual
## select Total_Incidents BEFORE summing so only that numeric column is aggregated
## (summing every column would also try to add up the text and date columns)
df_year  = df.groupby('Description')['Total_Incidents'].resample('A').sum().reset_index()
df_year



In [None]:
## can pivot the result to have the datetime as the index and the crime as the columns
years_ts = df_year.pivot(index='CrimeDateTime',columns='Description')
years_ts.head()



In [None]:
years_ts.columns



In [None]:
years_ts.columns = years_ts.columns.droplevel(0)
years_ts.columns



In [None]:
## Subtract to get delta crimes
## NOTE(review): selecting with the partial strings '2021' / '2018' presumably returns
## one-row frames whose index labels differ, so the subtraction would align to NaN —
## confirm against the output; the next cell selects the exact year-end labels instead
delta_crime_yrs = years_ts.loc['2021'] - years_ts.loc['2018']
delta_crime_yrs



In [None]:
### Subtract to get delta crimes
### the full year-end labels are used here so that one row per year is selected
### (presumably as Series that align on the crime columns — confirm against the output)
delta_crime_yrs = years_ts.loc['2021-12-31'] - years_ts.loc['2018-12-31']
delta_crime_yrs



In [None]:
ax = delta_crime_yrs.sort_values().plot(kind='bar',figsize=(8,4),
                                       ylabel='Change in # Crimes',
                                       title='Change in Crimes from 2018-2021')
ax.axhline(0,color='k');


In [None]:
## Divide the change by the 2018 counts and multiply by 100 to get the percent change
delta_crime_perc = (delta_crime_yrs/ years_ts.loc['2018-12-31'])*100
delta_crime_perc.sort_values()



In [None]:
from matplotlib import ticker



In [None]:
ax = delta_crime_perc.sort_values().plot(kind='bar',figsize=(8,4),
                                       ylabel='% Change in  Crimes',
                                       title='% Change in Crimes from 2018-2021')
ax.axhline(0,color='k');
ax.yaxis.set_major_locator(ticker.MultipleLocator(10))
ax.yaxis.set_minor_locator(ticker.MultipleLocator(5))



In [None]:
holiday_crimes = df.groupby('US Holiday')['Description'].value_counts()
holiday_crimes = holiday_crimes.to_frame('# of Crimes')
holiday_crimes



In [None]:
## checking the values of the index (outer index is level 0)
holiday_crimes.index.levels[0]



In [None]:
holiday_crimes.loc["New Year's Day"]



In [None]:
holiday_crimes.loc["New Year's Day (Observed)"]



In [None]:
## let's try simply adding them
holiday_crimes.loc["New Year's Day"] + holiday_crimes.loc["New Year's Day (Observed)"]



In [None]:
temp = pd.concat([holiday_crimes.loc["New Year's Day"],
                  holiday_crimes.loc["New Year's Day (Observed)"]],
                 axis=1)
temp



In [None]:
## get the .sum ACROSS columns (use axis=1)
new_years_sum = temp.sum(axis=1)
new_years_sum



In [None]:
ax = new_years_sum.sort_values().plot(kind='barh',figsize=(8,6))
ax.set(title="New Years Day Crimes", xlabel='# of Crimes');



In [None]:
## daily incident totals per crime
## select Total_Incidents BEFORE summing so only that numeric column is aggregated
## (summing every column would also try to add up the text and date columns)
daily_df = df.groupby('Description')['Total_Incidents'].resample('D').sum().reset_index()
## dates become the index and each crime description becomes a column
daily_df = daily_df.pivot(index='CrimeDateTime',columns='Description')
## pivot leaves a two-level column index; drop the outer 'Total_Incidents' level
daily_df.columns = daily_df.columns.droplevel(0)
## days with no rows for a crime are missing after the pivot, so count them as 0
daily_df = daily_df.fillna(0)
daily_df.head()


In [None]:
## Let's calculate a total crime column
daily_df['TOTAL CRIME'] = daily_df.sum(axis=1) 
daily_df.head()


In [None]:
## making our holiday instance again
us_holidays = country_holidays('US')
us_holidays



In [None]:
## adding day of week and holiday features to daily_df
daily_df['DayOfWeek'] = daily_df.index.day_name()
daily_df['Holiday']  = daily_df.index.map(us_holidays.get)
daily_df["Holiday"] = daily_df["Holiday"].fillna("None")
daily_df.head()



In [None]:
## Making filters for group conditions
is_thanks = daily_df['Holiday'].str.contains('thanksgiving',case=False)
daily_df.loc[is_thanks]



In [None]:
is_thurs = daily_df['DayOfWeek']=='Thursday'
daily_df.loc[is_thurs & ~is_thanks]



In [None]:
## copying our dataframe for the vis
plot_df = daily_df.copy()
## using our filters and .loc to fill in a new "Group" col
## rows matching neither filter get no label, so they stay missing in 'Group'
plot_df.loc[is_thanks, 'Group'] = 'Thanksgiving'
plot_df.loc[~is_thanks & is_thurs, 'Group'] = 'Other Thursday'
## dropna=False also counts the unlabeled rows
plot_df['Group'].value_counts(dropna=False)



In [None]:
ax = sns.barplot(data=plot_df, x='Group', y='TOTAL CRIME');

