# Chicago Crime Data

Kristina DiBella-Silva

In [1]:
import pandas as pd
import numpy as np

## Filter ALL warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# CSV
df = pd.read_csv('Data/Chicago/combined_crime_data.csv.gz',lineterminator='\n')
df.head()

Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats
0,5462733,01/01/2001 01:00:00 AM,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,True,233,2.0,20.0,41.789084,-87.620849,53.0,22260.0,7.0,403.0,4.0,60.0,24.0,268.0
1,1311123,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,1223,12.0,,41.854014,-87.681909,8.0,14920.0,33.0,4.0,26.0,43.0,15.0,121.0
2,1311144,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,2413,24.0,,41.996666,-87.68511,42.0,4450.0,20.0,53.0,27.0,50.0,11.0,52.0
3,1311226,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,1913,19.0,,41.945072,-87.684629,13.0,21538.0,46.0,646.0,18.0,39.0,5.0,21.0
4,1311269,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,False,421,4.0,,41.75669,-87.561625,43.0,22538.0,39.0,245.0,37.0,24.0,19.0,225.0


In [14]:
# Missing values
df.isna().sum()

ID                            0
Date                          0
Primary Type                  0
Description                   0
Location Description          0
Arrest                        0
Domestic                      0
Beat                          0
District                      0
Ward                          9
Latitude                      6
Longitude                     6
Historical Wards 2003-2015    6
Zip Codes                     6
Community Areas               6
Census Tracts                 6
Wards                         6
Boundaries - ZIP Codes        6
Police Districts              6
Police Beats                  6
US Holiday                    0
MD Holiday                    0
Is_Holiday                    0
dtype: int64

In [16]:
# dataset shape
df.shape

(115, 23)

In [17]:
# dropping null values
df = df.dropna()

In [None]:
# dropping unnecessary columns

## Supplemental Data: Holiday Data

In [4]:
!pip install holidays

Collecting holidays
  Downloading holidays-0.29-py3-none-any.whl (695 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m695.7/695.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: holidays
Successfully installed holidays-0.29


In [5]:
import holidays
import datetime as dt
from holidays import country_holidays

In [6]:
## making a date range that covers full dataset
all_days = pd.date_range(df["Date"].min(), df["Date"].max())
all_days


DatetimeIndex(['2001-01-01 01:00:00', '2001-01-02 01:00:00',
               '2001-01-03 01:00:00', '2001-01-04 01:00:00',
               '2001-01-05 01:00:00', '2001-01-06 01:00:00',
               '2001-01-07 01:00:00', '2001-01-08 01:00:00',
               '2001-01-09 01:00:00', '2001-01-10 01:00:00',
               ...
               '2022-12-23 01:00:00', '2022-12-24 01:00:00',
               '2022-12-25 01:00:00', '2022-12-26 01:00:00',
               '2022-12-27 01:00:00', '2022-12-28 01:00:00',
               '2022-12-29 01:00:00', '2022-12-30 01:00:00',
               '2022-12-31 01:00:00', '2023-01-01 01:00:00'],
              dtype='datetime64[ns]', length=8036, freq='D')

In [7]:
# Getting General US Holidays
## Create an instance of the US country holidays.
us_holidays = country_holidays('US')
us_holidays

holidays.country_holidays('US')

In [8]:
## Testing first date
print(all_days[0])
us_holidays.get(all_days[0])

2001-01-01 01:00:00


"New Year's Day"

In [9]:
## Getting us holidays for all dates
holiday_list = [us_holidays.get(day) for day in all_days]
holiday_list[:5]

["New Year's Day", None, None, None, None]

In [10]:
# Getting Sub-Region Holidays
## For a specific subdivisions (e.g. state or province):
md_holidays = country_holidays('US', subdiv='MD')
md_holidays


holidays.country_holidays('US', subdiv='MD')

In [11]:
## Saving both holiday types as columns
df["US Holiday"] = [us_holidays.get(day) for day in df['Date']]
df['MD Holiday'] = [md_holidays.get(day) for day in df['Date']]
df.head()

Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,...,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats,US Holiday,MD Holiday
0,5462733,01/01/2001 01:00:00 AM,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,True,233,2.0,20.0,...,53.0,22260.0,7.0,403.0,4.0,60.0,24.0,268.0,New Year's Day,New Year's Day
1,1311123,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,1223,12.0,,...,8.0,14920.0,33.0,4.0,26.0,43.0,15.0,121.0,New Year's Day,New Year's Day
2,1311144,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,2413,24.0,,...,42.0,4450.0,20.0,53.0,27.0,50.0,11.0,52.0,New Year's Day,New Year's Day
3,1311226,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,1913,19.0,,...,13.0,21538.0,46.0,646.0,18.0,39.0,5.0,21.0,New Year's Day,New Year's Day
4,1311269,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,False,421,4.0,,...,43.0,22538.0,39.0,245.0,37.0,24.0,19.0,225.0,New Year's Day,New Year's Day


In [12]:
## Saving a binary is holiday feature
df['Is_Holiday'] = df['US Holiday'].notna()
df['Is_Holiday'].value_counts()

True    115
Name: Is_Holiday, dtype: int64

# Task
Your task is to answer a series of questions about trends in crimes in Chicago for a reporter for the local newspaper.

- Stakeholder Questions to Answer (pick at least 3 topics) Select 3 or more of the following topics to analyze:

## Topic 1) Comparing Police Districts:
- Which district has the most crimes? Which has the least?

## Topic 2) Crimes Across the Years:
- Is the total number of crimes increasing or decreasing across the years?
- Are there any individual crimes that are doing the opposite (e.g decreasing when overall crime is increasing or vice-versa)?

## Topic 3) Comparing AM vs. PM Rush Hour:
- Are crimes more common during AM rush hour or PM rush hour?
    - You can consider any crime that occurred between 7 AM - 10 AM as AM rush hour
    - You can consider any crime that occurred between 4 -7 PM as PM rush hour.
- What are the top 5 most common crimes during AM rush hour? What are the top 5 most common crimes during PM rush hour?
- Are Motor Vehicle Thefts more common during AM rush hour or PM Rush Hour?

## Topic 4) Comparing Months:
- What months have the most crime? What months have the least?
- Are there any individual crimes that do not follow this pattern? If so, which crimes?

## Topic 5) Comparing Holidays:
- Are there any holidays that show an increase in the # of crimes?
- Are there any holidays that show a decrease in the # of crimes?