# Chicago Crime Data

Kristina DiBella-Silva

In [12]:
import pandas as pd
import numpy as np

## Filter ALL warnings
import warnings
warnings.filterwarnings('ignore')

## Load in Chicago Crime Data

In [16]:
# Load in combined CSV
df = pd.read_csv('Data/Chicago/crime_data_combined.csv.gz',lineterminator='\n')
df.head()

Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats
0,01/01/2001 01:00:00 AM,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,True,233,2.0,20.0,41.789084,-87.620849,53.0,22260.0,7.0,403.0,4.0,60.0,24.0,268.0
1,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,1223,12.0,,41.854014,-87.681909,8.0,14920.0,33.0,4.0,26.0,43.0,15.0,121.0
2,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,2413,24.0,,41.996666,-87.68511,42.0,4450.0,20.0,53.0,27.0,50.0,11.0,52.0
3,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,1913,19.0,,41.945072,-87.684629,13.0,21538.0,46.0,646.0,18.0,39.0,5.0,21.0
4,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,False,421,4.0,,41.75669,-87.561625,43.0,22538.0,39.0,245.0,37.0,24.0,19.0,225.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7854582 entries, 0 to 7854581
Data columns (total 19 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   Date                        object 
 1   Primary Type                object 
 2   Description                 object 
 3   Location Description        object 
 4   Arrest                      bool   
 5   Domestic                    bool   
 6   Beat                        int64  
 7   District                    float64
 8   Ward                        float64
 9   Latitude                    float64
 10  Longitude                   float64
 11  Historical Wards 2003-2015  float64
 12  Zip Codes                   float64
 13  Community Areas             float64
 14  Census Tracts               float64
 15  Wards                       float64
 16  Boundaries - ZIP Codes      float64
 17  Police Districts            float64
 18  Police Beats                float64
dtypes: bool(2), float64(1

In [18]:
# dataset shape
df.shape

(7854582, 19)

In [19]:
# Missing values
df.isna().sum()

Date                               0
Primary Type                       0
Description                        0
Location Description           10807
Arrest                             0
Domestic                           0
Beat                               0
District                          47
Ward                          614849
Latitude                       88235
Longitude                      88235
Historical Wards 2003-2015    111220
Zip Codes                      88235
Community Areas               108091
Census Tracts                 105993
Wards                         107972
Boundaries - ZIP Codes        108038
Police Districts              106913
Police Beats                  106889
dtype: int64

In [11]:
# dropping unnecessary columns
df = df.drop(columns = ['Latitude', 'Longitude'])
df.info()

KeyError: "['Latitude', 'Longitude'] not found in axis"

### Date to datetime

In [None]:
test_date = df.loc[0, 'Date']
test_date

In [None]:
# convert dtype to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d%m%Y %I%M%S %p')

df.info()
df.head(2)

## Supplemental Data: Holiday Data

In [None]:
!pip install holidays

In [None]:
import holidays
import datetime as dt
from holidays import country_holidays

In [None]:
## making a date range that covers full dataset
all_days = pd.date_range(df["Date"].min(), df["Date"].max())
all_days


In [None]:
# Getting General US Holidays
## Create an instance of the US country holidays.
us_holidays = country_holidays('US')
us_holidays

In [None]:
## Testing first date
print(all_days[0])
us_holidays.get(all_days[0])

In [None]:
## Getting us holidays for all dates
holiday_list = [us_holidays.get(day) for day in all_days]
holiday_list[:5]

In [None]:
# Getting Sub-Region Holidays
## For a specific subdivisions (e.g. state or province):
md_holidays = country_holidays('US', subdiv='MD')
md_holidays


In [None]:
## Saving both holiday types as columns
df["US Holiday"] = [us_holidays.get(day) for day in df['Date']]
df['MD Holiday'] = [md_holidays.get(day) for day in df['Date']]
df.tail()

In [None]:
## US Holidays
df['US Holiday'].value_counts()

In [None]:
## MD Holidays
df['MD Holiday'].value_counts()

In [None]:
## Saving a binary is holiday feature
df['Is_Holiday'] = df['US Holiday'].notna()
df['Is_Holiday'].value_counts()

# Task
Your task is to answer a series of questions about trends in crimes in Chicago for a reporter for the local newspaper.

- Stakeholder Questions to Answer (pick at least 3 topics) Select 3 or more of the following topics to analyze:

## Topic 1) Comparing Police Districts:
- Which district has the most crimes? Which has the least?

## Topic 2) Crimes Across the Years:
- Is the total number of crimes increasing or decreasing across the years?
- Are there any individual crimes that are doing the opposite (e.g decreasing when overall crime is increasing or vice-versa)?

## Topic 3) Comparing AM vs. PM Rush Hour:
- Are crimes more common during AM rush hour or PM rush hour?
    - You can consider any crime that occurred between 7 AM - 10 AM as AM rush hour
    - You can consider any crime that occurred between 4 -7 PM as PM rush hour.
- What are the top 5 most common crimes during AM rush hour? What are the top 5 most common crimes during PM rush hour?
- Are Motor Vehicle Thefts more common during AM rush hour or PM Rush Hour?

## Topic 4) Comparing Months:
- What months have the most crime? What months have the least?
- Are there any individual crimes that do not follow this pattern? If so, which crimes?

## Topic 5) Comparing Holidays:
- Are there any holidays that show an increase in the # of crimes?
- Are there any holidays that show a decrease in the # of crimes?