# Import

In [1]:
import numpy as np  
import pandas as pd  
import datetime as dt

# Read Data

## Crime Data
1. Download crime data from https://data.cityofchicago.org/browse?q=crimes&sortBy=relevance in `csv` format
2. Use the data files from 2015 to present and combine them into one file
3. The data has information about each crime record. Key columns that can be used:

 - **Date**: Describes when the crime happened. It can be used to join with the `weather` data
 - **Case Number**: Unique record id for each crime. It can be used to count the number of crimes per day

In [2]:
# Load one DataFrame per yearly crime export, parsing the `Date` column as datetimes.
crime_2015, crime_2016, crime_2017, crime_2018, crime_2019 = [
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in (
        'Crimes_-_2015.csv',
        'Crimes_-_2016.csv',
        'Crimes_-_2017.csv',
        'Crimes_-_2018.csv',
        'Crimes_-_2019.csv',
    )
]

In [3]:
# Combine the 5 yearly frames into a single frame ordered by crime date.
# `pd.concat` is used because `DataFrame.append` was deprecated in pandas 1.4
# and removed in pandas 2.0; with `ignore_index=True` the result is the same.
crime = (
    pd.concat(
        [crime_2015, crime_2016, crime_2017, crime_2018, crime_2019],
        ignore_index=True,
    )
    .sort_values(by='Date')
    .reset_index(drop=True)  # fresh 0..n-1 index after sorting
)
crime.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats
0,10272078,HY460276,2015-01-01,058XX N MULLIGAN AVE,266,CRIM SEXUAL ASSAULT,PREDATORY,RESIDENCE,False,True,...,-87.786436,"(41.987511956, -87.786435884)",20.0,22532.0,36.0,656.0,50.0,18.0,12.0,43.0
1,10231909,HY419527,2015-01-01,047XX S CHAMPLAIN AVE,1752,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,RESIDENCE,False,True,...,-87.610186,"(41.808636571, -87.610186264)",1.0,21192.0,4.0,165.0,10.0,10.0,24.0,116.0
2,11070122,JA409822,2015-01-01,027XX N LINCOLN AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,BAR OR TAVERN,False,False,...,,,,,,,,,,
3,10435783,HZ173239,2015-01-01,088XX S EXCHANGE AVE,266,CRIM SEXUAL ASSAULT,PREDATORY,RESIDENCE,False,True,...,,,,,,,,,,
4,10251294,HY438848,2015-01-01,030XX W VAN BUREN ST,1562,SEX OFFENSE,AGG CRIMINAL SEXUAL ABUSE,RESIDENCE,False,False,...,-87.702855,"(41.87590938, -87.702854917)",11.0,21184.0,28.0,737.0,23.0,28.0,16.0,123.0


In [4]:
# Sanity check: the combined frame has exactly as many rows as the yearly frames together.
len(crime) == sum(
    len(yearly) for yearly in (crime_2015, crime_2016, crime_2017, crime_2018, crime_2019)
)

True

In [5]:
# Keep only the two columns needed downstream and give them snake_case names.
# `rename` returns a new frame, so `crime` itself is left untouched.
crime_useful = crime[['Date', 'Case Number']].rename(
    columns={"Date": "timestamp", "Case Number": "case_number"}
)

In [6]:
# Count missing values per column of the trimmed crime frame.
crime_useful.isna().sum()

timestamp      0
case_number    0
dtype: int64

## Weather Data
1. Download weather data from https://data.cityofchicago.org/Parks-Recreation/Beach-Weather-Stations-Automated-Sensors/k7hf-8y75 in `csv` format
2. The data is hourly and has information about each weather record. Key columns that can be used:

 - **Measurement Timestamp**: the timestamp when the measurements were taken. It could be used to join `crime` data
 - **Air Temperature**: air temperature in degrees Celsius
 - **Total Rain**: total rain since midnight in mm
 - **Wind Speed**: wind speed in meters per second

In [7]:
# Load the beach weather-station readings and order them chronologically.
weather = (
    pd.read_csv(
        'Beach_Weather_Stations_-_Automated_Sensors.csv',
        parse_dates=['Measurement Timestamp'],
    )
    .sort_values(by='Measurement Timestamp')
    .reset_index(drop=True)
)
weather.head()

Unnamed: 0,Station Name,Measurement Timestamp,Air Temperature,Wet Bulb Temperature,Humidity,Rain Intensity,Interval Rain,Total Rain,Precipitation Type,Wind Direction,Wind Speed,Maximum Wind Speed,Barometric Pressure,Solar Radiation,Heading,Battery Life,Measurement Timestamp Label,Measurement ID
0,63rd Street Weather Station,2015-04-25 09:00:00,7.0,5.9,86,7.2,5.0,5.2,60.0,119,5.1,7.1,986.1,38,354.0,12.0,04/25/2015 9:00 AM,63rdStreetWeatherStation201504250900
1,63rd Street Weather Station,2015-04-30 05:00:00,6.1,4.3,76,0.0,0.0,2.5,0.0,11,7.2,13.0,989.9,4,354.0,11.9,04/30/2015 5:00 AM,63rdStreetWeatherStation201504300500
2,Oak Street Weather Station,2015-05-22 15:00:00,,7.0,55,0.0,0.0,1.4,0.0,63,1.9,2.8,,780,322.0,12.0,05/22/2015 3:00 PM,OakStreetWeatherStation201505221500
3,Foster Weather Station,2015-05-22 16:00:00,9.17,,59,,0.0,,,4,4.0,4.4,,556,,15.1,05/22/2015 4:00 PM,FosterWeatherStation201505221600
4,Foster Weather Station,2015-05-22 17:00:00,9.28,,61,,0.0,,,40,1.2,1.7,,322,,15.1,05/22/2015 5:00 PM,FosterWeatherStation201505221700


In [8]:
# Keep the four weather columns used downstream, renamed to short snake_case names.
# NOTE(review): `wind_speed` is sourced from 'Maximum Wind Speed', while the notebook's
# list of key metrics names 'Wind Speed' -- confirm which column is intended.
weather_useful = weather[
    ['Measurement Timestamp', 'Air Temperature', 'Total Rain', 'Maximum Wind Speed']
].rename(
    columns={
        'Measurement Timestamp': 'timestamp',
        'Air Temperature': 'temp',
        'Total Rain': 'total_rain',
        'Maximum Wind Speed': 'wind_speed',
    }
)

In [9]:
# Count missing values per column of the trimmed weather frame.
weather_useful.isna().sum()

timestamp         0
temp             75
total_rain    35704
wind_speed        0
dtype: int64

In [10]:
"""
since there are always more than 2 features have values, we won't fillna here
"""
weather_useful[(weather_useful["temp"].isnull()) & (weather_useful["total_rain"].isnull())]

Unnamed: 0,timestamp,temp,total_rain,wind_speed


# Data Transformation

## Create a `date` column for both datasets

In [11]:
crime_useful["date"] = crime_useful["timestamp"].apply(lambda x: pd.to_datetime(x.date()))
weather_useful["date"] = weather_useful["timestamp"].apply(lambda x: pd.to_datetime(x.date()))

## Keep only the data within the overlapping date range

In [12]:
# First and last calendar day covered by the crime data.
tuple(crime_useful['date'].agg(['min', 'max']))

(Timestamp('2015-01-01 00:00:00'), Timestamp('2019-08-29 00:00:00'))

In [13]:
# First and last calendar day covered by the weather data.
tuple(weather_useful['date'].agg(['min', 'max']))

(Timestamp('2015-04-25 00:00:00'), Timestamp('2019-09-06 00:00:00'))

In [14]:
# Restrict both datasets to the date window they have in common:
# the later of the two start dates through the earlier of the two end dates.
start_date = max(frame['date'].min() for frame in (crime_useful, weather_useful))
end_date = min(frame['date'].max() for frame in (crime_useful, weather_useful))

# `between` is inclusive on both ends, matching `>= start_date and <= end_date`.
new_weather = weather_useful[weather_useful['date'].between(start_date, end_date)].copy()
new_crime = crime_useful[crime_useful['date'].between(start_date, end_date)].copy()

## Convert the data to daily aggregation

In [15]:
# Target variable: number of crime records (non-null case numbers) per day.
daily_crime = (
    new_crime.groupby('date')['case_number']
    .count()
    .rename('daily_crime_count')
    .reset_index()
)
daily_crime.head()

Unnamed: 0,date,daily_crime_count
0,2015-04-25,695
1,2015-04-26,718
2,2015-04-27,719
3,2015-04-28,718
4,2015-04-29,766


In [16]:
# Daily min / mean / max of the three weather measurements; these are the features.
daily_weather = new_weather.groupby('date').agg(
    {measure: ['min', 'mean', 'max'] for measure in ('temp', 'total_rain', 'wind_speed')}
).reset_index()

# Flatten the two-level column index: ('temp', 'min') -> 'temp_min'.
# The grouping key arrives as ('date', ''), so empty levels are dropped to keep 'date'.
weather_new_cols = ["_".join(level for level in col if level) for col in daily_weather.columns]
daily_weather.columns = weather_new_cols
daily_weather.head()

Unnamed: 0,date,temp_min,temp_mean,temp_max,total_rain_min,total_rain_mean,total_rain_max,wind_speed_min,wind_speed_mean,wind_speed_max
0,2015-04-25,7.0,7.0,7.0,5.2,5.2,5.2,7.1,7.1,7.1
1,2015-04-30,6.1,6.1,6.1,2.5,2.5,2.5,13.0,13.0,13.0
2,2015-05-22,9.17,9.695,10.38,1.4,1.4,1.4,1.7,2.775,4.5
3,2015-05-23,13.89,15.784,17.17,1.4,1.4,1.4,0.9,2.792593,4.9
4,2015-05-24,12.99,16.507368,24.01,1.4,4.795455,11.1,0.7,2.065854,4.5


## Merge Crime data with Weather Data

In [17]:
# Inner join on `date`: only days present in both the weather and crime aggregates survive.
weather_crime = pd.merge(daily_weather, daily_crime, how='inner', on='date')
weather_crime.head()

Unnamed: 0,date,temp_min,temp_mean,temp_max,total_rain_min,total_rain_mean,total_rain_max,wind_speed_min,wind_speed_mean,wind_speed_max,daily_crime_count
0,2015-04-25,7.0,7.0,7.0,5.2,5.2,5.2,7.1,7.1,7.1,695
1,2015-04-30,6.1,6.1,6.1,2.5,2.5,2.5,13.0,13.0,13.0,665
2,2015-05-22,9.17,9.695,10.38,1.4,1.4,1.4,1.7,2.775,4.5,824
3,2015-05-23,13.89,15.784,17.17,1.4,1.4,1.4,0.9,2.792593,4.9,798
4,2015-05-24,12.99,16.507368,24.01,1.4,4.795455,11.1,0.7,2.065854,4.5,687


In [18]:
# Number of days (rows) in the merged dataset.
weather_crime.shape[0]

1554

In [19]:
# Count missing values per column of the merged dataset.
weather_crime.isna().sum()

date                 0
temp_min             0
temp_mean            0
temp_max             0
total_rain_min       0
total_rain_mean      0
total_rain_max       0
wind_speed_min       0
wind_speed_mean      0
wind_speed_max       0
daily_crime_count    0
dtype: int64

# Save Data

In [20]:
# Write the merged daily dataset to disk without the positional index column.
weather_crime.to_csv(path_or_buf="weather_crime.csv", index=False)