In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
import datetime

%matplotlib inline

In [2]:
df = pd.read_csv('./Data/Police_Calls_For_Service.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
print(df.shape)

(1407125, 16)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407125 entries, 0 to 1407124
Data columns (total 16 columns):
Incident Number            1407125 non-null object
Call Type                  1390119 non-null object
Incident Date              1407125 non-null object
Location                   1405134 non-null object
Beat                       1406999 non-null object
Reporting District         1406762 non-null object
Received Time              1407124 non-null object
Cleared Time               1403566 non-null object
Disposition                1403279 non-null object
Latitude                   1347999 non-null float64
Longitude                  1347999 non-null float64
Map Point                  1347999 non-null object
Census Block 2000 GeoId    524646 non-null float64
Census Tract 2000 GeoId    524646 non-null float64
Census Block 2010 GeoId    858647 non-null float64
Census Tract 2010 GeoId    858647 non-null float64
dtypes: float64(6), object(10)
memory usage: 171.8+ MB


In [5]:
# drop  Census GeoId columns since they are hard to use and location related information should be sufficient, 
# also drop map point column since we already have latitude&longtitude
df_clean = df.drop(['Map Point','Census Block 2000 GeoId','Census Tract 2000 GeoId',
                    'Census Block 2010 GeoId','Census Tract 2010 GeoId'],axis = 1)

### create features for datetime data

In [6]:
df_clean.isnull().sum()

Incident Number           0
Call Type             17006
Incident Date             0
Location               1991
Beat                    126
Reporting District      363
Received Time             1
Cleared Time           3559
Disposition            3846
Latitude              59126
Longitude             59126
dtype: int64

In [7]:
# mark if reveived time and cleared time have the same value already
df_clean['time mark'] = (df_clean['Received Time'] == df_clean['Cleared Time'])

In [8]:
#replace the null "received time" data with "cleared time" data
df_clean.loc[df_clean['Received Time'].isnull(),'Received Time']= df_clean.loc[df_clean['Received Time'].isnull(),'Cleared Time'].copy()

In [9]:
#replace the null "cleared time" data with "received time" data
df_clean.loc[df_clean['Cleared Time'].isnull(),'Cleared Time'] = df_clean.loc[df_clean['Cleared Time'].isnull(),'Received Time'].copy()

In [10]:
# create datatime format for "received time" and "cleared time"
df_clean['received time'] =df_clean['Received Time'].apply(lambda x: datetime.datetime.strptime(x,'%m/%d/%Y %X %p'))
df_clean['cleared time'] =df_clean['Cleared Time'].apply(lambda x: datetime.datetime.strptime(x,'%m/%d/%Y %X %p'))

In [11]:
# create flag for line open or not
open_date = datetime.datetime(2016, 5, 20)
df_clean['Line_Open'] = df_clean['received time'] >= open_date

# add month, year, day of week features
df_clean['Month'] = df_clean['received time'].dt.month
df_clean['Year'] = df_clean['received time'].dt.year
df_clean['Day_Of_Week'] = df_clean['received time'].dt.dayofweek

# create delta between crime and open
df_clean['Day_Delta'] = (df_clean['received time'] - open_date).dt.days
# create solving time (unit: min) between received time and cleared time, replace 0 with nan
df_clean['Solving Time'] = (df_clean["cleared time"] - df_clean['received time']).apply(lambda x: round(x.seconds/60,2))


In [12]:
df_clean.loc[(df_clean['time mark']==False) & (df_clean['Solving Time'] == 0),'Solving Time'] = np.NaN

In [13]:
df_clean.isnull().sum()

Incident Number           0
Call Type             17006
Incident Date             0
Location               1991
Beat                    126
Reporting District      363
Received Time             0
Cleared Time              0
Disposition            3846
Latitude              59126
Longitude             59126
time mark                 0
received time             0
cleared time              0
Line_Open                 0
Month                     0
Year                      0
Day_Of_Week               0
Day_Delta                 0
Solving Time           3560
dtype: int64

In [14]:
#check if incident date is the same date of received date
df_clean['incident date'] =df_clean['Incident Date'].apply(lambda x: datetime.datetime.strptime(x,'%m/%d/%Y'))
df_clean['delayed report'] = (df_clean['incident date'].apply(lambda x: x.date()) 
                             == df_clean['received time'].apply(lambda x: x.date()))
df_clean['delayed report'].value_counts()

True    1407125
Name: delayed report, dtype: int64

In [15]:
#drop incident date, time mark, cleared time, delayed report 
df_clean = df_clean.drop(['Incident Date','Cleared Time','delayed report'],axis = 1)

In [16]:
df_clean.isnull().sum()

Incident Number           0
Call Type             17006
Location               1991
Beat                    126
Reporting District      363
Received Time             0
Disposition            3846
Latitude              59126
Longitude             59126
time mark                 0
received time             0
cleared time              0
Line_Open                 0
Month                     0
Year                      0
Day_Of_Week               0
Day_Delta                 0
Solving Time           3560
incident date             0
dtype: int64

### categorize call type

#### .../Data/call type.csv is used to categorize 255 different call types in the dataset. Two extra columns are added to mark if the call is "violent" related or "larceny" related. Different from the data in crime dataset, there is no call type directly named "larceny". Therefore, types such as "theft, GTA, burglary" are treated as "larceny" related. 

In [30]:
df_call = pd.read_csv('./Data/call_type.csv')

In [31]:
df_call.columns

Index(['Call Type', 'Count', 'Violent', 'Larceny', 'Check'], dtype='object')

In [39]:
violent = dict(zip(df_call['Call Type'],df_call['Violent']))
violent[np.nan] = np.nan
df_clean['Violent'] = df_clean['Call Type'].apply(lambda x: violent[x])

In [40]:
larceny = dict(zip(df_call['Call Type'],df_call['Larceny']))
larceny[np.nan] = np.nan
df_clean['Larceny'] = df_clean['Call Type'].apply(lambda x: larceny[x])

#### check column is used to mark if the call type is interesting to check for this project 

In [42]:
check = dict(zip(df_call['Call Type'],df_call['Check']))
check[np.nan] = 0
df_clean['Check'] = df_clean['Call Type'].apply(lambda x: check[x])

In [43]:
df_clean.loc[0]

Incident Number                    130023841
Call Type                         Loud Music
Location                     3000BLK MAIN ST
Beat                                    A002
Reporting District                      02D1
Received Time         03/01/2013 12:00:40 AM
Disposition                          Advisal
Latitude                             33.9983
Longitude                            -118.48
time mark                              False
received time            2013-03-01 12:00:40
cleared time             2013-03-01 12:09:57
Line_Open                              False
Month                                      3
Year                                    2013
Day_Of_Week                                4
Day_Delta                              -1176
Solving Time                            9.28
incident date            2013-03-01 00:00:00
Violent                                    0
Larceny                                    0
Check                                      0
Name: 0, d

#### convert dataframe into pickle file

In [48]:
import pickle

In [50]:
df_1,df_2,df_3 = np.array_split(df_clean,3)

In [51]:
with open('./Data/cleaned_call_for_service_1','wb') as f:
    pickle.dump(df_1,f)

In [52]:
with open('./Data/cleaned_call_for_service_2','wb') as f:
    pickle.dump(df_2,f)

In [53]:
with open('./Data/cleaned_call_for_service_3','wb') as f:
    pickle.dump(df_3,f)