In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv('../../../data/modified_lighter_311.csv')

In [10]:
date_columns = ['open_dt','sla_target_dt','closed_dt']

for col in date_columns:
    df[col] = pd.to_datetime(df[col])

df['resolution_time'] = df['closed_dt'] - df['open_dt']

In [4]:
df.head()

Unnamed: 0,case_enquiry_id,open_dt,sla_target_dt,closed_dt,on_time,case_status,closure_reason,case_title,subject,reason,...,police_district,neighborhood,neighborhood_services_district,ward,precinct,location_zipcode,latitude,longitude,source,resolution_time
0,101000295613,2011-06-30 21:32:33,2011-07-14 21:32:32,2011-07-01 01:06:58,ONTIME,Closed,resolved,Street Light Outages,Public Works Department,Street Lights,...,B3,Greater Mattapan,7,Ward 17,1714,2126,42.271544,-71.077221,Employee Generated,0 days 03:34:25
1,101000295614,2011-06-30 22:18:31,2011-07-04 22:18:31,2011-07-01 02:12:43,ONTIME,Closed,closed,Schedule a Bulk Item Pickup,Public Works Department,Sanitation,...,E5,Roslindale,10,Ward 20,2008,2131,42.28385,-71.144741,Self Service,0 days 03:54:12
2,101000295615,2011-06-30 22:28:04,2011-08-14 22:28:04,2011-08-01 11:21:46,ONTIME,Closed,resolved,New Sign Crosswalk or Pavement Marking,Transportation - Traffic Division,Signs & Signals,...,E5,West Roxbury,12,Ward 20,2017,2132,42.300278,-71.169805,Self Service,31 days 12:53:42
3,101000295616,2011-06-30 23:03:48,2011-07-14 23:03:48,2011-07-22 12:13:45,OVERDUE,Closed,resolved,Street Light Outages,Public Works Department,Street Lights,...,E18,Hyde Park,10,Ward 18,1816,2136,42.24964,-71.115631,Self Service,21 days 13:09:57
4,101000295617,2011-06-30 23:12:31,NaT,2011-07-01 02:12:38,ONTIME,Closed,resolved,Highway Maintenance,Public Works Department,Highway Maintenance,...,C11,Dorchester,7,Ward 13,1309,2125,42.314436,-71.056878,Constituent Call,0 days 03:00:07


### What are we predicting?

The idea is to take the resolution time and sort it into classes.

Requests that are closed within a day, requests that are closed within a week, requests that are closed within a month, a year, and the ones that take longer

### What columns do we need for that?

<table>
    <tr>
        <th>Column Name</th>
        <th>Needed?</th>
        <th>Comments</th>
    </tr>
    <tr>
        <td>case_enquiry_id</td>
        <td>No</td>
        <td>Will keep the very end for tracking purposes</td>
    </tr>
    <tr>
        <td>open_dt</td>
        <td>No</td>
        <td>Will be needed for resolution_time column</td>
    </tr>
    <tr>
        <td>sla_target_dt</td>
        <td>No</td>
        <td></td>
    </tr>
    <tr>
        <td>closed_dt</td>
        <td>No</td>
        <td>Will be needed for resolution_time column</td>
    </tr>
    <tr>
        <td>on_time</td>
        <td>No</td>
        <td></td>
    </tr>
    <tr>
        <td>case_status</td>
        <td>No</td>
        <td></td>
    </tr>
    <tr>
        <td>closure_reason</td>
        <td>No</td>
        <td></td>
    </tr>
    <tr>
        <td>case_title</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>subject</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>reason</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>type</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>queue</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>department</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>fire_district</td>
        <td>Yes</td>
        <td>if a lot of null values, will not be used</td>
    </tr>
    <tr>
        <td>pwd_district</td>
        <td>Yes</td>
        <td>if a lot of null values, will not be used</td>
    </tr>
    <tr>
        <td>city_council_district</td>
        <td>Yes</td>
        <td>if a lot of null values, will not be used</td>
    </tr>
    <tr>
        <td>police_district</td>
        <td>Yes</td>
        <td>if a lot of null values, will not be used</td>
    </tr>
    <tr>
        <td>neighborhood</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>neighborhood_services_district</td>
        <td>Yes</td>
        <td>if a lot of null values, will not be used</td>
    </tr>
    <tr>
        <td>ward</td>
        <td>Yes</td>
        <td>if a lot of null values, will not be used</td>
    </tr>
    <tr>
        <td>precinct</td>
        <td>Yes</td>
        <td>if a lot of null values, will not be used</td>
    </tr>
    <tr>
        <td>location_zipcode</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>latitude</td>
        <td>No</td>
        <td>Variance too high, will harm the model</td>
    </tr>
    <tr>
        <td>longitude</td>
        <td>No</td>
        <td>Variance too high, will harm the model</td>
    </tr>
    <tr>
        <td>source</td>
        <td>Yes</td>
        <td></td>
    </tr>
    <tr>
        <td>resolution_time</td>
        <td>Yes</td>
        <td>Has to be fixed first, then will be used to create resolution_time_group</td>
    </tr>
</table>

Let's drop the columns that we for sure won't be needing

In [7]:
to_drop = ['on_time','case_status','closure_reason','latitude','longitude','sla_target_dt']

df.drop(to_drop, axis=1, inplace=True)

### Resolution Time Column

In [12]:
df['resolution_time'].describe()

count                        2513216
mean      31 days 07:22:48.904903912
std      159 days 01:27:55.353008794
min                -1 days +23:59:02
25%                  0 days 01:31:12
50%                  0 days 18:02:33
75%           5 days 17:05:57.250000
max               4417 days 23:00:22
Name: resolution_time, dtype: object

There is a negative value, let's see why

In [14]:
df[df.open_dt > df.closed_dt].shape

(791, 20)

791 entries, can simply drop

In [15]:
df = df[df.open_dt < df.closed_dt]

In [16]:
df[df.open_dt > df.closed_dt].shape

(0, 20)

In [17]:
df['resolution_time'].describe()

count                        2495549
mean      31 days 12:41:58.475198443
std      159 days 14:25:43.898824768
min                  0 days 00:00:01
25%                  0 days 01:35:28
50%                  0 days 18:20:59
75%                  5 days 19:29:44
max               4417 days 23:00:22
Name: resolution_time, dtype: object

In [18]:
df.head()

Unnamed: 0,case_enquiry_id,open_dt,closed_dt,case_title,subject,reason,type,queue,department,fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,location_zipcode,source,resolution_time
0,101000295613,2011-06-30 21:32:33,2011-07-01 01:06:58,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,8.0,7,3,B3,Greater Mattapan,7,Ward 17,1714,2126,Employee Generated,0 days 03:34:25
1,101000295614,2011-06-30 22:18:31,2011-07-01 02:12:43,Schedule a Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup,PWDx_Schedule a Bulk Item Pickup,PWDx,12.0,6,5,E5,Roslindale,10,Ward 20,2008,2131,Self Service,0 days 03:54:12
2,101000295615,2011-06-30 22:28:04,2011-08-01 11:21:46,New Sign Crosswalk or Pavement Marking,Transportation - Traffic Division,Signs & Signals,New Sign Crosswalk or Pavement Marking,BTDT_Engineering_New Sign and Pavement Marking...,BTDT,,6,6,E5,West Roxbury,12,Ward 20,2017,2132,Self Service,31 days 12:53:42
3,101000295616,2011-06-30 23:03:48,2011-07-22 12:13:45,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light_Pending_Contractor (Internal),PWDx,12.0,8,5,E18,Hyde Park,10,Ward 18,1816,2136,Self Service,21 days 13:09:57
4,101000295617,2011-06-30 23:12:31,2011-07-01 02:12:38,Highway Maintenance,Public Works Department,Highway Maintenance,Highway Maintenance,PWDx_District 03: North Dorchester,PWDx,7.0,3,3,C11,Dorchester,7,Ward 13,1309,2125,Constituent Call,0 days 03:00:07


## Null values

In [None]:
df.isna().sum()

case_enquiry_id                      0
open_dt                              0
closed_dt                            0
case_title                         528
subject                              0
reason                               0
type                                 0
queue                                0
department                           0
fire_district                     5085
pwd_district                      1166
city_council_district              270
police_district                    950
neighborhood                         0
neighborhood_services_district     275
ward                               160
precinct                          1543
location_zipcode                     0
source                               0
resolution_time                      0
dtype: int64

There are more than 2.5M entries, so we can drop all the rows with null columns

In [20]:
df.dropna(inplace=True)
df.isna().sum()

case_enquiry_id                   0
open_dt                           0
closed_dt                         0
case_title                        0
subject                           0
reason                            0
type                              0
queue                             0
department                        0
fire_district                     0
pwd_district                      0
city_council_district             0
police_district                   0
neighborhood                      0
neighborhood_services_district    0
ward                              0
precinct                          0
location_zipcode                  0
source                            0
resolution_time                   0
dtype: int64

In [22]:
df.head()

Unnamed: 0,case_enquiry_id,open_dt,closed_dt,case_title,subject,reason,type,queue,department,fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,location_zipcode,source,resolution_time
0,101000295613,2011-06-30 21:32:33,2011-07-01 01:06:58,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,8,07,3,B3,Greater Mattapan,7,Ward 17,1714,2126,Employee Generated,0 days 03:34:25
1,101000295614,2011-06-30 22:18:31,2011-07-01 02:12:43,Schedule a Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup,PWDx_Schedule a Bulk Item Pickup,PWDx,12,06,5,E5,Roslindale,10,Ward 20,2008,2131,Self Service,0 days 03:54:12
3,101000295616,2011-06-30 23:03:48,2011-07-22 12:13:45,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light_Pending_Contractor (Internal),PWDx,12,08,5,E18,Hyde Park,10,Ward 18,1816,2136,Self Service,21 days 13:09:57
4,101000295617,2011-06-30 23:12:31,2011-07-01 02:12:38,Highway Maintenance,Public Works Department,Highway Maintenance,Highway Maintenance,PWDx_District 03: North Dorchester,PWDx,7,03,3,C11,Dorchester,7,Ward 13,1309,2125,Constituent Call,0 days 03:00:07
5,101000295618,2011-06-30 23:43:07,2011-07-05 05:16:10,Notification,Mayor's 24 Hour Hotline,Notification,Notification,INFO09_Current Events,INFO,9,10A,6,E13,Jamaica Plain,11,Ward 10,1006,2130,Constituent Call,4 days 05:33:03


## Save the data

Since we modified the data, let's save it so we don't have to rerun everything one more time

In [23]:
df.to_csv('./data/data.csv', index=False)

In [6]:
df = pd.read_csv('./data/data.csv')

In [7]:
df['open_dt'] = pd.to_datetime(df['open_dt'])
df['closed_dt'] = pd.to_datetime(df['closed_dt'])
df['resolution_time'] = pd.to_timedelta(df['resolution_time'])

In [8]:
df.head()

Unnamed: 0,case_enquiry_id,open_dt,closed_dt,case_title,subject,reason,type,queue,department,fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,location_zipcode,source,resolution_time
0,101000295613,2011-06-30 21:32:33,2011-07-01 01:06:58,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,8,07,3,B3,Greater Mattapan,7,Ward 17,1714,2126,Employee Generated,0 days 03:34:25
1,101000295614,2011-06-30 22:18:31,2011-07-01 02:12:43,Schedule a Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup,PWDx_Schedule a Bulk Item Pickup,PWDx,12,06,5,E5,Roslindale,10,Ward 20,2008,2131,Self Service,0 days 03:54:12
2,101000295616,2011-06-30 23:03:48,2011-07-22 12:13:45,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light_Pending_Contractor (Internal),PWDx,12,08,5,E18,Hyde Park,10,Ward 18,1816,2136,Self Service,21 days 13:09:57
3,101000295617,2011-06-30 23:12:31,2011-07-01 02:12:38,Highway Maintenance,Public Works Department,Highway Maintenance,Highway Maintenance,PWDx_District 03: North Dorchester,PWDx,7,03,3,C11,Dorchester,7,Ward 13,1309,2125,Constituent Call,0 days 03:00:07
4,101000295618,2011-06-30 23:43:07,2011-07-05 05:16:10,Notification,Mayor's 24 Hour Hotline,Notification,Notification,INFO09_Current Events,INFO,9,10A,6,E13,Jamaica Plain,11,Ward 10,1006,2130,Constituent Call,4 days 05:33:03


## Encoding

Since there are a lot of entries, one-hot encoding would be impossible, so we will stick to ordinal encoding

In [22]:
to_encode = ['case_title','subject','reason','type','queue','department','fire_district','pwd_district','city_council_district','police_district','neighborhood_services_district','ward','precinct','source']

from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()

df[to_encode] = encoder.fit_transform(df[to_encode])

In [23]:
df.head()

Unnamed: 0,case_enquiry_id,open_dt,closed_dt,case_title,subject,reason,type,queue,department,fire_district,...,HU100_RE,TotDis,TotChild,OlderAdult,Low_to_No,LEP,POC2,MedIllnes,Shape__Area,Shape__Length
0,101000295614,2011-06-30 22:18:31,2011-07-01 02:12:43,15695.0,10.0,40.0,153.0,157.0,17.0,4.0,...,25153,7286,13324,8231,11249,19480,33106,24029.32,39447500.0,136415.602543
1,101000295616,2011-06-30 23:03:48,2011-07-22 12:13:45,16487.0,10.0,44.0,173.0,172.0,17.0,4.0,...,6592,2117,3256,2219,2683,4902,11132,6412.46,9473516.0,33527.343089
2,101000295617,2011-06-30 23:12:31,2011-07-01 02:12:38,4973.0,10.0,25.0,61.0,115.0,17.0,9.0,...,27507,9819,17424,6535,22749,29284,50778,25010.76,22311200.0,115494.076201
3,101000295618,2011-06-30 23:43:07,2011-07-05 05:16:10,6506.0,6.0,32.0,103.0,49.0,11.0,11.0,...,18844,4570,7300,4381,12985,17366,22247,16600.57,19789670.0,92369.490007
4,101000295619,2011-07-01 01:35:41,2011-10-17 08:33:28,6328.0,11.0,42.0,90.0,28.0,4.0,6.0,...,9659,1447,571,507,6867,7374,11288,11904.82,5022871.0,30384.203454


## Social Vulnerability Index

We want to use Social Vulnerability data too to see if the correlation might help in classification

In [24]:
sv = pd.read_csv('../../../data/social_vulnerability.csv')

In [25]:
sv.head()

Unnamed: 0,FID,GEOID10,AREA_SQFT,AREA_ACRES,POP100_RE,HU100_RE,TotDis,TotChild,OlderAdult,Low_to_No,LEP,POC2,MedIllnes,Name,Shape__Area,Shape__Length
0,1,25025010405,3914567.54,89.8661,5522,994,470,60,331,1191,1522,1755,2131.22,Mission Hill,666100.0,6036.192907
1,2,25025010404,1472713.92,33.8089,5817,1862,299,77,56,2387,2443,1749,2201.14,Fenway,250612.382812,2176.592171
2,3,25025010801,1376667.12,31.6039,2783,1899,84,281,390,72,462,447,1214.76,Back Bay,234357.910156,2542.043078
3,4,25025010702,3228780.12,74.1226,2400,1643,45,86,285,187,472,320,1014.2,Back Bay,549614.007812,3224.573044
4,5,25025010204,2741497.18,62.9361,3173,1283,131,13,36,895,931,1039,1181.78,Fenway,466585.238281,3143.610637


In [26]:
# map df neighborhood to svi_df Name
mapping = {
    'Mission Hill': 'Mission Hill',
    'Fenway': 'Fenway / Kenmore / Audubon Circle / Longwood',
    'Back Bay': 'Back Bay',
    'Allston': 'Allston / Brighton',
    'Brighton': 'Allston / Brighton',
    'Jamaica Plain': 'Jamaica Plain',
    'Roslindale': 'Roslindale',
    'Hyde Park': 'Hyde Park',
    'West Roxbury': 'West Roxbury',
    'Mattapan': 'Mattapan',
    'Dorchester': 'Dorchester',
    'East Boston': 'East Boston',
    'North End': 'Downtown / Financial District',
    'West End': 'Downtown / Financial District',
    'Longwood Medical Area': 'Fenway / Kenmore / Audubon Circle / Longwood',
    'Roxbury': 'Roxbury',
    'South Boston Waterfront': 'South Boston / South Boston Waterfront',
    'Charlestown': 'Charlestown',
    'South End': 'South End',
    'Bay Village': 'South End',
    'Leather District': 'Downtown / Financial District',
    'South Boston': 'South Boston / South Boston Waterfront',
    'Harbor Islands': 'South Boston / South Boston Waterfront'    
}
sv['neighborhood'] = sv['Name'].map(mapping)
sv['neighborhood'].unique()

array(['Mission Hill', 'Fenway / Kenmore / Audubon Circle / Longwood',
       'Back Bay', 'Allston / Brighton', 'Jamaica Plain', 'Roslindale',
       'Hyde Park', 'West Roxbury', 'Mattapan', 'Dorchester',
       'East Boston', 'Downtown / Financial District', 'Roxbury',
       'South Boston / South Boston Waterfront', 'Charlestown',
       'South End'], dtype=object)

Map the neighborhoods so we could then merge them

In [27]:
sv.drop('Name', axis=1, inplace=True)

sv = sv.groupby('neighborhood').sum()

sv.head()

Unnamed: 0_level_0,FID,GEOID10,AREA_SQFT,AREA_ACRES,POP100_RE,HU100_RE,TotDis,TotChild,OlderAdult,Low_to_No,LEP,POC2,MedIllnes,Shape__Area,Shape__Length
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Allston / Brighton,1122,400400007531,105838100.0,2429.7088,68413,30966,5902,4417,5991,20367,26358,22779,26768.65,18015280.0,84682.245858
Back Bay,597,275277067915,34093610.0,782.6818,28634,18365,1892,1686,3759,5316,9075,6923,11842.96,5803907.0,67191.048486
Charlestown,821,150150242402,37683440.0,865.0927,16439,8648,1535,3301,1811,4157,5968,3981,6461.47,6420148.0,47678.443687
Dorchester,1849,475477636009,131273500.0,3013.624,69695,27507,9819,17424,6535,22749,29284,50778,25010.76,22311200.0,115494.076201
Downtown / Financial District,957,250250302707,32849540.0,754.1216,29635,18598,2267,1960,3706,5820,9526,6735,12423.27,5593130.0,41899.659344


Drop columns that are not important

In [28]:
sv.drop(['AREA_ACRES', 'GEOID10', 'FID'], axis=1, inplace=True)

In [29]:
neighborhoods_from_df = df.neighborhood.unique()
neighborhoods_from_sv = sv.index

not_in_sv = []
# compare the two lists
for neighborhood in neighborhoods_from_df:
    if neighborhood not in neighborhoods_from_sv:
        print(f'Neighborhood {neighborhood} not found in sv dataset')
        not_in_sv.append(neighborhood)

In [31]:
df = df[~df['neighborhood'].isin(not_in_sv)]

For the sake of simplicity, we are dropping neighborhoods that are not in the SV dataset

In [32]:
df = df.merge(sv, how='left', left_on='neighborhood', right_on='neighborhood')

In [37]:
df.head()

Unnamed: 0,case_enquiry_id,open_dt,closed_dt,case_title,subject,reason,type,queue,department,fire_district,...,HU100_RE_y,TotDis_y,TotChild_y,OlderAdult_y,Low_to_No_y,LEP_y,POC2_y,MedIllnes_y,Shape__Area_y,Shape__Length_y
0,101000295614,2011-06-30 22:18:31,2011-07-01 02:12:43,15695.0,10.0,40.0,153.0,157.0,17.0,4.0,...,25153,7286,13324,8231,11249,19480,33106,24029.32,39447500.0,136415.602543
1,101000295616,2011-06-30 23:03:48,2011-07-22 12:13:45,16487.0,10.0,44.0,173.0,172.0,17.0,4.0,...,6592,2117,3256,2219,2683,4902,11132,6412.46,9473516.0,33527.343089
2,101000295617,2011-06-30 23:12:31,2011-07-01 02:12:38,4973.0,10.0,25.0,61.0,115.0,17.0,9.0,...,27507,9819,17424,6535,22749,29284,50778,25010.76,22311200.0,115494.076201
3,101000295618,2011-06-30 23:43:07,2011-07-05 05:16:10,6506.0,6.0,32.0,103.0,49.0,11.0,11.0,...,18844,4570,7300,4381,12985,17366,22247,16600.57,19789670.0,92369.490007
4,101000295619,2011-07-01 01:35:41,2011-10-17 08:33:28,6328.0,11.0,42.0,90.0,28.0,4.0,6.0,...,9659,1447,571,507,6867,7374,11288,11904.82,5022871.0,30384.203454


## Normalize the data

In [38]:
# Normalize all the columns except case_enquiry_id, open_dt, closed_dt, and resolution_time and neighrborhood
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

to_normalize = [col for col in df.columns if col not in ['case_enquiry_id','open_dt','closed_dt','resolution_time','neighborhood']]
df[to_normalize] = scaler.fit_transform(df[to_normalize])

df.head()

Unnamed: 0,case_enquiry_id,open_dt,closed_dt,case_title,subject,reason,type,queue,department,fire_district,...,HU100_RE_y,TotDis_y,TotChild_y,OlderAdult_y,Low_to_No_y,LEP_y,POC2_y,MedIllnes_y,Shape__Area_y,Shape__Length_y
0,101000295614,2011-06-30 22:18:31,2011-07-01 02:12:43,0.758213,0.909091,0.8,0.746341,0.882022,1.0,0.363636,...,0.705876,0.52302,0.673124,1.0,0.280035,0.415979,0.460576,0.807274,1.0,1.0
1,101000295616,2011-06-30 23:03:48,2011-07-22 12:13:45,0.796473,0.909091,0.88,0.843902,0.966292,1.0,0.363636,...,0.0,0.060014,0.141719,0.221647,0.0,0.0,0.113084,0.0,0.157922,0.064155
2,101000295617,2011-06-30 23:12:31,2011-07-01 02:12:38,0.240242,0.909091,0.5,0.297561,0.646067,1.0,0.818182,...,0.795398,0.74991,0.889528,0.780425,0.655987,0.695734,0.740037,0.852248,0.518579,0.809703
3,101000295618,2011-06-30 23:43:07,2011-07-05 05:16:10,0.3143,0.545455,0.64,0.502439,0.275281,0.647059,1.0,...,0.465944,0.279738,0.355167,0.501554,0.336788,0.355657,0.288854,0.46686,0.44774,0.599368
4,101000295619,2011-07-01 01:35:41,2011-10-17 08:33:28,0.3057,1.0,0.84,0.439024,0.157303,0.235294,0.545455,...,0.116638,0.0,0.0,0.0,0.136781,0.070538,0.115551,0.251682,0.032887,0.035566


Now we can remove the columns we do not need

In [39]:
# now drop the open_dt and closed_dt columns
df.drop(['open_dt','closed_dt'], axis=1, inplace=True)

In [43]:
# save the dataframe
df.to_csv('./data/merged_cleand.csv', index=False)

Saving the model to save some time

In [26]:
df = pd.read_csv('./data/merged_cleand.csv')

In [27]:
df['resolution_time'] = pd.to_timedelta(df['resolution_time'])

In [28]:
labels = ['day','week','month','year','> year']
bins = [-1,1,7,31,365, np.inf]

df['resolution_time_group'] = pd.cut(df['resolution_time'].dt.days, bins=bins, labels=labels)

In [29]:
df.head()

Unnamed: 0,case_enquiry_id,case_title,subject,reason,type,queue,department,fire_district,pwd_district,city_council_district,...,TotDis_y,TotChild_y,OlderAdult_y,Low_to_No_y,LEP_y,POC2_y,MedIllnes_y,Shape__Area_y,Shape__Length_y,resolution_time_group
0,101000295614,0.758213,0.909091,0.8,0.746341,0.882022,1.0,0.363636,0.238095,0.6,...,0.52302,0.673124,1.0,0.280035,0.415979,0.460576,0.807274,1.0,1.0,day
1,101000295616,0.796473,0.909091,0.88,0.843902,0.966292,1.0,0.363636,0.333333,0.6,...,0.060014,0.141719,0.221647,0.0,0.0,0.113084,0.0,0.157922,0.064155,month
2,101000295617,0.240242,0.909091,0.5,0.297561,0.646067,1.0,0.818182,0.095238,0.4,...,0.74991,0.889528,0.780425,0.655987,0.695734,0.740037,0.852248,0.518579,0.809703,day
3,101000295618,0.3143,0.545455,0.64,0.502439,0.275281,0.647059,1.0,0.428571,0.7,...,0.279738,0.355167,0.501554,0.336788,0.355657,0.288854,0.46686,0.44774,0.599368,week
4,101000295619,0.3057,1.0,0.84,0.439024,0.157303,0.235294,0.545455,0.428571,0.9,...,0.0,0.0,0.0,0.136781,0.070538,0.115551,0.251682,0.032887,0.035566,year


Finally, encode the neighborhood

In [30]:
from sklearn.preprocessing import OrdinalEncoder
# encode the neighborhood column

encoder = OrdinalEncoder()

df['neighborhood'] = encoder.fit_transform(df[['neighborhood']])

df.head()

Unnamed: 0,case_enquiry_id,case_title,subject,reason,type,queue,department,fire_district,pwd_district,city_council_district,...,TotDis_y,TotChild_y,OlderAdult_y,Low_to_No_y,LEP_y,POC2_y,MedIllnes_y,Shape__Area_y,Shape__Length_y,resolution_time_group
0,101000295614,0.758213,0.909091,0.8,0.746341,0.882022,1.0,0.363636,0.238095,0.6,...,0.52302,0.673124,1.0,0.280035,0.415979,0.460576,0.807274,1.0,1.0,day
1,101000295616,0.796473,0.909091,0.88,0.843902,0.966292,1.0,0.363636,0.333333,0.6,...,0.060014,0.141719,0.221647,0.0,0.0,0.113084,0.0,0.157922,0.064155,month
2,101000295617,0.240242,0.909091,0.5,0.297561,0.646067,1.0,0.818182,0.095238,0.4,...,0.74991,0.889528,0.780425,0.655987,0.695734,0.740037,0.852248,0.518579,0.809703,day
3,101000295618,0.3143,0.545455,0.64,0.502439,0.275281,0.647059,1.0,0.428571,0.7,...,0.279738,0.355167,0.501554,0.336788,0.355657,0.288854,0.46686,0.44774,0.599368,week
4,101000295619,0.3057,1.0,0.84,0.439024,0.157303,0.235294,0.545455,0.428571,0.9,...,0.0,0.0,0.0,0.136781,0.070538,0.115551,0.251682,0.032887,0.035566,year


In [32]:
# drop resolution_time
df.drop('resolution_time', axis=1, inplace=True)

In [33]:
# drop case_enquiry_id
df.drop('case_enquiry_id', axis=1, inplace=True)

## Save final result

In [52]:
df.to_csv('./data/training_data.csv', index=False)