# Separated analysis of the delays and closures of streetcars
<a id="gotop"></a>

### Preparation

In [1]:
# importing modules
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Getting the data all together: load the first twelve sheets of the 2019 workbook
# (presumably one sheet per month - TODO confirm) and stack them into one frame.
# Passing a list of sheet positions makes pandas open and parse the workbook once
# and return a {position: DataFrame} dict, instead of re-opening the file 12 times.
raw_data_path = "/Users/laurabresson/Data_projects/Is_TTC_really_bad?/raw_data/streetcar-data2019.xlsx"
sheet_positions = list(range(12))
sheets_by_position = pd.read_excel(raw_data_path, sheet_name=sheet_positions)

# Concatenate in sheet order. Each sheet keeps its own row labels, so the index of
# data2019 contains repeated labels (it is not a unique row id).
data2019 = pd.concat([sheets_by_position[position] for position in sheet_positions])
data2019.sample(10)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Incident ID,Delay,Gap
112,2019-07-05,501,15:59:00,Friday,Queen and York,Investigation,8.0,15.0,E/B,4501.0,,,
475,2019-08-17,511,12:25:00,Saturday,CNE Loop,Investigation,4.0,8.0,W/B,4443.0,,,
1291,2019-01-28,506,17:45:00,Monday,gerrard/broadview,Investigation,21.0,26.0,E/B,4119.0,,,
770,2019-06-25,512,08:54:00,Tuesday,St.Clair West Stn.,Overhead - Pantograph,,,E/B,4436.0,,115.0,120.0
654,2019-10-23,504,05:43:00,Wednesday,Broadview and Queen,Held By,10.0,18.0,W/B,4545.0,,,
51,2019-03-02,501,04:45:00,Saturday,Russell Yard,Mechanical,10.0,20.0,W/B,4181.0,,,
518,2019-12-19,512,06:21:00,Thursday,Bathurst and St.Clair,Mechanical,,,E/B,4539.0,,4.0,8.0
559,2019-12-20,505,13:36:00,Friday,Brock Road,General Delay,,,W/B,1417.0,,15.0,20.0
1235,2019-02-26,506,07:26:00,Tuesday,Queen/Coxwell,Mechanical,7.0,10.0,E/B,4020.0,,,
550,2019-09-21,501,00:05:00,Saturday,Augusta and Queen,Investigation,81.0,91.0,E/B,4446.0,,,


The readme page to explain the columns

<img src="streetcar-readme.png"/>

------
------

### Data diagnosis - NaN values

In [3]:
# overview of the combined frame: column names, non-null counts and dtypes
data2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11882 entries, 0 to 814
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Report Date  11882 non-null  datetime64[ns]
 1   Route        11882 non-null  int64         
 2   Time         11882 non-null  object        
 3   Day          11882 non-null  object        
 4   Location     11858 non-null  object        
 5   Incident     11882 non-null  object        
 6   Min Delay    8410 non-null   float64       
 7   Min Gap      8388 non-null   float64       
 8   Direction    11764 non-null  object        
 9   Vehicle      11659 non-null  float64       
 10  Incident ID  889 non-null    float64       
 11  Delay        3444 non-null   float64       
 12  Gap          3434 non-null   float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(5)
memory usage: 1.3+ MB


In [4]:
# share of missing values in each column, as a percentage of all rows
row_count = len(data2019)
missing_per_column = data2019.isna().sum()
print((missing_per_column / row_count) * 100)

Report Date     0.000000
Route           0.000000
Time            0.000000
Day             0.000000
Location        0.201986
Incident        0.000000
Min Delay      29.220670
Min Gap        29.405824
Direction       0.993099
Vehicle         1.876788
Incident ID    92.518095
Delay          71.014981
Gap            71.099142
dtype: float64


Based on this analysis of the nan, what are the columns in special need of attention ? 

- Min Delay & Min Gap
- Delay & Gap
- Incident ID --> shouldn't it be deleted ? 

Okay first let's take care of Incident ID. 92% of nan values is just offensive.

In [5]:
# remove the mostly-empty Incident ID column, then peek at a few random rows
data2019 = data2019.drop(columns=["Incident ID"])
data2019.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Delay,Gap
1084,2019-03-30,501,02:51:00,Saturday,Long branch Loop,Mechanical,9.0,18.0,E/B,4451.0,,
552,2019-02-12,512,23:26:00,Tuesday,Gunns Loop,Mechanical,6.0,12.0,E/B,4514.0,,
781,2019-12-31,306,03:19:00,Tuesday,Gerrard and Broadview,Mechanical,,,E/B,4413.0,10.0,20.0
420,2019-01-11,505,14:57:00,Friday,Sackville and Dundas,Mechanical,4.0,8.0,E/B,8156.0,,
82,2019-02-02,504,05:26:00,Saturday,Leslie Barns Yard,Late Leaving Garage,1.0,,W/B,4496.0,,


Dropped as expected.

Second on the list: merging the gap and delay columns, because merging each of these two pairs of columns would give us all the information we need.

In [6]:
# Build a single "Gaps" column out of the two partially-filled gap columns.
# (Thanks Andrew for reminding me of lookups and np.where)
gap_present = data2019["Gap"].notna()

# keep "Gap" where it has a value, otherwise look the value up in "Min Gap"
data2019["Gaps"] = np.where(gap_present, data2019["Gap"], data2019["Min Gap"])

data2019.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Delay,Gap,Gaps
485,2019-06-15,501,17:15:00,Saturday,Brooklyn and Queen,Mechanical,,,W/B,4004.0,4.0,8.0,8.0
469,2019-06-15,512,08:57:00,Saturday,St. Clair and Avenue Rd.,General Delay,,,E/B,4439.0,10.0,9.0,9.0
61,2019-09-03,512,14:56:00,Tuesday,Bathurst and St. Clair,General Delay,5.0,9.0,N/B,4523.0,,,9.0
140,2019-10-05,506,10:35:00,Saturday,Canaught and Queen,Mechanical,1.0,2.0,E/B,4184.0,,,2.0
97,2019-11-05,506,11:00:00,Tuesday,Carlton and Gerrard,General Delay,,,B/W,4534.0,9.0,8.0,8.0


In [7]:
# Build a single "Delays" column out of the two partially-filled delay columns.
delay_present = data2019["Delay"].notna()

# keep "Delay" where it has a value, otherwise look the value up in "Min Delay"
data2019["Delays"] = np.where(delay_present, data2019["Delay"], data2019["Min Delay"])

# random peek to check that the new column looks right
data2019.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Delay,Gap,Gaps,Delays
780,2019-10-27,504,01:58:00,Sunday,Roncesvalles and Howard Park,Emergency Services,15.0,30.0,S/B,4495.0,,,30.0,15.0
473,2019-04-16,505,09:03:00,Tuesday,Shaw and Dundas,Emergency Services,,,E/B,8378.0,4.0,8.0,8.0,4.0
755,2019-08-29,506,10:48:00,Thursday,Gerrard River,Investigation,1.0,1.0,E/B,4074.0,,,1.0,1.0
95,2019-05-04,501,07:39:00,Saturday,Queen/University,Mechanical,24.0,36.0,W/B,4146.0,,,36.0,24.0
314,2019-07-13,304,05:54:00,Saturday,Queen/Leslie,Emergency Services,1.0,1.0,E/B,4420.0,,,1.0,1.0


In [8]:
# sanity check: percentage of missing values per column after merging
data2019.isna().sum().div(len(data2019)).mul(100)

Report Date     0.000000
Route           0.000000
Time            0.000000
Day             0.000000
Location        0.201986
Incident        0.000000
Min Delay      29.220670
Min Gap        29.405824
Direction       0.993099
Vehicle         1.876788
Delay          71.014981
Gap            71.099142
Gaps            0.504965
Delays          0.235651
dtype: float64

Going from 70%-30% to under 1% is already quite the feat. 

But is it possible to go all the way to 0 ? 

Since NaN are now under 1%, there wouldn't be too much change in data if NaN were filled with the mean of the columns.

In [9]:
# mean of each merged column, computed before any imputation
gaps_mean, delays_mean = data2019["Gaps"].mean(), data2019["Delays"].mean()
print(gaps_mean, delays_mean)

19.554897648452037 13.981862662392441


In [10]:
# Impute the few remaining missing values with each column's own mean.
# The fill goes through a {column: value} mapping on the frame and the result is
# assigned back. Calling fillna(inplace=True) on a column selection is chained
# assignment: pandas warns about it, and under Copy-on-Write it no longer updates
# the parent frame at all.
data2019 = data2019.fillna(
    {"Gaps": data2019["Gaps"].mean(), "Delays": data2019["Delays"].mean()}
)

In [11]:
# number of missing values left in each column
data2019.isna().sum()

Report Date       0
Route             0
Time              0
Day               0
Location         24
Incident          0
Min Delay      3472
Min Gap        3494
Direction       118
Vehicle         223
Delay          8438
Gap            8448
Gaps              0
Delays            0
dtype: int64

With the NaN gone, the original gap and delay columns can now be deleted.

In [12]:
# the merged Gaps / Delays columns supersede these four source columns
superseded_columns = ["Min Delay", "Min Gap", "Delay", "Gap"]
data2019 = data2019.drop(columns=superseded_columns)
data2019.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Direction,Vehicle,Gaps,Delays
352,2019-03-08,504,08:27:00,Friday,KING/STRACHAN,Mechanical,E/B,4411.0,14.0,7.0
41,2019-04-02,509,06:04:00,Tuesday,CNE Loop,Mechanical,E/B,4479.0,16.0,8.0
87,2019-08-04,304,03:29:00,Sunday,King and Bathurst,Mechanical,W/B,4415.0,30.0,15.0
188,2019-07-08,501,22:23:00,Monday,Queen and Spadina,Mechanical,W/B,4573.0,14.0,7.0
178,2019-09-07,505,15:01:00,Saturday,Broadview station,Mechanical,E/B,1353.0,8.0,4.0


Dropped as expected and now the df is just that step closer to cleanliness. 

Next on the list of nan values is Vehicle.

Do I keep it ?

Do I not keep it ? 

In [13]:
# percentage of missing values per column, to decide what to do with Vehicle
missing_counts = data2019.isna().sum()
missing_counts / len(data2019) * 100

Report Date    0.000000
Route          0.000000
Time           0.000000
Day            0.000000
Location       0.201986
Incident       0.000000
Direction      0.993099
Vehicle        1.876788
Gaps           0.000000
Delays         0.000000
dtype: float64

The NaN alone wouldn't justify getting rid of the entire column. 

So let's make a judgment call based on what information is contained in this column.

In [14]:
# what kind of information does Vehicle tell us ?
# (random peek at a few rows to see typical values)
data2019.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Direction,Vehicle,Gaps,Delays
1061,2019-03-29,301,01:02:00,Friday,Queen and Carlaw,Mechanical,W/B,4021.0,18.0,9.0
291,2019-10-10,512,15:32:00,Thursday,Bathurst and St.Clair,Held By,W/B,4527.0,9.0,5.0
137,2019-01-05,504,08:11:00,Saturday,Broadview Stn,Mechanical,W/B,4465.0,18.0,9.0
461,2019-09-17,501,15:30:00,Tuesday,Queen/Roncesvalles,Mechanical,W/B,4458.0,15.0,8.0
71,2019-08-03,504,15:08:00,Saturday,King and Bay,Held By,B/W,4455.0,226.0,222.0


The readme says it's the vehicle number. I'm not sure I'm interested in that kind of information.

So let's drop the entire column, instead of the rows with nan.

In [15]:
# the vehicle number is not needed for this analysis: drop the whole column
data2019 = data2019.drop(columns=["Vehicle"])
data2019.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Direction,Gaps,Delays
615,2019-05-23,506,17:03:00,Thursday,Mccaul and College,Mechanical,E/B,51.0,45.0
946,2019-03-25,501,11:30:00,Monday,Queen & Ossington,Investigation,E/B,23.0,19.0
150,2019-07-07,501,12:54:00,Sunday,Russell Yard,Mechanical,E/B,12.0,6.0
1133,2019-02-23,505,18:05:00,Saturday,Dundas and Sherbourne,Investigation,E/B,10.0,5.0
349,2019-09-13,501,08:08:00,Friday,Leslie and Queen,Held By,W/B,37.0,30.0


Dropped as expected.

Next is Location --> what's going on there ? 

In [16]:
# random peek at a few rows to see what Location values look like
data2019.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Direction,Gaps,Delays
609,2019-02-14,504,05:23:00,Thursday,Leslie Barns,Late Leaving Garage,W/B,8.0,4.0
682,2019-08-26,501,09:02:00,Monday,Queen / Church,Mechanical,E/B,16.0,8.0
535,2019-02-12,504,16:32:00,Tuesday,Sumach/Front,Mechanical,W/B,20.0,13.0
66,2019-02-01,504,23:03:00,Friday,King/Parliament,Held By,W/B,15.0,7.0
723,2019-10-25,501,12:13:00,Friday,Queen & Spadina,Investigation,E/B,21.0,14.0


Location is an intersection - it's interesting information so it is worth keeping (even though I don't have any interest in it right now).

With the nature of this, NaN means the location is unknown so instead of a NaN, let's say Unknown. 

In [17]:
# how many NaN are there ? (count of missing values in each column)
data2019.isna().sum()

Report Date      0
Route            0
Time             0
Day              0
Location        24
Incident         0
Direction      118
Gaps             0
Delays           0
dtype: int64

24 NaN values in this column.

In [18]:
# Label missing locations explicitly as 'Unknown'.
# fillna accepts a string fill value directly, so no numeric placeholder is needed.
# The result is assigned back to the column: calling fillna/replace with
# inplace=True on a column selection is chained assignment, which pandas warns
# about and which does not update the frame under Copy-on-Write.
data2019["Location"] = data2019["Location"].fillna("Unknown")

# check to see if this worked: number of rows now labelled 'Unknown'
(data2019["Location"] == "Unknown").sum()

24

Which is identical to the previous number of nan values in Location.

ü§©üíÉ

Next on the list ???

In [19]:
# which columns still contain missing values ?
data2019.isna().sum()

Report Date      0
Route            0
Time             0
Day              0
Location         0
Incident         0
Direction      118
Gaps             0
Delays           0
dtype: int64

Directions

This one promises some trickiness. 

In [20]:
# number of distinct non-null spellings of Direction, then the spellings themselves
direction = data2019["Direction"]
print(direction.nunique())
direction.unique()

31


array(['E/B', 'N/B', 'W/B', 'S/B', 'B/W', '26', nan, 'EB', 'EW', 'wb',
       'eb', 'WB', '1573', 'nb', 'NB', 'SB', 'w/b', '-', 'sb', 'bw',
       '31087', 'NBN', 'BW', 'Eb', '9', 'w', 'WN', '6', '3', 'E', 'W',
       'e/b'], dtype=object)

31 directions. 

That's 26 more than necessary - Northbound/Southbound, Eastbound/Westbound and Unknown. 

In [21]:
# okay let's try first to replace digits (and a lone dash) by Unknown
# - raw string: "\d" inside a normal string literal is an invalid escape sequence
# - anchored with ^...$ so only whole values are replaced, never a fragment of one
# - assigned back to the column: replace(inplace=True) on a column selection is
#   chained assignment and does not update the frame under Copy-on-Write
data2019["Direction"] = data2019["Direction"].replace(
    to_replace=r"^(?:\d+|-)$", value="Unknown", regex=True
)
data2019["Direction"].unique()

array(['E/B', 'N/B', 'W/B', 'S/B', 'B/W', 'Unknown', nan, 'EB', 'EW',
       'wb', 'eb', 'WB', 'nb', 'NB', 'SB', 'w/b', 'sb', 'bw', 'NBN', 'BW',
       'Eb', 'w', 'WN', 'E', 'W', 'e/b'], dtype=object)

That's already a bit better.

In [22]:
# now doing regex for what is easily understood as Northbound 
data2019.Direction.replace(to_replace= '[Nn]\/?[Bb]\w?', value= 'Northbound',\
                          regex= True, inplace= True)
data2019.Direction.unique()

array(['E/B', 'Northbound', 'W/B', 'S/B', 'B/W', 'Unknown', nan, 'EB',
       'EW', 'wb', 'eb', 'WB', 'SB', 'w/b', 'sb', 'bw', 'BW', 'Eb', 'w',
       'WN', 'E', 'W', 'e/b'], dtype=object)

In [23]:
# now doing regex for Southbound
data2019.Direction.replace(to_replace= '[Ss]\/?[Bb]\w?', value= 'Southbound',\
                          regex= True, inplace= True)
data2019.Direction.unique()

array(['E/B', 'Northbound', 'W/B', 'Southbound', 'B/W', 'Unknown', nan,
       'EB', 'EW', 'wb', 'eb', 'WB', 'w/b', 'bw', 'BW', 'Eb', 'w', 'WN',
       'E', 'W', 'e/b'], dtype=object)

In [24]:
# Westbound
data2019.Direction.replace(to_replace= '[Ww]\/?[Bb]|[Bb]\/?[Ww]', value= 'Westbound',\
                          regex= True, inplace= True)
data2019.Direction.unique()

array(['E/B', 'Northbound', 'Westbound', 'Southbound', 'Unknown', nan,
       'EB', 'EW', 'eb', 'Eb', 'w', 'WN', 'E', 'W', 'e/b'], dtype=object)

In [25]:
# Eastbound 
data2019.Direction.replace(to_replace= '[Ee]\/?[Bb]', value= 'Eastbound',\
                          regex= True, inplace= True)
data2019.Direction.unique()

array(['Eastbound', 'Northbound', 'Westbound', 'Southbound', 'Unknown',
       nan, 'EW', 'w', 'WN', 'E', 'W'], dtype=object)

There are still a few values that weren't caught by the regex and that will need to be taken care of manually. 

In [26]:
# Leftover spellings the regex passes did not catch, mapped by exact (non-regex) match.
leftover_directions = {
    "E": "Eastbound",   # seems to be Eastbound
    "W": "Westbound",   # the remaining Westbound spellings
    "w": "Westbound",
    "WN": "Unknown",    # 'WN' & 'EW': no idea what they are
    "EW": "Unknown",
}

# One pass over the column, assigned back to the frame. The previous version chained
# replace/fillna(inplace=True) calls on a column selection, which pandas warns about
# and which does not update the frame under Copy-on-Write. fillna takes the string
# directly, so no numeric placeholder is needed for the missing directions.
data2019["Direction"] = (
    data2019["Direction"]
    .replace(leftover_directions)
    .fillna("Unknown")
)

# checking that everything worked fine
# as well as the proportion of incidents in each direction
data2019["Direction"].value_counts(normalize=True, dropna=False)

Westbound     0.472900
Eastbound     0.362229
Southbound    0.076839
Northbound    0.076671
Unknown       0.011362
Name: Direction, dtype: float64

In [27]:
# the normalized value counts in a more readable format
direction_share = data2019["Direction"].value_counts(normalize=True)
for direction_name in direction_share.index:
    # label-based lookup of the share, rounded to a whole percentage
    percentage = round(direction_share[direction_name] * 100)
    print(f"{percentage}% of {direction_name} incidents")

47.0% of Westbound incidents
36.0% of Eastbound incidents
8.0% of Southbound incidents
8.0% of Northbound incidents
1.0% of Unknown incidents


In [28]:
# actual values are also interesting: raw number of incidents per direction
data2019.Direction.value_counts()

Westbound     5619
Eastbound     4304
Southbound     913
Northbound     911
Unknown        135
Name: Direction, dtype: int64

Most of the incidents seem to be on either Westbound or Eastbound routes. 

In [29]:
# final check: missing values per column
data2019.isna().sum()

Report Date    0
Route          0
Time           0
Day            0
Location       0
Incident       0
Direction      0
Gaps           0
Delays         0
dtype: int64

NICE !!! 🤩

Now that there are no nan left, working with proper dtypes should be a lot easier.

----------
----------
### Data cleaning - data types

In [30]:
# current dtype of every column
data2019.dtypes

Report Date    datetime64[ns]
Route                   int64
Time                   object
Day                    object
Location               object
Incident               object
Direction              object
Gaps                  float64
Delays                float64
dtype: object

In [31]:
# let's understand these dtypes in context
# (random peek at a few rows to compare the values with the dtypes listed above)
data2019.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Direction,Gaps,Delays
542,2019-10-20,509,08:59:00,Sunday,Bathurst and Fleet,General Delay,Westbound,18.0,9.0
298,2019-01-09,506,20:57:00,Wednesday,Gerrard/Parliament,Diversion,Westbound,84.0,77.0
102,2019-02-02,501,11:10:00,Saturday,leslie barns,Late Leaving Garage,Westbound,19.554898,2.0
1019,2019-01-24,504,06:46:00,Thursday,Leslie Barns,Late Leaving Garage,Westbound,1.0,6.0
712,2019-07-25,510,17:34:00,Thursday,Spadina Bremmer,Investigation,Northbound,11.0,7.0


- ✅ report date is datetime as expected
- ❌ route should be a string instead of an integer (even though I agree, Python, it looks like an int)
- ❌ time should be datetime
- ❌ day should be a string
- ❌ location should be a string too
- ❌ idem for incident
- ❌ idem for Direction
- ✅ gaps and delays are float as expected

In [32]:
# let's try the newly found df.convert_dtypes(): it returns a new frame whose
# columns are converted to the best-fitting dtypes that support pd.NA
data2019 = data2019.convert_dtypes()
data2019.dtypes

Report Date    datetime64[ns]
Route                   Int64
Time                   object
Day                    string
Location               string
Incident               string
Direction              string
Gaps                  float64
Delays                float64
dtype: object

Wow... Quite a change !! 🤩

Now I just need to figure out Time and Route and that's it. 

Let's focus on Time first. 

In [33]:
# first refreshing memory on what Time looks like (two random rows)
data2019.sample(2)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Direction,Gaps,Delays
210,2019-12-06,505,08:46:00,Friday,Dundas and Lansdowne,Investigation,Eastbound,10.0,5.0
1220,2019-01-28,501,04:26:00,Monday,Russell Yard,Mechanical,Eastbound,1.0,1.0


Time is in 24-hour format, written out as hh:mm:ss.

In [34]:
# first making a copy so I don't have to reset all the way up:
# data2019 stays untouched as a checkpoint, further changes go to `data`
data = data2019.copy()

I only really care for the hour of the report so I'll extract the hour and put it in a column of its own. 

In [35]:
# 1. make sure every Time value is text so the .str accessor can be used
data["Time"] = data["Time"].astype("str")

# 2. the hour is whatever precedes the first ':' in hh:mm:ss
data["Report Hour"] = data["Time"].str.split(":").str[0]

# 3. text -> number for further analysis; anything that cannot be parsed becomes NaN
data["Report Hour"] = pd.to_numeric(data["Report Hour"], errors="coerce")

# 4. checking if all of this worked: missing values first, then dtypes
print(data.isna().sum())
print("----")
data.dtypes

Report Date    0
Route          0
Time           0
Day            0
Location       0
Incident       0
Direction      0
Gaps           0
Delays         0
Report Hour    2
dtype: int64
----


Report Date    datetime64[ns]
Route                   Int64
Time                   object
Day                    string
Location               string
Incident               string
Direction              string
Gaps                  float64
Delays                float64
Report Hour           float64
dtype: object

Report Hour was cast to a float as expected, although it created two NaN in the process due to errors= 'coerce'. 

In [36]:
# drop the rows whose Report Hour could not be parsed, then confirm no NaN is left
data = data.dropna()
data.isna().sum()

Report Date    0
Route          0
Time           0
Day            0
Location       0
Incident       0
Direction      0
Gaps           0
Delays         0
Report Hour    0
dtype: int64

One more thing to take care of is to extract and separate year, month and day in the report date.

In [37]:
# split Report Date into its calendar parts, one new column per part
report_dates = pd.DatetimeIndex(data["Report Date"])
date_part_columns = {
    "Report Year": report_dates.year,
    "Report Month": report_dates.month,
    "Report Day": report_dates.day,
    "Report Quarter": report_dates.quarter,
}
for column_name, part_values in date_part_columns.items():
    data[column_name] = part_values
data.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Direction,Gaps,Delays,Report Hour,Report Year,Report Month,Report Day,Report Quarter
308,2019-08-12,504,13:08:00,Monday,King/Dufferin,Investigation,Westbound,15.0,8.0,13.0,2019,8,12,3
167,2019-02-03,501,08:03:00,Sunday,Neville prk loop,Held By,Westbound,19.0,15.0,8.0,2019,2,3,1
297,2019-10-10,511,20:52:00,Thursday,Bathurst Station,Investigation,Eastbound,14.0,7.0,20.0,2019,10,10,4
780,2019-03-20,504,05:47:00,Wednesday,Queen St E and Leslie St,Late Leaving Garage,Westbound,8.0,4.0,5.0,2019,3,20,1
773,2019-04-26,501,16:22:00,Friday,Queen Street East Ontrario,Investigation,Eastbound,24.0,16.0,16.0,2019,4,26,2


In [38]:
# another random peek, now including the new Report * columns
data.sample(5)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Direction,Gaps,Delays,Report Hour,Report Year,Report Month,Report Day,Report Quarter
39,2019-07-02,501,15:16:00,Tuesday,Queen and Bathurst,Investigation,Eastbound,18.0,9.0,15.0,2019,7,2,3
937,2019-01-22,501,14:44:00,Tuesday,Queen/Woodbine,Mechanical,Eastbound,8.0,4.0,14.0,2019,1,22,1
258,2019-12-08,505,06:38:00,Sunday,Dundas and Bathurst,Mechanical,Eastbound,19.0,9.0,6.0,2019,12,8,4
1161,2019-02-24,501,14:44:00,Sunday,Lakeshore and Hillside,Mechanical,Westbound,20.0,10.0,14.0,2019,2,24,1
949,2019-03-25,512,14:49:00,Monday,Gunns Loop,Held By,Eastbound,15.0,10.0,14.0,2019,3,25,1


In [39]:
# checking to see the data type of each column, including the new Report * ones
data.dtypes

Report Date       datetime64[ns]
Route                      Int64
Time                      object
Day                       string
Location                  string
Incident                  string
Direction                 string
Gaps                     float64
Delays                   float64
Report Hour              float64
Report Year                int64
Report Month               int64
Report Day                 int64
Report Quarter             int64
dtype: object

I want the report columns (along with Incident, Direction and Day) cast as categories; the date parts are currently integers and Report Hour is a float.

In [40]:
# columns that hold a limited set of labels: cast them all to category in one go
categorical_columns = [
    "Incident",
    "Direction",
    "Report Hour",
    "Day",
    "Report Year",
    "Report Month",
    "Report Day",
    "Report Quarter",
]
data = data.astype({column: "category" for column in categorical_columns})
data.dtypes

Report Date       datetime64[ns]
Route                      Int64
Time                      object
Day                     category
Location                  string
Incident                category
Direction               category
Gaps                     float64
Delays                   float64
Report Hour             category
Report Year             category
Report Month            category
Report Day              category
Report Quarter          category
dtype: object

I'd now like to export this clean data to a file that is easy to read back into a df.

In [41]:
# first making a copy of the data
streetcars = data.copy()

# then exporting it to .csv
# to_csv does not create missing folders, so make sure the target directory exists
clean_data_path = Path("clean_data") / "streetcars_clean.csv"
clean_data_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: a CSV file stores no dtype information, so the category dtypes set earlier
# have to be re-applied after reading the file back in.
streetcars.to_csv(path_or_buf=clean_data_path, index=False)