In [258]:
import numpy as np
import pandas as pd
from datetime import datetime

## Explore the dataset

I use pandas for performance and interoffice sharing. I could very easily implement all of the below code using csv readers, strings + lists, etc - but I have found that sharing DFs is simply easier. Especially when some quants use R and Julia, etc. Doing the processing and analysis with a more manual implementation could potentially have certain performance advantages depending on exact implementation, but I generally think Pandas forces me to write cleaner and more precise code - which allows another quant to make adjustments easily without digging too far into deeply nested loops and function calls.  

In [34]:
# Load the raw turnstile file without treating its first line as a header.
sample_df = pd.read_csv('data/turnstile_130803.txt', header=None)
# Show rows 0-6 so the first known row with NaN (index 6) is visible.
sample_df.iloc[0:7]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,A002,R051,02-00-00,07-27-13,00:00:00,REGULAR,4209603,1443585,07-27-13,04:00:00,...,07-28-13,00:00:00,REGULAR,4210432.0,1443801.0,07-28-13,04:00:00,REGULAR,4210472.0,1443805.0
1,A002,R051,02-00-00,07-28-13,08:00:00,REGULAR,4210490,1443821,07-28-13,12:00:00,...,07-29-13,08:00:00,REGULAR,4211176.0,1444042.0,07-29-13,12:00:00,REGULAR,4211350.0,1444247.0
2,A002,R051,02-00-00,07-29-13,16:00:00,REGULAR,4211586,1444302,07-29-13,20:00:00,...,07-30-13,14:01:00,DOOR OPEN,4213192.0,1444700.0,07-30-13,14:01:35,DOOR OPEN,4213192.0,1444700.0
3,A002,R051,02-00-00,07-30-13,14:01:46,LOGON,4213192,1444700,07-30-13,14:01:49,...,07-30-13,14:07:55,DOOR OPEN,4213192.0,1444700.0,07-30-13,14:11:56,DOOR CLOSE,4213192.0,1444700.0
4,A002,R051,02-00-00,07-30-13,16:00:00,REGULAR,4213333,1444737,07-30-13,20:00:00,...,07-31-13,16:00:00,REGULAR,4214863.0,1445194.0,07-31-13,20:00:00,REGULAR,4215656.0,1445251.0
5,A002,R051,02-00-00,08-01-13,00:00:00,REGULAR,4215894,1445274,08-01-13,04:00:00,...,08-02-13,00:00:00,REGULAR,4217489.0,1445773.0,08-02-13,04:00:00,REGULAR,4217532.0,1445775.0
6,A002,R051,02-00-00,08-02-13,08:00:00,REGULAR,4217563,1445856,08-02-13,12:00:00,...,,,,,,,,,,


Confirm for NaN values

In [39]:
# Per-column flag: True where the column contains at least one NaN.
sample_df.isnull().any()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42     True
dtype: bool

So there are no column names, and we can see that the row at index 6 (the 7th row) has NaN data. At least some rows have NaN in every column after column index 7. Let's explore the field description for more info. This data is from before 10/18/2014, so we will use the pre-10/18/2014 description.

In [40]:
# Read the field description inside a context manager so the file handle is
# closed as soon as the text has been printed, not left for the garbage collector.
with open('data/ts_Field_Description_pre-10-18-2014.txt') as description_file:
    print(description_file.read())

Field Description

C/A,UNIT,SCP,DATE1,TIME1,DESC1,ENTRIES1,EXITS1,DATE2,TIME2,DESC2,ENTRIES2,EXITS2,DATE3,TIME3,DESC3,ENTRIES3,EXITS3,DATE4,TIME4,DESC4,ENTRIES4,EXITS4,DATE5,TIME5,DESC5,ENTRIES5,EXITS5,DATE6,TIME6,DESC6,ENTRIES6,EXITS6,DATE7,TIME7,DESC7,ENTRIES7,EXITS7,DATE8,TIME8,DESC8,ENTRIES8,EXITS8


C/A = Control Area (A002)
UNIT = Remote Unit for a station (R051)
SCP = Subunit Channel Position represents an specific address for a device (02-00-00)
DATEn = Represents the date (MM-DD-YY)
TIMEn = Represents the time (hh:mm:ss) for a scheduled audit event
DEScn = Represent the "REGULAR" scheduled audit event (occurs every 4 hours)
ENTRIESn = The comulative entry register value for a device
EXISTn = The cumulative exit register value for a device



Example:
The data below shows the entry/exit register values for one turnstile at control area (A002) from 03/21/10 at 00:00 hours to 03/28/10 at 20:00 hours


A002,R051,02-00-00,03-21-10,00:00:00,REGULAR,002670738,000917107,03-21-10,04:00

## Generate known columns

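We know that N has a maximum of 8. I am also assuming that `EXISTn` in the field description is a typo, since the header line of the same file uses `EXITS1` through `EXITS8`. The typo is just not fun to work with, so I'm taking the liberty of naming the column `EXITS`.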

In [56]:
# Identifier columns that appear once, at the start of every raw row.
cols_by_id = [
    "ControlArea",
    "RemoteUnit",
    "SCP",
]

# Fields repeated for every audit event in a row, as the field description says.
extra_cols = [
    "DATE",
    "TIME",
    "DESC",
    "ENTRIES",
    "EXITS",  # typo on their part, corrected to `EXITS` for sanity
]

max_n = 8  # Because guide says so
# MTA has already made at least one blatant typo, so be safe and make this work
# for arbitrary datasets: also derive the number of groups from the sample file
# and take the max. Most of this code should be reusable for processing N data
# files for the interactive app.
# `//` keeps the result an integer on both Python 2 and Python 3; plain `/`
# yields a float on Python 3, which cannot be used as a loop bound.
max_n = max(
    max_n,
    # (num cols - known mandatory per row) // size of group
    (len(sample_df.columns) - len(cols_by_id)) // len(extra_cols)
)

# Append one suffixed copy of every repeated field per position in the row
# (zero-based suffix). `range` exists on both Python 2 and 3; `xrange` does not.
for i_position_in_row in range(max_n):
    position_in_row = str(i_position_in_row)
    cols_by_id.extend(
        extra_col + "_" + position_in_row for extra_col in extra_cols
    )
print(cols_by_id)

['ControlArea', 'RemoteUnit', 'SCP', 'DATE_0', 'TIME_0', 'DESC_0', 'ENTRIES_0', 'EXITS_0', 'DATE_1', 'TIME_1', 'DESC_1', 'ENTRIES_1', 'EXITS_1', 'DATE_2', 'TIME_2', 'DESC_2', 'ENTRIES_2', 'EXITS_2', 'DATE_3', 'TIME_3', 'DESC_3', 'ENTRIES_3', 'EXITS_3', 'DATE_4', 'TIME_4', 'DESC_4', 'ENTRIES_4', 'EXITS_4', 'DATE_5', 'TIME_5', 'DESC_5', 'ENTRIES_5', 'EXITS_5', 'DATE_6', 'TIME_6', 'DESC_6', 'ENTRIES_6', 'EXITS_6', 'DATE_7', 'TIME_7', 'DESC_7', 'ENTRIES_7', 'EXITS_7']


# Read in DF

This time with columns! Will still need to format data, join to figure out what ControlArea, RemoteUnit, SCP are in regards to a human readable location.

In [43]:
# Re-read the same file, this time labelling columns with the generated names.
df_raw = pd.read_csv('data/turnstile_130803.txt', names=cols_by_id)

df_raw.head()

Unnamed: 0,ControlArea,RemoteUnit,SCP,DATE_0,TIME_0,DESC_0,ENTRIES_0,EXITS_0,DATE_1,TIME_1,...,DATE_6,TIME_6,DESC_6,ENTRIES_6,EXITS_6,DATE_7,TIME_7,DESC_7,ENTRIES_7,EXITS_7
0,A002,R051,02-00-00,07-27-13,00:00:00,REGULAR,4209603,1443585,07-27-13,04:00:00,...,07-28-13,00:00:00,REGULAR,4210432,1443801,07-28-13,04:00:00,REGULAR,4210472,1443805
1,A002,R051,02-00-00,07-28-13,08:00:00,REGULAR,4210490,1443821,07-28-13,12:00:00,...,07-29-13,08:00:00,REGULAR,4211176,1444042,07-29-13,12:00:00,REGULAR,4211350,1444247
2,A002,R051,02-00-00,07-29-13,16:00:00,REGULAR,4211586,1444302,07-29-13,20:00:00,...,07-30-13,14:01:00,DOOR OPEN,4213192,1444700,07-30-13,14:01:35,DOOR OPEN,4213192,1444700
3,A002,R051,02-00-00,07-30-13,14:01:46,LOGON,4213192,1444700,07-30-13,14:01:49,...,07-30-13,14:07:55,DOOR OPEN,4213192,1444700,07-30-13,14:11:56,DOOR CLOSE,4213192,1444700
4,A002,R051,02-00-00,07-30-13,16:00:00,REGULAR,4213333,1444737,07-30-13,20:00:00,...,07-31-13,16:00:00,REGULAR,4214863,1445194,07-31-13,20:00:00,REGULAR,4215656,1445251


## Figure out ControlArea + RemoteUnit

Let's attempt the join with the stations table first, before we reshape the data into more rows and fewer columns. There are fewer rows right now, so the O(n) join works on a smaller n. No sense in doing more work for the same result.

The station table is an Excel file, but pandas is a champ, so I no longer start by implementing with [`openpyxl`](https://pypi.python.org/pypi/openpyxl) - or, if the old Excel format is needed, either [`xlrd`](https://pypi.python.org/pypi/xlrd) or [`xlwt`](https://pypi.python.org/pypi/xlwt). When the Excel file is preformatted and has a lot of other data, I usually revert to implementing with one of the above for sanity reasons.

But I still prefer writing Excel files with [`XlsxWriter`](https://pypi.python.org/pypi/XlsxWriter) when there are specific formatting requests, colors, charts, etc.

In [57]:
# Station lookup table (Remote / Booth -> Station, Line Name, Division).
stations = pd.read_excel('data/Remote-Booth-Station.xls')
stations.head()

Unnamed: 0,Remote,Booth,Station,Line Name,Division
0,R001,A060,WHITEHALL ST,R1,BMT
1,R001,A058,WHITEHALL ST,R1,BMT
2,R001,R101S,SOUTH FERRY,R1,IRT
3,R002,A077,FULTON ST,ACJZ2345,BMT
4,R002,A081,FULTON ST,ACJZ2345,BMT


Looks like the mappings by values and names are the following:

|Stations Col | Turnstile Col|
|:--|:--|
|Remote|RemoteUnit|
|Booth|ControlArea|

Let's confirm, at least for the head()

In [60]:
# Spot-check the assumed mapping using the pair seen in the turnstile head().
remote_is_r051 = stations['Remote'] == 'R051'
booth_is_a002 = stations['Booth'] == 'A002'
stations.loc[remote_is_r051 & booth_is_a002]

Unnamed: 0,Remote,Booth,Station,Line Name,Division
117,R051,A002,LEXINGTON AVE,456NQR,BMT


### Attempt the Join

Be sure to check if any nan and that the join was complete. Wouldn't put it past the MTA to be missing some or rename since there are already different field descriptions for data prior to 10/18/14

In [103]:
# Inner join: keeps only turnstile rows whose (ControlArea, RemoteUnit) pair
# has a matching (Booth, Remote) pair in the station table.
excel_merged = df_raw.merge(
    stations,
    how="inner",
    left_on=["ControlArea", "RemoteUnit"],
    right_on=["Booth", "Remote"],
)

excel_merged.head()

Unnamed: 0,ControlArea,RemoteUnit,SCP,DATE_0,TIME_0,DESC_0,ENTRIES_0,EXITS_0,DATE_1,TIME_1,...,DATE_7,TIME_7,DESC_7,ENTRIES_7,EXITS_7,Remote,Booth,Station,Line Name,Division
0,A002,R051,02-00-00,07-27-13,00:00:00,REGULAR,4209603,1443585,07-27-13,04:00:00,...,07-28-13,04:00:00,REGULAR,4210472,1443805,R051,A002,LEXINGTON AVE,456NQR,BMT
1,A002,R051,02-00-00,07-28-13,08:00:00,REGULAR,4210490,1443821,07-28-13,12:00:00,...,07-29-13,12:00:00,REGULAR,4211350,1444247,R051,A002,LEXINGTON AVE,456NQR,BMT
2,A002,R051,02-00-00,07-29-13,16:00:00,REGULAR,4211586,1444302,07-29-13,20:00:00,...,07-30-13,14:01:35,DOOR OPEN,4213192,1444700,R051,A002,LEXINGTON AVE,456NQR,BMT
3,A002,R051,02-00-00,07-30-13,14:01:46,LOGON,4213192,1444700,07-30-13,14:01:49,...,07-30-13,14:11:56,DOOR CLOSE,4213192,1444700,R051,A002,LEXINGTON AVE,456NQR,BMT
4,A002,R051,02-00-00,07-30-13,16:00:00,REGULAR,4213333,1444737,07-30-13,20:00:00,...,07-31-13,20:00:00,REGULAR,4215656,1445251,R051,A002,LEXINGTON AVE,456NQR,BMT


### Check Length of df since inner join.

This will tell us if we have some corrections or heuristics to follow

In [178]:
# Compare row counts before and after the inner join to see how many
# turnstile rows found no station.
n_rows_raw = len(df_raw)
n_rows_merged = len(excel_merged)
num_missing = n_rows_raw - n_rows_merged

print("DF originally", n_rows_raw)
print("DF With Stations", n_rows_merged)
print("Missing...", num_missing)

excel_merged.isnull().any()

('DF originally', 29427)
('DF With Stations', 28895)
('Missing...', 532)


ControlArea    False
RemoteUnit     False
SCP            False
DATE_0         False
TIME_0         False
DESC_0         False
ENTRIES_0      False
EXITS_0        False
DATE_1          True
TIME_1          True
DESC_1          True
ENTRIES_1       True
EXITS_1         True
DATE_2          True
TIME_2          True
DESC_2          True
ENTRIES_2       True
EXITS_2         True
DATE_3          True
TIME_3          True
DESC_3          True
ENTRIES_3       True
EXITS_3         True
DATE_4          True
TIME_4          True
DESC_4          True
ENTRIES_4       True
EXITS_4         True
DATE_5          True
TIME_5          True
DESC_5          True
ENTRIES_5       True
EXITS_5         True
DATE_6          True
TIME_6          True
DESC_6          True
ENTRIES_6       True
EXITS_6         True
DATE_7          True
TIME_7          True
DESC_7          True
ENTRIES_7       True
EXITS_7         True
Remote         False
Booth          False
Station        False
Line Name      False
Division     

The entries in the output above that must stay `False` (no NaN) after the join are the station columns:
    
```
Remote         False
Booth          False
Station        False
Line Name      False
Division       False
```

Now technically speaking `532` rows are not that significant out of `29427`, except these are all for certain stations.
`532` X `8` (max entries per row) means `4256` possible entries missing out of:
`29427` X `8` = `235416`. Percentage missing is `4256` / `235416` = ~`1.8`%, which is not too many, but to confirm a real result, we should figure out if we can get these labeled. It is theoretically possible these stations/booths could move a station into the top or bottom of how we define a `"busy"` station.

### Figure out the missing Control Area and Remote Units

In [179]:
# Left join this time: turnstile rows without a station are retained, with NaN
# in the station columns.
missing_with_excel = df_raw.merge(
    stations,
    how="left",
    left_on=["ControlArea", "RemoteUnit"],
    right_on=["Booth", "Remote"],
)

station_is_missing = missing_with_excel['Station'].isnull()
rows_with_missing = missing_with_excel.loc[station_is_missing]

print("Found all missing", len(rows_with_missing) == num_missing)

# Distinct (ControlArea, RemoteUnit) pairs that have no station row.
unlinked_stations_from_df = rows_with_missing[['ControlArea', 'RemoteUnit']].drop_duplicates()

unlinked_stations_from_df

('Found all missing', True)


Unnamed: 0,ControlArea,RemoteUnit
1694,A077,R028
1718,A081,R028
1763,A082,R028
8805,N098,R028
12207,N330,R202
17948,R101,R001
20126,R169,R168
28119,R612,R057


### Check what rows in `Stations` Excel are not found in this data set

Check whether the above are off by a number or letter - a possible typo in their identifiers? Maybe this will yield interesting info. It is very possible that stations have been added to or removed from this list too. I imagine that these booths get periodically retired. Or, in the case of a remodeling like at Fulton Street, they may not put the booths back in the same order, or may increase or decrease (take away, retire, repurpose) the number of booths.

In [180]:
# Distinct (ControlArea, RemoteUnit) pairs present in the turnstile data.
pairs_in_data = df_raw[['ControlArea', 'RemoteUnit']].drop_duplicates()

# Left join from the station side: station rows with no turnstile data get NaN
# in ControlArea / RemoteUnit.
missing_with_excel = stations.merge(
    pairs_in_data,
    how='left',
    left_on=['Booth', 'Remote'],
    right_on=['ControlArea', 'RemoteUnit'],
)

has_no_turnstile_data = missing_with_excel['ControlArea'].isnull()
unlinked_stations_from_stations = missing_with_excel.loc[has_no_turnstile_data]

unlinked_stations_from_stations.head()

Unnamed: 0,Remote,Booth,Station,Line Name,Division,ControlArea,RemoteUnit
2,R001,R101S,SOUTH FERRY,R1,IRT,,
3,R002,A077,FULTON ST,ACJZ2345,BMT,,
4,R002,A081,FULTON ST,ACJZ2345,BMT,,
5,R002,A082,FULTON ST,ACJZ2345,BMT,,
20,R012,N065,34 ST-PENN STA,ACE,IND,,


In [208]:
# Unmatched station rows registered under booth A077.
booth_is_a077 = unlinked_stations_from_stations['Booth'] == 'A077'
unlinked_stations_from_stations.loc[booth_is_a077]

Unnamed: 0,Remote,Booth,Station,Line Name,Division,ControlArea,RemoteUnit
3,R002,A077,FULTON ST,ACJZ2345,BMT,,


In [207]:
# Unmatched station rows registered under booth A082.
booth_is_a082 = unlinked_stations_from_stations['Booth'] == 'A082'
unlinked_stations_from_stations.loc[booth_is_a082]

Unnamed: 0,Remote,Booth,Station,Line Name,Division,ControlArea,RemoteUnit
5,R002,A082,FULTON ST,ACJZ2345,BMT,,


In [238]:
# Row count of the station table before any rows are added; the matching
# "End stations len" print at the bottom of this cell shows the count afterwards.
print("Original stations len", len(stations))
def getUniqueStationData(possible_stations):
    """Collapse candidate station rows into a single station description.

    Parameters
    ----------
    possible_stations : pandas.DataFrame
        Candidate rows; must contain the columns 'Station', 'Division' and
        'Line Name'.

    Returns
    -------
    pandas.DataFrame
        Exactly one row with the columns 'Station', 'Division', 'Line Name'.
        If the candidates differ only in 'Line Name', the line name of the
        first candidate row is used and a notice is printed.

    Raises
    ------
    ValueError
        If the candidates do not agree on a single ('Station', 'Division')
        pair (this includes an empty input).
    """
    unique_station = possible_stations[['Station', 'Division', 'Line Name']].drop_duplicates(
        subset=['Station', 'Division', 'Line Name']
    )

    if len(unique_station) == 1:
        return unique_station

    # Line name may be spelled several ways for one station: arbitrarily keep the first.
    print('Error cannot find unique, returning first. This is due to `Line Name`')

    unique_station = possible_stations[['Station', 'Division']].drop_duplicates(
        subset=['Station', 'Division']
    )

    if len(unique_station) != 1:
        # A real exception type: raising a bare string is itself a TypeError
        # and would hide this message.
        raise ValueError('Error cannot find unique, due to different column')

    # Work on a copy and assign the scalar directly, so the result does not
    # depend on index alignment or on writing into a slice of the input.
    unique_station = unique_station.copy()
    unique_station['Line Name'] = possible_stations['Line Name'].iloc[0]

    return unique_station


# Records collected in a plain list and turned into a DataFrame once at the
# end, instead of appending to a DataFrame (which copies it every time).
filled_records = []

for _, row in unlinked_stations_from_df.iterrows():
    # First look for unmatched station rows with the same booth; only when
    # there are none, fall back to rows with the same remote.
    candidates = unlinked_stations_from_stations.loc[
        unlinked_stations_from_stations['Booth'] == row['ControlArea']
    ]
    if len(candidates) == 0:
        candidates = unlinked_stations_from_stations.loc[
            unlinked_stations_from_stations['Remote'] == row['RemoteUnit']
        ]

    # Neither the booth nor the remote is known: nothing to fill in.
    if len(candidates) == 0:
        continue

    record = getUniqueStationData(candidates).to_dict('records')[0]
    record['Booth'] = row['ControlArea']
    record['Remote'] = row['RemoteUnit']
    filled_records.append(record)

station_data_to_merge = pd.DataFrame(filled_records)

# Mark the rows so we know which ones we created ourselves.
station_data_to_merge['created_by_human'] = True
stations_original = stations.copy(deep=True) # Don't interfere with above cells
stations_original['created_by_human'] = False

stations_added = pd.concat([stations_original, station_data_to_merge], ignore_index=True)
print("End stations len", len(stations_added))

stations_added.tail()

('Original stations len', 768)
Error cannot find unique, returning first. This is due to `Line Name`
('End stations len', 776)


Unnamed: 0,Booth,Division,Line Name,Remote,Station,created_by_human
771,N098,IRT,2345ACJZ,R028,FULTON ST,True
772,N330,IND,MR,R202,63 DR-REGO PARK,True
773,R101,IRT,R1,R001,SOUTH FERRY,True
774,R169,IRT,123,R168,96 ST,True
775,R612,BMT,2345BDNQR,R057,ATLANTIC AVE,True


## Proper Full Join with Stations!

Rebuilt stations above to have all booths and remotes. Used the "heuristic" of matching on a single column, either `Booth` or `Remote`, instead of both like before. While I cannot be 100% confident, I think this is a fair assumption. We marked these rows as manually massaged, so if their total ends up being significant either way we can account for this in later analysis.

In [239]:
# Same inner join as before, but against the station table that includes the
# manually added booth/remote pairs.
df_with_stations = df_raw.merge(
    stations_added,
    how="inner",
    left_on=["ControlArea", "RemoteUnit"],
    right_on=["Booth", "Remote"],
)

df_with_stations.head()

Unnamed: 0,ControlArea,RemoteUnit,SCP,DATE_0,TIME_0,DESC_0,ENTRIES_0,EXITS_0,DATE_1,TIME_1,...,TIME_7,DESC_7,ENTRIES_7,EXITS_7,Booth,Division,Line Name,Remote,Station,created_by_human
0,A002,R051,02-00-00,07-27-13,00:00:00,REGULAR,4209603,1443585,07-27-13,04:00:00,...,04:00:00,REGULAR,4210472,1443805,A002,BMT,456NQR,R051,LEXINGTON AVE,False
1,A002,R051,02-00-00,07-28-13,08:00:00,REGULAR,4210490,1443821,07-28-13,12:00:00,...,12:00:00,REGULAR,4211350,1444247,A002,BMT,456NQR,R051,LEXINGTON AVE,False
2,A002,R051,02-00-00,07-29-13,16:00:00,REGULAR,4211586,1444302,07-29-13,20:00:00,...,14:01:35,DOOR OPEN,4213192,1444700,A002,BMT,456NQR,R051,LEXINGTON AVE,False
3,A002,R051,02-00-00,07-30-13,14:01:46,LOGON,4213192,1444700,07-30-13,14:01:49,...,14:11:56,DOOR CLOSE,4213192,1444700,A002,BMT,456NQR,R051,LEXINGTON AVE,False
4,A002,R051,02-00-00,07-30-13,16:00:00,REGULAR,4213333,1444737,07-30-13,20:00:00,...,20:00:00,REGULAR,4215656,1445251,A002,BMT,456NQR,R051,LEXINGTON AVE,False


In [240]:
# Re-run the row-count comparison against the join with the augmented stations.
n_rows_raw = len(df_raw)
n_rows_with_stations = len(df_with_stations)
num_missing = n_rows_raw - n_rows_with_stations

print("DF originally", n_rows_raw)
print("DF With Stations", n_rows_with_stations)
print("Missing...", num_missing)

df_with_stations.isnull().any()

('DF originally', 29427)
('DF With Stations', 29427)
('Missing...', 0)


ControlArea         False
RemoteUnit          False
SCP                 False
DATE_0              False
TIME_0              False
DESC_0              False
ENTRIES_0           False
EXITS_0             False
DATE_1               True
TIME_1               True
DESC_1               True
ENTRIES_1            True
EXITS_1              True
DATE_2               True
TIME_2               True
DESC_2               True
ENTRIES_2            True
EXITS_2              True
DATE_3               True
TIME_3               True
DESC_3               True
ENTRIES_3            True
EXITS_3              True
DATE_4               True
TIME_4               True
DESC_4               True
ENTRIES_4            True
EXITS_4              True
DATE_5               True
TIME_5               True
DESC_5               True
ENTRIES_5            True
EXITS_5              True
DATE_6               True
TIME_6               True
DESC_6               True
ENTRIES_6            True
EXITS_6              True
DATE_7      

# Reduce Dimensions

In [254]:
# Reshape from one wide row holding up to `max_n` audit events into one row
# per audit event. Records are collected in a list and converted to a
# DataFrame once at the end.
sanitized_rows = []

for i_row, row in df_with_stations.iterrows():

    # `range` exists on both Python 2 and 3; `xrange` does not.
    for i_time_interval in range(max_n):
        time_interval = str(i_time_interval)

        # Stop at the first group without a date; the remaining groups of the
        # row are assumed to be empty padding as well. Truthiness is used
        # rather than `is False`, because an identity check against the
        # builtin bool silently fails for any other boolean type.
        if pd.isnull(row['DATE_' + time_interval]):
            break

        # Rename columns so they all share the same relative formatting.
        sanitized_row = {
            "remote_unit": row["RemoteUnit"],
            "scp": row["SCP"],
            "control_area": row["ControlArea"],
            "date": row['DATE_' + time_interval],
            "entries": row["ENTRIES_" + time_interval],
            "exits": row["EXITS_" + time_interval],
            "description": row['DESC_' + time_interval],
            "time": row['TIME_' + time_interval],

            "division": row['Division'],
            "line_name": row['Line Name'],
            "station": row['Station'],
            "created_by_human": row['created_by_human'],

            # Zero-based position of this audit event within its raw row.
            "interval": i_time_interval,
        }

        sanitized_rows.append(sanitized_row)

df = pd.DataFrame(sanitized_rows)
df.head()

Unnamed: 0,control_area,created_by_human,date,description,division,entries,exits,interval,line_name,remote_unit,scp,station,time
0,A002,False,07-27-13,REGULAR,BMT,4209603,1443585,0,456NQR,R051,02-00-00,LEXINGTON AVE,00:00:00
1,A002,False,07-27-13,REGULAR,BMT,4209643,1443593,1,456NQR,R051,02-00-00,LEXINGTON AVE,04:00:00
2,A002,False,07-27-13,REGULAR,BMT,4209663,1443616,2,456NQR,R051,02-00-00,LEXINGTON AVE,08:00:00
3,A002,False,07-27-13,REGULAR,BMT,4209741,1443687,3,456NQR,R051,02-00-00,LEXINGTON AVE,12:00:00
4,A002,False,07-27-13,REGULAR,BMT,4210004,1443740,4,456NQR,R051,02-00-00,LEXINGTON AVE,16:00:00


In [255]:
# Summary statistics of the reshaped frame.
df.describe()

Unnamed: 0,created_by_human,entries,exits,interval
count,217844,217844.0,217844.0,217844.0
mean,0.01787059,5600449.0,3309910.0,3.391932
std,0.1324814,35516150.0,34941690.0,2.296694
min,False,-931476900.0,-878648000.0,0.0
25%,0,345471.0,203334.2,1.0
50%,0,2027753.0,1233320.0,3.0
75%,0,5163563.0,3670348.0,5.0
max,True,916848700.0,862432200.0,7.0


# Convert Dtypes

In [257]:
# Column dtypes before conversion.
df.dtypes

control_area         object
created_by_human       bool
date                 object
description          object
division             object
entries             float64
exits               float64
interval              int64
line_name            object
remote_unit          object
scp                  object
station              object
time                 object
dtype: object

In [264]:
# --- datetimes ---
# Work on a deep copy so `df` from the cells above stays untouched.
df_corrected = df.copy(deep=True)

# Join date and time with an explicit separator and spell the time layout out
# as %H:%M:%S. The `%X` directive means "the locale's time representation",
# so its meaning can change from machine to machine.
df_corrected['datetime'] = pd.to_datetime(
    df_corrected['date'] + ' ' + df_corrected['time'],
    format="%m-%d-%y %H:%M:%S"
)
df_corrected['date'] = pd.to_datetime(
    df_corrected['date'],
    format="%m-%d-%y"
)

# The time of day now lives in `datetime`.
del df_corrected['time']

# --- numeric ---
# `convert_objects` is deprecated and no longer exists in later pandas
# releases. Going through float first accepts both numeric and numeric-string
# values, then the counters are stored as integers.
for count_col in ('entries', 'exits'):
    df_corrected[count_col] = df_corrected[count_col].astype(float).astype(int)

In [265]:
# Column dtypes after conversion.
df_corrected.dtypes

control_area                object
created_by_human              bool
date                datetime64[ns]
description                 object
division                    object
entries                      int64
exits                        int64
interval                     int64
line_name                   object
remote_unit                 object
scp                         object
station                     object
datetime            datetime64[ns]
dtype: object

In [267]:
# Inspect the last rows of the converted frame.
df_corrected.tail()

Unnamed: 0,control_area,created_by_human,date,description,division,entries,exits,interval,line_name,remote_unit,scp,station,datetime
217839,TRAM2,False,2013-08-02,REGULAR,RIT,5554,121,5,R,R469,00-05-01,RIT-ROOSEVELT,2013-08-02 05:00:00
217840,TRAM2,False,2013-08-02,REGULAR,RIT,5554,121,6,R,R469,00-05-01,RIT-ROOSEVELT,2013-08-02 09:00:00
217841,TRAM2,False,2013-08-02,REGULAR,RIT,5554,121,7,R,R469,00-05-01,RIT-ROOSEVELT,2013-08-02 13:00:00
217842,TRAM2,False,2013-08-02,REGULAR,RIT,5554,121,0,R,R469,00-05-01,RIT-ROOSEVELT,2013-08-02 17:00:00
217843,TRAM2,False,2013-08-02,REGULAR,RIT,5554,121,1,R,R469,00-05-01,RIT-ROOSEVELT,2013-08-02 21:00:00


In [268]:
# Per-column flag: True where the converted frame still contains a NaN.
df_corrected.isnull().any()

control_area        False
created_by_human    False
date                False
description         False
division            False
entries             False
exits               False
interval            False
line_name           False
remote_unit         False
scp                 False
station             False
datetime            False
dtype: bool