In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

%config InlineBackend.figure_format = 'svg'
%matplotlib inline 
df = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_190921.txt', parse_dates=[['DATE', 'TIME']])
df['day_of_week'] = df['DATE_TIME'].dt.weekday_name
df.columns

Index(['DATE_TIME', 'C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
       'DESC', 'ENTRIES',
       'EXITS                                                               ',
       'day_of_week'],
      dtype='object')

In [2]:
# Rename exit
df.rename(columns ={"EXITS                                                               ":"EXITS"}, inplace=True)

In [3]:
# Get only regular/ working units. The irregulars could be contributing to outliers
df = df[df['DESC'] == 'REGULAR']

In [4]:
df.describe()

Unnamed: 0,ENTRIES,EXITS
count,204235.0,204235.0
mean,42155940.0,34905720.0
std,215758100.0,199647100.0
min,0.0,0.0
25%,289650.0,129228.0
50%,2038905.0,1178385.0
75%,6615760.0,4534095.0
max,2129093000.0,2123772000.0


In [5]:
# Create dif scores for entries / exits
df['ENTRIES_diff'] = df.ENTRIES.diff()
df['EXITS_diff'] = df.EXITS.diff()
df['TRAFFIC_FLOW'] = df['ENTRIES_diff'] + df['EXITS_diff']

In [6]:
df.columns

Index(['DATE_TIME', 'C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
       'DESC', 'ENTRIES', 'EXITS', 'day_of_week', 'ENTRIES_diff', 'EXITS_diff',
       'TRAFFIC_FLOW'],
      dtype='object')

In [7]:
df.head()

Unnamed: 0,DATE_TIME,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS,day_of_week,ENTRIES_diff,EXITS_diff,TRAFFIC_FLOW
0,2019-09-14 00:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198818,2438323,Saturday,,,
1,2019-09-14 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198834,2438325,Saturday,16.0,2.0,18.0
2,2019-09-14 08:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198847,2438354,Saturday,13.0,29.0,42.0
3,2019-09-14 12:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198929,2438428,Saturday,82.0,74.0,156.0
4,2019-09-14 16:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7199125,2438483,Saturday,196.0,55.0,251.0


In [8]:
df['TRAFFIC_FLOW'].describe()

count    2.042340e+05
mean    -4.715751e+01
std      8.230804e+07
min     -3.959976e+09
25%      2.500000e+01
50%      1.780000e+02
75%      4.920000e+02
max      3.959539e+09
Name: TRAFFIC_FLOW, dtype: float64

In [9]:
# set date_time to index
df.set_index('DATE_TIME', inplace = True)

In [10]:
df.head()

Unnamed: 0_level_0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS,day_of_week,ENTRIES_diff,EXITS_diff,TRAFFIC_FLOW
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-09-14 00:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198818,2438323,Saturday,,,
2019-09-14 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198834,2438325,Saturday,16.0,2.0,18.0
2019-09-14 08:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198847,2438354,Saturday,13.0,29.0,42.0
2019-09-14 12:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7198929,2438428,Saturday,82.0,74.0,156.0
2019-09-14 16:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7199125,2438483,Saturday,196.0,55.0,251.0


In [11]:
df['ENTRIES'].describe()

count    2.042350e+05
mean     4.215594e+07
std      2.157581e+08
min      0.000000e+00
25%      2.896500e+05
50%      2.038905e+06
75%      6.615760e+06
max      2.129093e+09
Name: ENTRIES, dtype: float64

In [12]:
# Replace outliers: delete under zero, replace > 20,000 with mean
# GROUPED BY STATION-- so means are coming by station

df['ENTRIES_diff'] = df.groupby(['STATION']).ENTRIES_diff.transform(
    lambda x: np.where((x<0)|(x>20000),x.mask((x<0)|(x>20000)).mean(),x))

In [13]:
df.ENTRIES_diff.describe()

count    204234.000000
mean        188.768041
std         360.117240
min           0.000000
25%          10.000000
50%          84.000000
75%         254.000000
max       19998.000000
Name: ENTRIES_diff, dtype: float64

In [14]:
df['EXITS_diff'] = df.groupby(['STATION']).EXITS_diff.transform(
    lambda x: np.where((x<0)|(x>20000),x.mask((x<0)|(x>20000)).mean(),x))

In [15]:
df.EXITS_diff.describe()

count    204234.000000
mean        153.670505
std         372.759533
min           0.000000
25%           9.000000
50%          60.000000
75%         182.458537
max       19987.000000
Name: EXITS_diff, dtype: float64

In [16]:
# Look at station level
gp_station = df.groupby('STATION')

In [17]:
gp_station_ENTRIESdiff = df.groupby(['STATION', 'ENTRIES_diff'])

In [None]:
gp_station_ENTRIESdiff.describe()