In [5]:
import datetime

import pandas as pd
from sqlalchemy import create_engine


In [7]:
# Open the local SQLite database and read every row of the mta_data table
# into a DataFrame, then preview the first four rows.
engine = create_engine("sqlite:///mta_data.db")
turnstiles_df = pd.read_sql_query(sql="SELECT * FROM mta_data;", con=engine)
turnstiles_df.head(4)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,03:00:00,REGULAR,7693605,2677444
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,07:00:00,REGULAR,7693608,2677454
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,11:00:00,REGULAR,7693627,2677547
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,15:00:00,REGULAR,7693672,2677621


In [20]:
# Summarize the loaded frame: column dtypes, non-null counts, and memory usage.
turnstiles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1468127 entries, 0 to 1468126
Data columns (total 11 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   C/A       1468127 non-null  object
 1   UNIT      1468127 non-null  object
 2   SCP       1468127 non-null  object
 3   STATION   1468127 non-null  object
 4   LINENAME  1468127 non-null  object
 5   DIVISION  1468127 non-null  object
 6   DATE      1468127 non-null  object
 7   TIME      1468127 non-null  object
 8   DESC      1468127 non-null  object
 9   ENTRIES   1468127 non-null  int64 
 10  EXITS     1468127 non-null  int64 
dtypes: int64(2), object(9)
memory usage: 123.2+ MB


In [21]:
# Normalize the header: trim leading/trailing whitespace from every column label,
# then display the resulting labels.
turnstiles_df.columns = turnstiles_df.columns.str.strip()
turnstiles_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [22]:
# Validate the dates against the intended analysis window (1/29/2022 - 3/19/2022)
# by counting the rows recorded for each DATE value, listed in index order.
turnstiles_df["DATE"].value_counts().sort_index()

01/29/2022    30001
01/30/2022    29901
01/31/2022    30062
02/01/2022    30126
02/02/2022    29997
02/03/2022    29998
02/04/2022    29998
02/05/2022    29986
02/06/2022    29936
02/07/2022    29945
02/08/2022    30012
02/09/2022    30034
02/10/2022    30005
02/11/2022    30086
02/12/2022    30131
02/13/2022    29950
02/14/2022    29975
02/15/2022    30247
02/16/2022    29995
02/17/2022    30317
02/18/2022    30003
02/19/2022    29952
02/20/2022    29927
02/21/2022    29894
02/22/2022    30052
02/23/2022    30146
02/24/2022    29995
02/25/2022    29883
02/26/2022    29921
02/27/2022    29969
02/28/2022    29934
03/01/2022    29966
03/02/2022    29974
03/03/2022    29990
03/04/2022    30025
03/05/2022    29970
03/06/2022    29954
03/07/2022    30126
03/08/2022    29965
03/09/2022    29989
03/10/2022    29943
03/11/2022    30291
03/12/2022    30349
03/13/2022    27218
03/14/2022    30064
03/15/2022    30022
03/16/2022    29963
03/17/2022    30042
03/18/2022    29898
Name: DATE, dtype: int64

### Make Time Series

In [23]:
# Turn the readings into a time series: join the DATE and TIME text columns and
# parse them into a single datetime column. The explicit format documents the
# expected layout (MM/DD/YYYY HH:MM:SS) and makes parsing fail on anything else.
# (`datetime` is imported in the notebook's top import cell, not here.)
turnstiles_df["DATE_TIME"] = pd.to_datetime(
    turnstiles_df["DATE"] + " " + turnstiles_df["TIME"],
    format="%m/%d/%Y %H:%M:%S",
)

In [24]:
# Preview the first rows to confirm the DATE_TIME column is present and parsed.
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,03:00:00,REGULAR,7693605,2677444,2022-03-12 03:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,07:00:00,REGULAR,7693608,2677454,2022-03-12 07:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,11:00:00,REGULAR,7693627,2677547,2022-03-12 11:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,15:00:00,REGULAR,7693672,2677621,2022-03-12 15:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,19:00:00,REGULAR,7693730,2677683,2022-03-12 19:00:00


### Check for Duplicates

In [25]:
# Interlude - check for duplicates.
# Isolate one specific turnstile (C/A + UNIT + SCP + STATION) by combining
# boolean conditions into a mask, then use the mask to select its rows.
mask = (
    turnstiles_df["C/A"].eq("A002")
    & turnstiles_df["UNIT"].eq("R051")
    & turnstiles_df["SCP"].eq("02-00-00")
    & turnstiles_df["STATION"].eq("59 ST")
)

turnstiles_df.loc[mask].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,03:00:00,REGULAR,7693605,2677444,2022-03-12 03:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,07:00:00,REGULAR,7693608,2677454,2022-03-12 07:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,11:00:00,REGULAR,7693627,2677547,2022-03-12 11:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,15:00:00,REGULAR,7693672,2677621,2022-03-12 15:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2022,19:00:00,REGULAR,7693730,2677683,2022-03-12 19:00:00


In [26]:
# Sanity check: every (C/A, UNIT, SCP, STATION, DATE_TIME) combination should be
# unique. Count the rows per combination and show the five largest counts;
# any count above 1 marks a duplicated reading.
(
    turnstiles_df
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])["ENTRIES"]
    .count()
    .reset_index()
    .sort_values("ENTRIES", ascending=False)
    .head(5)
)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES
707530,N506,R022,00-03-02,34 ST-HERALD SQ,2022-02-05 03:00:00,2
1368964,R533,R055,00-03-01,FLUSHING-MAIN,2022-01-29 15:00:00,2
0,A002,R051,02-00-00,59 ST,2022-01-29 03:00:00,1
978747,R141,R031,00-03-02,34 ST-PENN STA,2022-02-28 19:00:00,1
978755,R141,R031,00-03-02,34 ST-PENN STA,2022-03-02 03:00:00,1


In [27]:
# The uniqueness check shows two rows for the same timestamp at
# 34 ST-HERALD SQ and at FLUSHING-MAIN.

# Inspect the 34 ST-HERALD SQ turnstile on the affected day (2022-02-05).
mask = (
    turnstiles_df["C/A"].eq("N506")
    & turnstiles_df["UNIT"].eq("R022")
    & turnstiles_df["SCP"].eq("00-03-02")
    & turnstiles_df["STATION"].eq("34 ST-HERALD SQ")
    & turnstiles_df["DATE_TIME"].dt.date.eq(datetime.date(2022, 2, 5))
)

turnstiles_df.loc[mask].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
1149252,N506,R022,00-03-02,34 ST-HERALD SQ,BDFMNQRW,IND,02/05/2022,03:00:00,REGULAR,870698,1007524,2022-02-05 03:00:00
1149253,N506,R022,00-03-02,34 ST-HERALD SQ,BDFMNQRW,IND,02/05/2022,03:00:00,RECOVR AUD,870666,1007524,2022-02-05 03:00:00
1149254,N506,R022,00-03-02,34 ST-HERALD SQ,BDFMNQRW,IND,02/05/2022,07:00:00,REGULAR,870700,1007539,2022-02-05 07:00:00
1149255,N506,R022,00-03-02,34 ST-HERALD SQ,BDFMNQRW,IND,02/05/2022,11:00:00,REGULAR,870712,1007611,2022-02-05 11:00:00
1149256,N506,R022,00-03-02,34 ST-HERALD SQ,BDFMNQRW,IND,02/05/2022,15:00:00,REGULAR,870762,1007763,2022-02-05 15:00:00


In [28]:
# Inspect the FLUSHING-MAIN turnstile on the affected day (2022-01-29).
mask2 = (
    turnstiles_df["C/A"].eq("R533")
    & turnstiles_df["UNIT"].eq("R055")
    & turnstiles_df["SCP"].eq("00-03-01")
    & turnstiles_df["STATION"].eq("FLUSHING-MAIN")
    & turnstiles_df["DATE_TIME"].dt.date.eq(datetime.date(2022, 1, 29))
)

turnstiles_df.loc[mask2].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
1453861,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,03:00:00,REGULAR,5390246,3015053,2022-01-29 03:00:00
1453862,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,07:00:00,REGULAR,5390298,3015082,2022-01-29 07:00:00
1453863,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,11:00:00,REGULAR,5390516,3015160,2022-01-29 11:00:00
1453864,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,15:00:00,REGULAR,5390653,3015270,2022-01-29 15:00:00
1453865,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,15:00:00,RECOVR AUD,5390653,3015269,2022-01-29 15:00:00


### Remove Duplicates
The duplicate rows don't shift the results, but we remove them anyway to avoid confusion.

In [29]:
# De-duplicate on the turnstile + timestamp key only. DESC is deliberately left
# out of the key: the duplicated readings differ in DESC (REGULAR vs RECOVR AUD),
# so including it would make the duplicates look like distinct rows.
dedup_keys = ["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"]

# Make the surviving row explicit instead of relying on incidental row order:
# move REGULAR readings ahead of the others (the stable sort keeps the original
# order within each group), keep the first row per key, then restore the
# original row order via the index.
turnstiles_df = (
    turnstiles_df
    .assign(_not_regular=turnstiles_df["DESC"].ne("REGULAR"))
    .sort_values("_not_regular", kind="stable")
    .drop_duplicates(subset=dedup_keys, keep="first")
    .drop(columns="_not_regular")
    .sort_index()
)

# Fail loudly if any turnstile still has more than one reading per timestamp.
assert not turnstiles_df.duplicated(subset=dedup_keys).any(), "duplicate readings remain"

In [30]:
# Re-inspect the FLUSHING-MAIN turnstile on 2022-01-29 after de-duplication.
# The mask is rebuilt from the current frame; the output should now show a
# single row for 15:00:00.
mask2 = (
    turnstiles_df["C/A"].eq("R533")
    & turnstiles_df["UNIT"].eq("R055")
    & turnstiles_df["SCP"].eq("00-03-01")
    & turnstiles_df["STATION"].eq("FLUSHING-MAIN")
    & turnstiles_df["DATE_TIME"].dt.date.eq(datetime.date(2022, 1, 29))
)

turnstiles_df.loc[mask2].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
1453861,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,03:00:00,REGULAR,5390246,3015053,2022-01-29 03:00:00
1453862,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,07:00:00,REGULAR,5390298,3015082,2022-01-29 07:00:00
1453863,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,11:00:00,REGULAR,5390516,3015160,2022-01-29 11:00:00
1453864,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,15:00:00,REGULAR,5390653,3015270,2022-01-29 15:00:00
1453866,R533,R055,00-03-01,FLUSHING-MAIN,7,IRT,01/29/2022,19:00:00,REGULAR,5390768,3015443,2022-01-29 19:00:00


In [31]:
# Export the cleaned DataFrame to CSV, row index included (disabled; uncomment to write the file)
#turnstiles_df.to_csv('turnstiles_cleaned.csv')

In [34]:
# Export the cleaned DataFrame to CSV without the row index (disabled; uncomment to write the file)
# turnstiles_df.to_csv('turnstiles_cleaned2.csv', index=False)