In [1]:
# Importing packages
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sqlalchemy import create_engine

In [3]:
# Accessing data from database
engine = create_engine('sqlite:///mta.db')


In [7]:
pd.read_sql('select * from sqlite_master where type = "table" ;', engine)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,mta_dataset,mta_dataset,2,"CREATE TABLE ""mta_dataset"" (\n\t""C/A""\tTEXT,\n..."


In [8]:
df = pd.read_sql('select * from mta_dataset;',engine)

In [9]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,07/31/2021,00:00:00,REGULAR,7611181,2603110
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,07/31/2021,04:00:00,REGULAR,7611192,2603113
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,07/31/2021,08:00:00,REGULAR,7611197,2603126
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,07/31/2021,12:00:00,REGULAR,7611235,2603178
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,07/31/2021,16:00:00,REGULAR,7611357,2603211


In [10]:
# This script will fetch the data and produce an insightful summary
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    summary['Third Value'] = df.loc[2].values

    return summary

In [11]:
resumetable(df)

Dataset Shape: (2726588, 11)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value
0,C/A,object,0,751,A002,A002,A002
1,UNIT,object,0,469,R051,R051,R051
2,SCP,object,0,224,02-00-00,02-00-00,02-00-00
3,STATION,object,0,379,59 ST,59 ST,59 ST
4,LINENAME,object,0,114,NQR456W,NQR456W,NQR456W
5,DIVISION,object,0,6,BMT,BMT,BMT
6,DATE,object,0,91,07/31/2021,07/31/2021,07/31/2021
7,TIME,object,0,57061,00:00:00,04:00:00,08:00:00
8,DESC,object,0,2,REGULAR,REGULAR,REGULAR
9,ENTRIES,int64,0,1833843,7611181,7611192,7611197


### Data Cleaning

#### Manage Column
   - Standardize column names
   - convert column datatype
   - drop unnecessary rows and columns

In [12]:
def data_formatting(df):
    # Rename columns to remove whitespace
    df.columns = [column.strip() for column in df.columns]
    
    # Create DATETIME column from 'DATE' and 'TIME' columns
    df['DATETIME'] = pd.to_datetime(df['DATE'] + " " + df['TIME'],
                                   format="%m/%d/%Y %H:%M:%S")
    
    # Create DAY_OF_WEEK column from DATETIME
    df['DAY_OF_WEEK'] = pd.to_datetime(df['DATETIME']).dt.dayofweek
    
    # Change dtypes
    df['ENTRIES'] = df.ENTRIES.astype("int")
    df['EXITS'] = df.EXITS.astype("int")
    
    # Remove non-REGULAR values from 'DESC'
    df = df.drop(df.loc[df.DESC != 'REGULAR'].index)
    
    # Remove rows for PATH, RIT, and SRT since only checking on subway data
    df.drop(df.loc[df['DIVISION'].isin(['PTH','RIT','SRT'])].index, inplace=True)
    
    # Drop unnecessary columns
    df = df.drop(['LINENAME', 'DIVISION', 'DESC', 'DATE', 'TIME'], axis=1)
    
    return df

In [13]:
df2 = data_formatting(df)
df2.head(20)

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK
0,A002,R051,02-00-00,59 ST,7611181,2603110,2021-07-31 00:00:00,5
1,A002,R051,02-00-00,59 ST,7611192,2603113,2021-07-31 04:00:00,5
2,A002,R051,02-00-00,59 ST,7611197,2603126,2021-07-31 08:00:00,5
3,A002,R051,02-00-00,59 ST,7611235,2603178,2021-07-31 12:00:00,5
4,A002,R051,02-00-00,59 ST,7611357,2603211,2021-07-31 16:00:00,5
5,A002,R051,02-00-00,59 ST,7611512,2603241,2021-07-31 20:00:00,5
6,A002,R051,02-00-00,59 ST,7611600,2603256,2021-08-01 00:00:00,6
7,A002,R051,02-00-00,59 ST,7611605,2603259,2021-08-01 04:00:00,6
8,A002,R051,02-00-00,59 ST,7611617,2603272,2021-08-01 08:00:00,6
9,A002,R051,02-00-00,59 ST,7611656,2603306,2021-08-01 12:00:00,6


#### Manage Duplicate Data

   - Detecting duplicates
   - Handle duplicates

In [14]:
# Checking for duplicates for a given turnstile for a given datetime
(df2
.groupby(['C/A','UNIT','SCP','STATION','DATETIME'])
.ENTRIES.count()
.reset_index()
.sort_values('ENTRIES', ascending=False)).head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2021-07-31 00:00:00,1
1682892,R158,R084,00-05-01,59 ST COLUMBUS,2021-09-01 00:00:00,1
1682885,R158,R084,00-05-01,59 ST COLUMBUS,2021-08-30 20:00:00,1
1682886,R158,R084,00-05-01,59 ST COLUMBUS,2021-08-31 00:00:00,1
1682887,R158,R084,00-05-01,59 ST COLUMBUS,2021-08-31 04:00:00,1


In [15]:
df2.sort_values(['C/A', 'UNIT', 'SCP', 'STATION', 'DATETIME'],
                      inplace=True, ascending=True)
df2.head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK
0,A002,R051,02-00-00,59 ST,7611181,2603110,2021-07-31 00:00:00,5
1,A002,R051,02-00-00,59 ST,7611192,2603113,2021-07-31 04:00:00,5
2,A002,R051,02-00-00,59 ST,7611197,2603126,2021-07-31 08:00:00,5
3,A002,R051,02-00-00,59 ST,7611235,2603178,2021-07-31 12:00:00,5
4,A002,R051,02-00-00,59 ST,7611357,2603211,2021-07-31 16:00:00,5
5,A002,R051,02-00-00,59 ST,7611512,2603241,2021-07-31 20:00:00,5
6,A002,R051,02-00-00,59 ST,7611600,2603256,2021-08-01 00:00:00,6
7,A002,R051,02-00-00,59 ST,7611605,2603259,2021-08-01 04:00:00,6
8,A002,R051,02-00-00,59 ST,7611617,2603272,2021-08-01 08:00:00,6
9,A002,R051,02-00-00,59 ST,7611656,2603306,2021-08-01 12:00:00,6


### Data Preparation

#### Make sense of entry and exit counts
   - counter resets
   - counter counting backwards

In [16]:
# Create previous datetime, entries and exits columns
df2[['PREV_DATETIME', "PREV_ENTRIES", "PREV_EXITS"]] = (df2
                                                .groupby(['C/A', 'UNIT', 'SCP', 'STATION'])\
                                                       ['DATETIME','ENTRIES','EXITS']
                                                .apply(lambda grp: grp.shift(1)))

  df2[['PREV_DATETIME', "PREV_ENTRIES", "PREV_EXITS"]] = (df2


In [17]:
# Drop the rows for the earliest times in the df
df2.dropna(subset=["PREV_DATETIME"], axis=0, inplace=True)
df2.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS
1,A002,R051,02-00-00,59 ST,7611192,2603113,2021-07-31 04:00:00,5,2021-07-31 00:00:00,7611181.0,2603110.0
2,A002,R051,02-00-00,59 ST,7611197,2603126,2021-07-31 08:00:00,5,2021-07-31 04:00:00,7611192.0,2603113.0
3,A002,R051,02-00-00,59 ST,7611235,2603178,2021-07-31 12:00:00,5,2021-07-31 08:00:00,7611197.0,2603126.0
4,A002,R051,02-00-00,59 ST,7611357,2603211,2021-07-31 16:00:00,5,2021-07-31 12:00:00,7611235.0,2603178.0
5,A002,R051,02-00-00,59 ST,7611512,2603241,2021-07-31 20:00:00,5,2021-07-31 16:00:00,7611357.0,2603211.0


In [18]:
# Functions for entry, exit counts, and traffic counts
def get_entry_counts(row, max_counter):
    counter = abs(row['ENTRIES'] - row['PREV_ENTRIES'])
    # Set anomaly values due to reset of counters to the uniform NaN values
    if counter > max_counter:
        counter = np.nan
    return counter

def get_exit_counts(row, max_counter):
    counter = abs(row['EXITS'] - row['PREV_EXITS'])
    # Set anomaly values due to reset of counters to the uniform NaN values
    if counter > max_counter:
        counter = np.nan
    return counter

def get_counts(df):
    # Set max_counter = 14400, assuming only 1/person/sec/turnstile at a time in a 4HR interval
    # Entry count
    df['entry_count'] = df.apply(get_entry_counts, axis=1, max_counter=14400)
    
    # Exit count
    df['exit_count'] = df.apply(get_exit_counts, axis=1, max_counter=14400)
    
    return df


In [19]:
df3 = get_counts(df2)
df3.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count
1,A002,R051,02-00-00,59 ST,7611192,2603113,2021-07-31 04:00:00,5,2021-07-31 00:00:00,7611181.0,2603110.0,11.0,3.0
2,A002,R051,02-00-00,59 ST,7611197,2603126,2021-07-31 08:00:00,5,2021-07-31 04:00:00,7611192.0,2603113.0,5.0,13.0
3,A002,R051,02-00-00,59 ST,7611235,2603178,2021-07-31 12:00:00,5,2021-07-31 08:00:00,7611197.0,2603126.0,38.0,52.0
4,A002,R051,02-00-00,59 ST,7611357,2603211,2021-07-31 16:00:00,5,2021-07-31 12:00:00,7611235.0,2603178.0,122.0,33.0
5,A002,R051,02-00-00,59 ST,7611512,2603241,2021-07-31 20:00:00,5,2021-07-31 16:00:00,7611357.0,2603211.0,155.0,30.0


In [20]:
# Cleaning NaN values in entry_count
# For each NaN values, replace it with the mean of values before and after the NaN value
entry_list = list(df3['entry_count'])
ind = 0
for i in entry_list:
    if np.isnan(i) == 1:
        entry_list[ind] = np.nanmean([entry_list[ind-2],entry_list[ind-1],entry_list[ind+1],entry_list[ind+2]])
    ind += 1
    
df3['ENTRY_DIFF'] = entry_list

In [21]:
df3['ENTRY_DIFF'].isna().sum()

0

In [22]:
# Cleaning NaN values in exit_count
# For each NaN values, replace it with the mean of values before and after the NaN value
exit_list = list(df3['exit_count'])
ind = 0
for i in exit_list:
    if np.isnan(i) == 1:
        exit_list[ind] = np.nanmean([exit_list[ind-2],exit_list[ind-1],exit_list[ind+1],exit_list[ind+2]])
    ind += 1
    
df3['EXIT_DIFF'] = exit_list

In [23]:
df3['EXIT_DIFF'].isna().sum()

0

In [24]:
# Get total traffic count
df3['TOTAL_TRAFFIC'] = df3['ENTRY_DIFF'] + df3['EXIT_DIFF']
df3.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATETIME,DAY_OF_WEEK,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
1,A002,R051,02-00-00,59 ST,7611192,2603113,2021-07-31 04:00:00,5,2021-07-31 00:00:00,7611181.0,2603110.0,11.0,3.0,11.0,3.0,14.0
2,A002,R051,02-00-00,59 ST,7611197,2603126,2021-07-31 08:00:00,5,2021-07-31 04:00:00,7611192.0,2603113.0,5.0,13.0,5.0,13.0,18.0
3,A002,R051,02-00-00,59 ST,7611235,2603178,2021-07-31 12:00:00,5,2021-07-31 08:00:00,7611197.0,2603126.0,38.0,52.0,38.0,52.0,90.0
4,A002,R051,02-00-00,59 ST,7611357,2603211,2021-07-31 16:00:00,5,2021-07-31 12:00:00,7611235.0,2603178.0,122.0,33.0,122.0,33.0,155.0
5,A002,R051,02-00-00,59 ST,7611512,2603241,2021-07-31 20:00:00,5,2021-07-31 16:00:00,7611357.0,2603211.0,155.0,30.0,155.0,30.0,185.0


In [26]:
# Check TIME intervals
df3['TIME'] = df3['DATETIME'].apply(lambda x: x.time())
print('-'*25)
print(df3.TIME.unique())
print('*'*30)
print(df3.TIME.value_counts())

-------------------------
[datetime.time(4, 0) datetime.time(8, 0) datetime.time(12, 0) ...
 datetime.time(11, 2, 50) datetime.time(11, 4, 11)
 datetime.time(8, 26, 50)]
******************************
20:00:00    230423
12:00:00    230348
16:00:00    230318
04:00:00    230157
08:00:00    230144
             ...  
17:10:14         2
17:11:28         2
17:09:13         1
17:37:03         1
17:12:29         1
Name: TIME, Length: 1371, dtype: int64


In [27]:
# Turn all time intervals to 4-hour frequency
df3 = df3.groupby(['C/A', 'UNIT', 'SCP', 'STATION',
                  pd.Grouper(key='DATETIME', freq='4H'),
                  'DAY_OF_WEEK', 'TIME']).sum().reset_index()
# Double checking on time intervals
df3['TIME'] = df3['DATETIME'].apply(lambda x: x.time())
print(df3.TIME.unique())

[datetime.time(4, 0) datetime.time(8, 0) datetime.time(12, 0)
 datetime.time(16, 0) datetime.time(20, 0) datetime.time(0, 0)]


In [33]:
# Syncing Day_of_week column to the new datetime
df3['DAY_OF_WEEK'] = pd.to_datetime(df3['DATETIME']).dt.dayofweek
df3.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,DAY_OF_WEEK,TIME,ENTRIES,EXITS,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
0,A002,R051,02-00-00,59 ST,2021-07-31 04:00:00,5,04:00:00,7611192,2603113,7611181.0,2603110.0,11.0,3.0,11.0,3.0,14.0
1,A002,R051,02-00-00,59 ST,2021-07-31 08:00:00,5,08:00:00,7611197,2603126,7611192.0,2603113.0,5.0,13.0,5.0,13.0,18.0
2,A002,R051,02-00-00,59 ST,2021-07-31 12:00:00,5,12:00:00,7611235,2603178,7611197.0,2603126.0,38.0,52.0,38.0,52.0,90.0
3,A002,R051,02-00-00,59 ST,2021-07-31 16:00:00,5,16:00:00,7611357,2603211,7611235.0,2603178.0,122.0,33.0,122.0,33.0,155.0
4,A002,R051,02-00-00,59 ST,2021-07-31 20:00:00,5,20:00:00,7611512,2603241,7611357.0,2603211.0,155.0,30.0,155.0,30.0,185.0


In [29]:
#Sanity check
df3.groupby(['C/A','UNIT','SCP','STATION','DATETIME']).sum().reset_index()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,DAY_OF_WEEK,ENTRIES,EXITS,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
0,A002,R051,02-00-00,59 ST,2021-07-31 04:00:00,5,7611192,2603113,7.611181e+06,2.603110e+06,11.0,3.0,11.0,3.0,14.0
1,A002,R051,02-00-00,59 ST,2021-07-31 08:00:00,5,7611197,2603126,7.611192e+06,2.603113e+06,5.0,13.0,5.0,13.0,18.0
2,A002,R051,02-00-00,59 ST,2021-07-31 12:00:00,5,7611235,2603178,7.611197e+06,2.603126e+06,38.0,52.0,38.0,52.0,90.0
3,A002,R051,02-00-00,59 ST,2021-07-31 16:00:00,5,7611357,2603211,7.611235e+06,2.603178e+06,122.0,33.0,122.0,33.0,155.0
4,A002,R051,02-00-00,59 ST,2021-07-31 20:00:00,5,7611512,2603241,7.611357e+06,2.603211e+06,155.0,30.0,155.0,30.0,185.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2509001,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 04:00:00,4,1559789619,1728260617,1.559790e+09,1.728261e+09,2.0,0.0,2.0,0.0,2.0
2509002,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 08:00:00,4,1559789489,1728260640,1.559790e+09,1.728261e+09,130.0,23.0,130.0,23.0,153.0
2509003,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 12:00:00,4,1559789470,1728260649,1.559789e+09,1.728261e+09,19.0,9.0,19.0,9.0,28.0
2509004,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 16:00:00,4,1559789402,1728260696,1.559789e+09,1.728261e+09,68.0,47.0,68.0,47.0,115.0


In [30]:
#Sanity Check
df2.groupby(['C/A','UNIT','SCP','STATION','DATETIME']).sum().reset_index()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES,EXITS,DAY_OF_WEEK,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
0,A002,R051,02-00-00,59 ST,2021-07-31 04:00:00,7611192,2603113,5,7.611181e+06,2.603110e+06,11.0,3.0,11.0,3.0,14.0
1,A002,R051,02-00-00,59 ST,2021-07-31 08:00:00,7611197,2603126,5,7.611192e+06,2.603113e+06,5.0,13.0,5.0,13.0,18.0
2,A002,R051,02-00-00,59 ST,2021-07-31 12:00:00,7611235,2603178,5,7.611197e+06,2.603126e+06,38.0,52.0,38.0,52.0,90.0
3,A002,R051,02-00-00,59 ST,2021-07-31 16:00:00,7611357,2603211,5,7.611235e+06,2.603178e+06,122.0,33.0,122.0,33.0,155.0
4,A002,R051,02-00-00,59 ST,2021-07-31 20:00:00,7611512,2603241,5,7.611357e+06,2.603211e+06,155.0,30.0,155.0,30.0,185.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519674,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 05:00:00,1559789619,1728260617,4,1.559790e+09,1.728261e+09,2.0,0.0,2.0,0.0,2.0
2519675,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 09:00:00,1559789489,1728260640,4,1.559790e+09,1.728261e+09,130.0,23.0,130.0,23.0,153.0
2519676,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 13:00:00,1559789470,1728260649,4,1.559789e+09,1.728261e+09,19.0,9.0,19.0,9.0,28.0
2519677,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 17:00:00,1559789402,1728260696,4,1.559789e+09,1.728261e+09,68.0,47.0,68.0,47.0,115.0


#### Final clean-up and save as csv file

In [31]:
# Last Sanity Check before final clean up
(df3
.groupby(['C/A', 'UNIT', 'SCP', 'STATION', 'DATETIME', 'DAY_OF_WEEK'])
.sum()
.reset_index())

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,DAY_OF_WEEK,ENTRIES,EXITS,PREV_ENTRIES,PREV_EXITS,entry_count,exit_count,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
0,A002,R051,02-00-00,59 ST,2021-07-31 04:00:00,5,7611192,2603113,7.611181e+06,2.603110e+06,11.0,3.0,11.0,3.0,14.0
1,A002,R051,02-00-00,59 ST,2021-07-31 08:00:00,5,7611197,2603126,7.611192e+06,2.603113e+06,5.0,13.0,5.0,13.0,18.0
2,A002,R051,02-00-00,59 ST,2021-07-31 12:00:00,5,7611235,2603178,7.611197e+06,2.603126e+06,38.0,52.0,38.0,52.0,90.0
3,A002,R051,02-00-00,59 ST,2021-07-31 16:00:00,5,7611357,2603211,7.611235e+06,2.603178e+06,122.0,33.0,122.0,33.0,155.0
4,A002,R051,02-00-00,59 ST,2021-07-31 20:00:00,5,7611512,2603241,7.611357e+06,2.603211e+06,155.0,30.0,155.0,30.0,185.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2509001,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 04:00:00,4,1559789619,1728260617,1.559790e+09,1.728261e+09,2.0,0.0,2.0,0.0,2.0
2509002,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 08:00:00,4,1559789489,1728260640,1.559790e+09,1.728261e+09,130.0,23.0,130.0,23.0,153.0
2509003,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 12:00:00,4,1559789470,1728260649,1.559789e+09,1.728261e+09,19.0,9.0,19.0,9.0,28.0
2509004,R730,R431,00-00-04,EASTCHSTER/DYRE,2021-10-29 16:00:00,4,1559789402,1728260696,1.559789e+09,1.728261e+09,68.0,47.0,68.0,47.0,115.0


In [32]:
# Drop unnecessary columns
turnstile_df = df3.drop(['C/A', 'UNIT', 'SCP',
                         'ENTRIES', 'EXITS', 'TIME',
                         'PREV_ENTRIES', 'PREV_EXITS', 
                         'entry_count', 'exit_count'], axis=1)
print(turnstile_df.shape)
turnstile_df.head()

(2519679, 6)


Unnamed: 0,STATION,DATETIME,DAY_OF_WEEK,ENTRY_DIFF,EXIT_DIFF,TOTAL_TRAFFIC
0,59 ST,2021-07-31 04:00:00,5,11.0,3.0,14.0
1,59 ST,2021-07-31 08:00:00,5,5.0,13.0,18.0
2,59 ST,2021-07-31 12:00:00,5,38.0,52.0,90.0
3,59 ST,2021-07-31 16:00:00,5,122.0,33.0,155.0
4,59 ST,2021-07-31 20:00:00,5,155.0,30.0,185.0


In [34]:
turnstile_df.to_csv('cleaned_mta_df.csv')