# Notes
Assumptions:

---

Time Constraints:
Nonprofit is trying to generate interest for gala happening around the beggining of the summer, we assume street teams would be out canvassing in the three - six preceding months. Jan - June.

Counter Values: Assume 'entries' and 'exits' columns reflect cumulative counts that could only increase as time moved forward. Thus, we removed any rows with negative values in differential columns and values greater than 100000 (Approximately X% of the rows)

Target Metrics:
Did not differentiate between entries and exits for a station, but rather relied on 'total_traffic' to determine which station would have the most foot traffic at a given time.

Steps:
- Read data from turnstile
- Preprocess Data




In [1]:
#Import required packages
from datetime import datetime
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import seaborn as sns

In [2]:
#Reads files in turnstile directory into a files list 
from os import listdir
from os.path import isfile, join
files = [f for f in listdir('./Data/Turnstile_data') if isfile(join('./Data/Turnstile_data', f))]

In [3]:
# Create initial DataFrame
dfs = pd.DataFrame()
for file in files:
    print("Loading {}".format(str("./Data/Turnstile_data/"+file)))
    df = pd.read_csv('./Data/Turnstile_data/' + file)
    dfs = dfs.append(df)    

dfs.info()  
initial_shape = dfs.shape

Loading ./Data/Turnstile_data/turnstile_160507.txt
Loading ./Data/Turnstile_data/turnstile_160514.txt
Loading ./Data/Turnstile_data/turnstile_160521.txt
Loading ./Data/Turnstile_data/turnstile_160528.txt
Loading ./Data/Turnstile_data/turnstile_160604.txt
Loading ./Data/Turnstile_data/turnstile_160611.txt
Loading ./Data/Turnstile_data/turnstile_160618.txt
Loading ./Data/Turnstile_data/turnstile_170506.txt
Loading ./Data/Turnstile_data/turnstile_170513.txt
Loading ./Data/Turnstile_data/turnstile_170520.txt
Loading ./Data/Turnstile_data/turnstile_170527.txt
Loading ./Data/Turnstile_data/turnstile_170603.txt
Loading ./Data/Turnstile_data/turnstile_170610.txt
Loading ./Data/Turnstile_data/turnstile_170617.txt
Loading ./Data/Turnstile_data/turnstile_180505.txt
Loading ./Data/Turnstile_data/turnstile_180512.txt
Loading ./Data/Turnstile_data/turnstile_180519.txt
Loading ./Data/Turnstile_data/turnstile_180526.txt
Loading ./Data/Turnstile_data/turnstile_180602.txt
Loading ./Data/Turnstile_data/t

In [4]:
#Function to preprocess data
def preprocess(df):
    #Standardize column names
    df.columns = df.columns.str.strip()
    
    # Create DATETIME column from 'DATE' and 'TIME' cols
    df['DATETIME'] = pd.to_datetime(df.DATE + " " + df.TIME, 
                                    format="%m/%d/%Y %H:%M:%S")

    #Standardize dates, replace dates not in format MM/DD/YEAR to NaN and remove those rows
    df['DATE'] = pd.to_datetime(df['DATE'], format='%m/%d/%Y', errors='coerce')
    df.dropna(inplace=True)
    
    #Create day of week 'DOW' column from data column
    df['DATE'] = pd.to_datetime(df['DATE'])
    dfs['DOW'] = df['DATE'].dt.weekday_name
       
    #Drop Unncessary columns
    df = df.drop(['C/A','UNIT','LINENAME', 'DIVISION', 'DATE'], axis=1)   
    
    prior_shape = df.shape
    
    # Remove non 'REGULAR' audits from Desc column 
    df.drop(df.DESC != 'REGULAR', inplace = True)
    
    post_shape = df.shape
    desc_rows_removed_perc = (prior_shape[0]-post_shape[0]) / prior_shape[0]  * 100

    print("Percentage of Non Regular Data Removed  = {:08.6f} %".format(desc_rows_removed_perc))
    
    return df

In [5]:
df = preprocess(dfs)

Percentage of Non Regular Data Removed  = 0.001064 %


In [6]:
#Entry and exit data are cumulative, need to adjust to periodic interval counts.
df['DIFFS_ENTRIES'] = df['ENTRIES'].diff()
df['DIFFS_EXIT'] = df['EXITS'].diff()
df.head(5)

Unnamed: 0,SCP,STATION,TIME,DESC,ENTRIES,EXITS,DATETIME,DOW,DIFFS_ENTRIES,DIFFS_EXIT
2,02-00-00,59 ST,08:00:00,REGULAR,5640014.0,1910024.0,2016-04-30 08:00:00,Saturday,,
3,02-00-00,59 ST,12:00:00,REGULAR,5640158.0,1910134.0,2016-04-30 12:00:00,Saturday,144.0,110.0
4,02-00-00,59 ST,16:00:00,REGULAR,5640454.0,1910197.0,2016-04-30 16:00:00,Saturday,296.0,63.0
5,02-00-00,59 ST,20:00:00,REGULAR,5640802.0,1910254.0,2016-04-30 20:00:00,Saturday,348.0,57.0
6,02-00-00,59 ST,00:00:00,REGULAR,5640974.0,1910328.0,2016-05-01 00:00:00,Sunday,172.0,74.0


In [7]:
#Remove turnstile Data in DIFFS_ENTRIES and DIFFS_EXITS that is less than zero and greater than 1e5 (Borderline)
negative_diff_entries = df['DIFFS_ENTRIES'] < 0
df.loc[negative_diff_entries,'DIFFS_ENTRIES'] = np.nan

large_diff_entries =  df['DIFFS_ENTRIES'] > 1e5
df.loc[large_diff_entries,'DIFFS_ENTRIES'] = np.nan

negative_diff_exits = df['DIFFS_EXIT'] < 0
df.loc[negative_diff_exits,'DIFFS_EXIT'] = np.nan

large_diff_exits =  df['DIFFS_EXIT'] > 1e5
df.loc[large_diff_exits,'DIFFS_EXIT'] = np.nan

df.head(5)

Unnamed: 0,SCP,STATION,TIME,DESC,ENTRIES,EXITS,DATETIME,DOW,DIFFS_ENTRIES,DIFFS_EXIT
2,02-00-00,59 ST,08:00:00,REGULAR,5640014.0,1910024.0,2016-04-30 08:00:00,Saturday,,
3,02-00-00,59 ST,12:00:00,REGULAR,5640158.0,1910134.0,2016-04-30 12:00:00,Saturday,144.0,110.0
4,02-00-00,59 ST,16:00:00,REGULAR,5640454.0,1910197.0,2016-04-30 16:00:00,Saturday,296.0,63.0
5,02-00-00,59 ST,20:00:00,REGULAR,5640802.0,1910254.0,2016-04-30 20:00:00,Saturday,348.0,57.0
6,02-00-00,59 ST,00:00:00,REGULAR,5640974.0,1910328.0,2016-05-01 00:00:00,Sunday,172.0,74.0


In [8]:
#Add Column with Total Number of Individuals Entering and Exiting the Station
df['TOTAL_TRAFFIC'] = df['DIFFS_ENTRIES'] + df['DIFFS_EXIT']
df.head(5)

Unnamed: 0,SCP,STATION,TIME,DESC,ENTRIES,EXITS,DATETIME,DOW,DIFFS_ENTRIES,DIFFS_EXIT,TOTAL_TRAFFIC
2,02-00-00,59 ST,08:00:00,REGULAR,5640014.0,1910024.0,2016-04-30 08:00:00,Saturday,,,
3,02-00-00,59 ST,12:00:00,REGULAR,5640158.0,1910134.0,2016-04-30 12:00:00,Saturday,144.0,110.0,254.0
4,02-00-00,59 ST,16:00:00,REGULAR,5640454.0,1910197.0,2016-04-30 16:00:00,Saturday,296.0,63.0,359.0
5,02-00-00,59 ST,20:00:00,REGULAR,5640802.0,1910254.0,2016-04-30 20:00:00,Saturday,348.0,57.0,405.0
6,02-00-00,59 ST,00:00:00,REGULAR,5640974.0,1910328.0,2016-05-01 00:00:00,Sunday,172.0,74.0,246.0


In [9]:
#Remove Unused Data Columns
df.drop(['SCP','DESC','ENTRIES', 'EXITS', 'DIFFS_ENTRIES','DIFFS_EXIT'], axis=1, inplace=True)
df.head(5)

Unnamed: 0,STATION,TIME,DATETIME,DOW,TOTAL_TRAFFIC
2,59 ST,08:00:00,2016-04-30 08:00:00,Saturday,
3,59 ST,12:00:00,2016-04-30 12:00:00,Saturday,254.0
4,59 ST,16:00:00,2016-04-30 16:00:00,Saturday,359.0
5,59 ST,20:00:00,2016-04-30 20:00:00,Saturday,405.0
6,59 ST,00:00:00,2016-05-01 00:00:00,Sunday,246.0


In [10]:
#Percent of data equal to NaN
nulls = df['TOTAL_TRAFFIC'].isnull()
percent_null = len(df.loc[nulls,'TOTAL_TRAFFIC']) / len(df)
f'{percent_null:.2%} of the data is not useable'

'3.40% of the data is not useable'

In [11]:
#Remove Rows with NaN Values that are unusable
df.dropna(inplace=True)

In [12]:
#Assumption
#1-Remove Time Between Midnight and 6 am
#df = df.loc[(df.TIME > '06:00:00') | (df.TIME == '00:00:00')]

#df.sort_values(['STATION','TIME', 'DATETIME'], ascending=[True, True, True])
#post_shape = df.shape
#desc_rows_removed_perc = (1379121-post_shape[0]) / 1379121  * 100

#print("Percentage of Data Removed  = {:08.6f} %".format(desc_rows_removed_perc))