# Notes
Assumptions:
Time Constraints:
Nonprofit is trying to garner interest for gala happening around the beggining of the summer, we assume street teams would be out canvassing in the three - six preceding months. Jan - June.

Counter Values: Assume 'entries' and 'exits' columns reflect cumulative counts that could only increase as time moved forward. Thus, we removed any rows with negative values in differential columns (Approximately X% of the rows)

Target Metrics:
Did not differentiate between entries and exits for a station, but rather relied on 'total_traffic' to determine which station would have the most foot traffic at a given time.

Steps:
Read data from turnstile
Identify possible missing data
Update Date to Day of Week 
Sort data by date and time
Find differential between time stamp of entries and exists



In [92]:
#Import required packages
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import seaborn as sns

In [107]:
#Reads files in turnstile directory into a files list 
from os import listdir
from os.path import isfile, join
files_Test = [f for f in listdir('./Turnstile_data') if isfile(join('./Turnstile_data', f))]

In [111]:
# Create initial DataFrame
dfs = pd.DataFrame()
for file in files:
    print("Loading {}".format(str("./Turnstile_data/"+file)))
    df = pd.read_csv('./Turnstile_data/' + file)
    dfs = dfs.append(df)    

dfs.info()  

Loading ./Turnstile_data/turnstile_180505.txt
Loading ./Turnstile_data/turnstile_180512.txt
Loading ./Turnstile_data/turnstile_180519.txt
Loading ./Turnstile_data/turnstile_180526.txt
Loading ./Turnstile_data/turnstile_180602.txt
Loading ./Turnstile_data/turnstile_180609.txt
Loading ./Turnstile_data/turnstile_180616.txt
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1379121 entries, 0 to 196977
Data columns (total 11 columns):
C/A                                                                     1379121 non-null object
UNIT                                                                    1379121 non-null object
SCP                                                                     1379121 non-null object
STATION                                                                 1379121 non-null object
LINENAME                                                                1379121 non-null object
DIVISION                                                                1379121 non-nu

In [116]:
df.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/09/2018,00:00:00,REGULAR,6649975,2254182
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/09/2018,04:00:00,REGULAR,6650004,2254188
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/09/2018,08:00:00,REGULAR,6650022,2254209
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/09/2018,12:00:00,REGULAR,6650118,2254289
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/09/2018,16:00:00,REGULAR,6650332,2254345


In [167]:
#Function to preprocess data
def preprocess(df):
    #Standardize column names
    df.columns = df.columns.str.strip()
    
    # Create DATETIME column from 'DATE' and 'TIME' cols
    df['DATETIME'] = pd.to_datetime(df.DATE + " " + df.TIME, 
                                    format="%m/%d/%Y %H:%M:%S")

    #Create day of week 'DOW' column from data column
    df['DOW']= df['DATE'].map(lambda x: returnDay(x))
    
    #Drop Unncessary columns
    df = df.drop(['LINENAME', 'DIVISION', 'DATE'], axis=1)   
    
    print("prior shape: ", df.shape)
    
    # Remove non 'REGULAR' audits from Desc column 
    df.drop(df.DESC != 'REGULAR', inplace = True)
    
    print("post shape: ", df.shape)
    
    return df
    
#Function for modifying date string to day of week in 'mm/dd/year' string
#This function can be improved on significantly
def returnDay(dataString):
    #Define weekdays as tuple
    weekDays = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")   
    
    data = dataString
    month = int(dataString[0:2])
    day = int(dataString[3:5])
    year = int(dataString[6:12])
    dayOfWeek = weekDays[datetime.date(year,month,day).weekday()]
    return dayOfWeek
 

In [168]:
df = preprocess(dfs)

prior shape:  (1379121, 10)
post shape:  (1379107, 10)


In [178]:
df_test = df.sort_values(['STATION','TIME', 'DATETIME'], ascending=[True, True, True])
df.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,TIME,DESC,ENTRIES,EXITS,DOW,DATETIME,diffs_entries,diffs_exit
2,A002,R051,02-00-00,59 ST,08:00:00,REGULAR,6598880,2235863,Saturday,2018-04-28 08:00:00,,
3,A002,R051,02-00-00,59 ST,12:00:00,REGULAR,6598961,2235955,Saturday,2018-04-28 12:00:00,81.0,92.0
4,A002,R051,02-00-00,59 ST,16:00:00,REGULAR,6599175,2236015,Saturday,2018-04-28 16:00:00,214.0,60.0
5,A002,R051,02-00-00,59 ST,20:00:00,REGULAR,6599456,2236074,Saturday,2018-04-28 20:00:00,281.0,59.0
6,A002,R051,02-00-00,59 ST,00:00:00,REGULAR,6599584,2236102,Sunday,2018-04-29 00:00:00,128.0,28.0


In [184]:
df_test.groupby(['TIME','STATION','ENTRIES']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EXITS,diffs_entries,diffs_exit
TIME,STATION,ENTRIES,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00:00:00,1 AV,88741,197233,-2423072.0,-14160940.0
00:00:00,1 AV,90821,199804,545.0,587.0
00:00:00,1 AV,92143,201812,307.0,325.0
00:00:00,1 AV,93557,205732,335.0,466.0
00:00:00,1 AV,95154,209260,459.0,494.0
00:00:00,1 AV,96910,212928,475.0,487.0
00:00:00,1 AV,98853,216644,506.0,520.0
00:00:00,1 AV,100675,220474,-2416599.0,-14169013.0
00:00:00,1 AV,102411,223252,510.0,602.0
00:00:00,1 AV,103652,225233,278.0,325.0


In [174]:
#Determine how many datapoints are missing using isnull
missing_datapoints_sum = df.isnull().sum().sum()
print(missing_datapoints_sum)

2


In [171]:
#Entry and exit data are cumulative, need to adjust to periodic interval counts.
df['diffs_entries'] = df['ENTRIES'].diff()
df['diffs_exit'] = df['EXITS'].diff()