In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import os
import pandas as pd
import time
import datetime
import re
import sqlite3 as lite

# Data Collection and Integration

In [2]:
url = 'http://web.mta.info/developers/turnstile.html'
chromedriver = "/Users/Miya/Downloads/chromedriver.exe"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
driver.get(url)
html_source = driver.page_source
soup = BeautifulSoup(html_source, 'html.parser')


# extract data link
pattern = re.compile('^data.')
data_list =  []
for link in soup.find_all('a', href=True):
    if re.match(pattern,link['href']):
        data_list.append('http://web.mta.info/developers/'+link['href'])

In [3]:
#check: data should range from 05/05/2010 to 04/22/2017
print(data_list[0])
print(data_list[-1][-10:-4])

http://web.mta.info/developers/data/nyct/turnstile/turnstile_170422.txt
100505


Two data schemas for before and after 10/18/14. Therefore, we needs to locate from where schema changes

In [5]:
for i, v in enumerate(data_list):
    if v[-10:] == '141011.txt':
        print(i)

132


In [4]:
data_list_prior = data_list[132:]
data_list_post = data_list[:132]

let us create two tables to store the data 

In [None]:
conPost = lite.connect('post.db')
conPre = lite.connect('pre1018.db')

**A.** Collect and integrate data ** AFTER** 10/18/14 

In [None]:
record_cnt = 0  
for link in data_list_post:
    data = pd.read_table(link, sep=',')
    print('%s:%s rows %s columns' % (link[-10:-4],data.shape[0], data.shape[1])) #printing out values makes me feel safe....
    record_cnt += data.shape[0]
    data.to_sql(name='post', con=conPost, flavor='sqlite', if_exists='append')

Check we don't miss anything

In [23]:
with conPost:
    cur = conPost.cursor()
    cur.execute("""select count(*) from post""")
    cnt = cur.fetchall()
    print(cnt)
    print(record_cnt)

[(25304899,)]
25304899


**B.** Collect and integrate data ** BEFORE** 10/18/14 

In [None]:
record_cnt = 0 

col_names = ["C/A","UNIT","SCP","DATE","TIME","DESC","ENTRIES","EXITS"]
for link in data_list_prior:    
    data = pd.read_table(link, sep=',',header = None,usecols = [0,1,2,3,4,5,6,7],names = col_names)
    print('%s:%s rows %s columns' % (link[-10:-4],data.shape[0], data.shape[1]))
    record_cnt += data.shape[0]
    data.to_sql(name='pre1018', con=conPre, flavor='sqlite', if_exists='append')

with conPre:
    cur = conPre.cursor()
    cur.execute("""select count(*) from pre1018""")
    cnt = cur.fetchall()

In [14]:
print(cnt)
print(record_cnt)

[(6747078,)]
6747078


In [69]:
col_names = ["UNIT",'C/A','STATION','LINENAME','DIVISION']
remote = pd.read_excel("http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls", names=col_names).drop_duplicates(['UNIT','C/A'])
remote.to_sql(name='remote', con=conPre, flavor='sqlite', if_exists='replace')

with conPre:
    cur = con.cursor()
    cur.execute("""select count(*) from remote""")
    cnt = cur.fetchall()
    print(cnt)

[(768,)]


In [71]:
with conPre:
    cur = con.cursor()
    cur.execute("""select count(*) from pre1018 left join remote on pre1018.UNIT = remote.UNIT AND pre1018.[C/A] = remote.[C/A] """)
    join_cnt = cur.fetchall()
    print(join_cnt)

[(6747078,)]


In [76]:
with conPre:
    cur = con.cursor()
    cur.execute("""Create table prior as select * from pre1018 left join remote on pre1018.UNIT = remote.UNIT AND pre1018.[C/A] = remote.[C/A]""")
    #preJoin = cur.fetchall()

Now we have two databases: post.db and pre1018.db

1. **post.db** has **one** table called **post**, which stores complete information.

2. **pre1018.db** has **three** tables called **pre1018,remote and prior**. 

3. pre1018 and prior joined according to **C/A(BOOTH) and UNIT**, which gives us table prior. 

4. Table **prior** stores all information we need.

# Data Analysis

1. Which station has the most number of units?

In [None]:
con = lite.connect('post.db')
with con:
    cur = con.cursor()
    cur.execute("""select station, count(unit) from post group by station order by count(unit) desc limit 1""")
    top_station = cur.fetchall()
    print('%s has the most number of units' % top_station)

In [6]:
pd.read_table('http://web.mta.info/developers/data/nyct/turnstile/turnstile_100505.txt', sep=',')

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/15/2017,00:00:00,REGULAR,6136580,2078941
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/15/2017,04:00:00,REGULAR,6136613,2078947
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/15/2017,08:00:00,REGULAR,6136639,2078987
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/15/2017,12:00:00,REGULAR,6136799,2079109
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/15/2017,16:00:00,REGULAR,6137076,2079173
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/15/2017,20:00:00,REGULAR,6137520,2079204
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/16/2017,00:00:00,REGULAR,6137732,2079231
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/16/2017,04:00:00,REGULAR,6137757,2079236
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/16/2017,08:00:00,REGULAR,6137784,2079258
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/16/2017,12:00:00,REGULAR,6137867,2079326


In [6]:
col_names = ["C/A","UNIT","SCP","DATE1","TIME1","DESC1","ENTRIES1","EXITS1","DATE2",
             "TIME2","DESC2","ENTRIES2","EXITS2","DATE3","TIME3","DESC3","ENTRIES3",
             "EXITS3","DATE4","TIME4","DESC4","ENTRIES4","EXITS4","DATE5","TIME5","DESC5",
             "ENTRIES5","EXITS5","DATE6","TIME6","DESC6","ENTRIES6","EXITS6","DATE7","TIME7",
             "DESC7","ENTRIES7","EXITS7","DATE8","TIME8","DESC8","ENTRIES8","EXITS8"]
pd.read_table('http://web.mta.info/developers/data/nyct/turnstile/turnstile_131109.txt', sep=',',
              header = None,usecols = [0,1,2,3,4,5,6,7])#names = col_names)

Unnamed: 0,0,1,2,3,4,5,6,7
0,A002,R051,02-00-00,11-02-13,00:00:00,REGULAR,4343831,1484036
1,A002,R051,02-00-00,11-03-13,07:00:00,REGULAR,4344869,1484285
2,A002,R051,02-00-00,11-04-13,15:00:00,REGULAR,4346138,1484918
3,A002,R051,02-00-00,11-05-13,23:00:00,REGULAR,4348999,1485481
4,A002,R051,02-00-00,11-06-13,11:02:30,DOOR OPEN,4349231,1485833
5,A002,R051,02-00-00,11-07-13,03:00:00,REGULAR,4350818,1486029
6,A002,R051,02-00-00,11-08-13,11:00:00,REGULAR,4352754,1486922
7,A002,R051,02-00-01,11-02-13,00:00:00,REGULAR,4081675,889173
8,A002,R051,02-00-01,11-03-13,07:00:00,REGULAR,4082588,889354
9,A002,R051,02-00-01,11-04-13,15:00:00,REGULAR,4083819,889736
