In [11]:
# import dependencies
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium import webdriver
from zipfile import ZipFile
import urllib.request
import os

In [14]:
# since we can't use requests to get links from the html,
# we must use chromedriver to emulate visiting the page
browser = webdriver.Chrome()
browser.get('https://s3.amazonaws.com/tripdata/index.html')

In [34]:
# get all the anchors in the page
anchors = browser.find_elements_by_tag_name('a')

# create list of all the links
links_all = [link.get_attribute('href') for link in anchors]

# create list of 2019 NYC links
links_2019 = [link for link in links_all if 'JC' not in link and '2019' in link]

# preview links
links_2019

['https://s3.amazonaws.com/tripdata/201901-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201902-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201903-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201904-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201905-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201906-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201907-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201908-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201909-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201910-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201911-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201912-citibike-tripdata.csv.zip']

In [35]:
link = links_2019[0]
# open link, download as temporary zip file, create dataframe
url = urllib.request.urlopen(link)
temp_file = open('temp.zip', 'wb').write(url.read()).close()

In [36]:
df = pd.read_csv('temp.zip')

In [41]:
# rename columns
df.columns = ['Trip Duration', 'Start Time', 'Stop Time', 'Start Station ID', 'Start Station Name', 'Start Station Latitude', 'Start Station Longitude', 'End Station ID', 'End Station Name', 'End Station Latitude', 'End Station Longitude', 'Bike ID', 'User Type', 'Birth Year', 'Gender']

In [47]:
# function for creating dataframe for each link
def create_df(link):
    
    # open link, download as temporary zip file, and create dataframe
    url = urllib.request.urlopen(link)
    temp_file = open('temp.zip', 'wb').write(url.read())
    df = pd.read_csv('temp.zip')
        
    # rename columns
    df.columns = ['Trip Duration', 
                  'Start Time', 
                  'Stop Time', 
                  'Start Station ID', 
                  'Start Station Name', 
                  'Start Station Latitude', 
                  'Start Station Longitude', 
                  'End Station ID', 
                  'End Station Name', 
                  'End Station Latitude', 
                  'End Station Longitude', 
                  'Bike ID', 
                  'User Type', 
                  'Birth Year', 
                  'Gender']
    
    # print link for confirmation
    print(link)
    
    # remove temperary file
    os.remove('temp.zip')
    
    return df

In [55]:
df.dtypes

Trip Duration                int64
Start Time                  object
Stop Time                   object
Start Station ID           float64
Start Station Name          object
Start Station Latitude     float64
Start Station Longitude    float64
End Station ID             float64
End Station Name            object
End Station Latitude       float64
End Station Longitude      float64
Bike ID                      int64
User Type                   object
Birth Year                   int64
Gender                       int64
dtype: object

In [77]:
# drop NAs
df.dropna(how='any').reset_index(drop=True)

# convert start/stop times to datetime
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['Stop Time'] = pd.to_datetime(df['Stop Time'])
df.dtypes

# add age column
df['Age'] = 2019 - df['Birth Year']

In [79]:
df.describe()

Unnamed: 0,Start Station ID,Start Station Latitude,Start Station Longitude,End Station ID,End Station Latitude,End Station Longitude,Bike ID,Birth Year,Gender,Age
count,967269.0,967287.0,967287.0,967269.0,967287.0,967287.0,967287.0,967287.0,967287.0,967287.0
mean,1620.420794,40.737268,-73.982138,1608.750936,40.737125,-73.98235,28917.729181,1978.841311,1.177713,40.158689
std,1454.90683,0.030353,0.018547,1453.416308,0.030117,0.018604,6223.319197,12.205457,0.465508,12.205457
min,72.0,40.6554,-74.017134,72.0,40.6554,-74.046305,14529.0,1886.0,0.0,16.0
25%,383.0,40.71817,-73.994564,382.0,40.71817,-73.994618,25940.0,1969.0,1.0,30.0
50%,508.0,40.739323,-73.984907,507.0,40.739126,-73.98518,30817.0,1981.0,1.0,38.0
75%,3263.0,40.757148,-73.972826,3263.0,40.7568,-73.973442,34031.0,1989.0,1.0,50.0
max,3728.0,40.863,-73.884,3728.0,40.866,-73.881,37889.0,2003.0,2.0,133.0


In [80]:
# remove trips with a duration longer than 24 hours
df['Trip Duration'] = df[df['Trip Duration'] < 86400]

# remove trips with riders over age 100 (I mean come on)
df['Age'] = df[df['Age'] < 100]

In [84]:
df.dtypes

Trip Duration                      object
Start Time                 datetime64[ns]
Stop Time                  datetime64[ns]
Start Station ID                  float64
Start Station Name                 object
Start Station Latitude            float64
Start Station Longitude           float64
End Station ID                    float64
End Station Name                   object
End Station Latitude              float64
End Station Longitude             float64
Bike ID                             int64
User Type                          object
Birth Year                          int64
Gender                              int64
Age                                object
dtype: object

In [76]:
# drop NAs after filtering the data
df.dropna(how='any').reset_index(drop=True).count()

Trip Duration              967044
Start Time                 967044
Stop Time                  967044
Start Station ID           967044
Start Station Name         967044
Start Station Latitude     967044
Start Station Longitude    967044
End Station ID             967044
End Station Name           967044
End Station Latitude       967044
End Station Longitude      967044
Bike ID                    967044
User Type                  967044
Birth Year                 967044
Gender                     967044
dtype: int64