In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import datetime as dt

# Get data from website using scraping

In [2]:
URL = 'https://cycling.data.tfl.gov.uk/'
page = requests.get(URL)

#create bs4 object
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
job_elements = soup.find_all("tr")

- Webpage uses JS to load the table, so we will use selenium first to scrape the content and then BS4 to get links for files - before then using HTTP requests for the content

In [4]:
# commented out - the below code initially saved the flat HTML content in a html file, which is saved in the data folder

# driver = webdriver.Edge()
# driver.get(URL)
# driver.set_window_position(0, 0)
# driver.set_window_size(100000, 200000)
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# time.sleep(5) # wait to load

# # now print the response
# #print(driver.page_source)

# soup = BeautifulSoup(driver.page_source, "html.parser")

# with open("data\\tfl_data.html", "w") as file:
#     file.write(str(soup))

In [5]:
#open html file of the tfl website

with open('data\\tfl_data.html', 'r') as file:
    html_content = file.read()

In [6]:
#convert to BS4 object for scraping

soup = BeautifulSoup(html_content, "html.parser")

In [7]:
# get table with all files in

table = soup.find(id="tbody-content")

In [8]:
#get all URLs from the table for download

all_links = []

for tr in table.find_all('tr'):
    dl = tr.get('data-level')

    #only get level 3 links
    if dl == '3':
        # get a href tag for download link
        a_links = tr.find_all('a', href=True)
        # only get csv files
        if tr.find_all('td')[3].string == "CSV file":
            # only add link where there is a link that exists
            if len(a_links) > 0:
                all_links.append(a_links[0]['href'])

In [9]:
# get the links that are just for the usage stats that are for years 2019-2021

usage_links_all = []

for l in all_links:
    if l[32:43] == 'usage-stats' and (l[-6:-4] == '19' or l[-6:-4] == '20' or l[-6:-4] == '21'):
        usage_links_all.append(l)

In [10]:
#remove the first url which is data for 2018 to first day of 2019
usage_links_all = usage_links_all[1:]

In [11]:
# This is all commented to reduce run time.
# The below code requested all usage stats data and then saved the data in a csv file


# # to allow for csv reading
# storage_options = {'User-Agent': 'Mozilla/5.0'}

# def merge_csv_data(urls):
#     dfs = []
#     for url in urls:
#         # Read CSV data from URL
#         df = pd.read_csv(url, storage_options=storage_options)
#         # Append dataframe to list
#         dfs.append(df)
    
#     # Concatenate all dataframes in the list into one dataframe
#     merged_df = pd.concat(dfs, ignore_index=True)
    
#     return merged_df

# # List of URLs pointing to CSV files
# urls = usage_links_all

# # Call the function and get the merged dataframe
# merged_dataframe = merge_csv_data(urls)

# merged_dataframe.to_csv('data\\all_data.csv')

# Importing data from saved CSV file

In [12]:
df = pd.read_csv('data\\all_data.csv')

In [13]:
df.shape

(31348502, 10)

- Very large dataset, 31 million rows of data!

In [14]:
df.dtypes

Unnamed: 0            int64
Rental Id             int64
Duration              int64
Bike Id               int64
End Date             object
EndStation Id         int64
EndStation Name      object
Start Date           object
StartStation Id       int64
StartStation Name    object
dtype: object

In [15]:
#drop unnamed:0
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [16]:
# convert start and end dates to DateTime values
df['Start Date'] = pd.to_datetime(df['Start Date'], dayfirst=True)
df['End Date'] = pd.to_datetime(df['End Date'], dayfirst=True)

# Begin Exploratory Analysis

- We need to create some features from our dataset
  - Start and end date columns
  - Time of journey
  - Borough / Area of start and end
  - Weekday / Weekend
  - Day of the week

In [17]:
# First rename columns from Start / End Date to Start / End DateTime
df.rename(columns = {
    'Start Date' : 'Start DateTime',
    'End Date' : 'End DateTime'
}, inplace = True)

In [18]:
# Extract date object from datetime
df['Start Date'] = df['Start DateTime'].dt.date
df['End Date'] = df['End DateTime'].dt.date

In [19]:
# Start and End Time of Day
df['Start Time'] = df['Start DateTime'].dt.time
df['End Time'] = df['End DateTime'].dt.time

In [20]:
# get area as a string

def get_area(input : str) -> str:
    try:
        return input.split(',')[1][1:]
    except IndexError:
        return input

In [21]:
# Get the area values for analysis

input_array_start = df['StartStation Name'].values
input_array_end = df['EndStation Name'].values

#for performance
get_area_vectorized = np.vectorize(get_area)

result_start_areas = get_area_vectorized(input_array_start)
result_end_areas = get_area_vectorized(input_array_start)

result_series_start_areas = pd.Series(result_start_areas)
result_series_end_areas = pd.Series(result_end_areas)

In [22]:
# assign results from vectorized function as features
df['Start Area'] = result_series_start_areas.values
df['End Area'] = result_series_end_areas.values

In [23]:
# Drop unneccesary columns (ID columns)
df.drop(columns=[
    'Rental Id',
    'Bike Id',
    'EndStation Id',
    'StartStation Id'
], inplace=True)

In [24]:
df.head(5)

Unnamed: 0,Duration,End DateTime,EndStation Name,Start DateTime,StartStation Name,Start Date,End Date,Start Time,End Time,Start Area,End Area
0,660,2019-01-02 17:47:00,"Bricklayers Arms, Borough",2019-01-02 17:36:00,"Stamford Street, South Bank",2019-01-02,2019-01-02,17:36:00,17:47:00,South Bank,South Bank
1,180,2019-01-06 18:14:00,"Bricklayers Arms, Borough",2019-01-06 18:11:00,"Empire Square, The Borough",2019-01-06,2019-01-06,18:11:00,18:14:00,The Borough,The Borough
2,960,2019-01-02 14:49:00,"Waterloo Station 1, Waterloo",2019-01-02 14:33:00,"Sedding Street, Sloane Square",2019-01-02,2019-01-02,14:33:00,14:49:00,Sloane Square,Sloane Square
3,120,2019-01-04 12:54:00,"Empire Square, The Borough",2019-01-04 12:52:00,"Bricklayers Arms, Borough",2019-01-04,2019-01-04,12:52:00,12:54:00,Borough,Borough
4,120,2019-01-05 16:03:00,"Empire Square, The Borough",2019-01-05 16:01:00,"Bricklayers Arms, Borough",2019-01-05,2019-01-05,16:01:00,16:03:00,Borough,Borough


In [25]:
# get the day of the week as a string
def get_day_of_week(date : dt.datetime) -> str:
    weekday_int = date.weekday()

    match weekday_int:
        case 0:
            return "Monday", "Weekday"
        case 1:
            return "Tuesday", "Weekday"
        case 2:
            return "Wednesday", "Weekday"
        case 3:
            return "Thursday", "Weekday"
        case 4:
            return "Friday", "Weekday"
        case 5:
            return "Saturday", "Weekend"
        case 6:
            return "Sunday", "Weekend"

In [26]:
# Get the start date values for analysis

input_array_start = df['Start Date'].values

#for performance
get_weekday_vectorized = np.vectorize(get_day_of_week)

result_start_day = get_weekday_vectorized(input_array_start)[0]
result_start_wkday = get_weekday_vectorized(input_array_start)[1]

result_series_start_day = pd.Series(result_start_day)
result_series_start_wkday = pd.Series(result_start_wkday)

# assign results from vectorized function as features
df['Start Day'] = result_series_start_day.values
df['Start Wkday'] = result_series_start_wkday.values

In [27]:
df.head(5)

Unnamed: 0,Duration,End DateTime,EndStation Name,Start DateTime,StartStation Name,Start Date,End Date,Start Time,End Time,Start Area,End Area,Start Day,Start Wkday
0,660,2019-01-02 17:47:00,"Bricklayers Arms, Borough",2019-01-02 17:36:00,"Stamford Street, South Bank",2019-01-02,2019-01-02,17:36:00,17:47:00,South Bank,South Bank,Wednesday,Weekday
1,180,2019-01-06 18:14:00,"Bricklayers Arms, Borough",2019-01-06 18:11:00,"Empire Square, The Borough",2019-01-06,2019-01-06,18:11:00,18:14:00,The Borough,The Borough,Sunday,Weekend
2,960,2019-01-02 14:49:00,"Waterloo Station 1, Waterloo",2019-01-02 14:33:00,"Sedding Street, Sloane Square",2019-01-02,2019-01-02,14:33:00,14:49:00,Sloane Square,Sloane Square,Wednesday,Weekday
3,120,2019-01-04 12:54:00,"Empire Square, The Borough",2019-01-04 12:52:00,"Bricklayers Arms, Borough",2019-01-04,2019-01-04,12:52:00,12:54:00,Borough,Borough,Friday,Weekday
4,120,2019-01-05 16:03:00,"Empire Square, The Borough",2019-01-05 16:01:00,"Bricklayers Arms, Borough",2019-01-05,2019-01-05,16:01:00,16:03:00,Borough,Borough,Saturday,Weekend


In [28]:
def get_time_of_day(time : dt.time) -> str:
    #define Time strings for threshold comparison -> i.e is the time of start in the morning, afternoon, or day?
    morn_time = dt.time(hour=12, minute=0, second=0)
    aft_time = dt.time(hour=17, minute=0, second=0)

    #convert start time string into time string for comparison
    #start_time = dt.datetime.strptime(time, '%H:%M:%S').strftime('%H:%M:%S')

    time_of_day_return = ''

    if time < morn_time:
        time_of_day_return = 'Morning'
    elif time < aft_time:
        time_of_day_return = 'Afternoon'
    else:
        time_of_day_return = 'Evening'
    
    return time_of_day_return, time.hour

In [29]:
# Get the start date values for analysis

input_array_start = df['Start Time'].values

#for performance
get_weekday_vectorized = np.vectorize(get_time_of_day)

result_start_time = get_weekday_vectorized(input_array_start)[0]
result_start_hour = get_weekday_vectorized(input_array_start)[1]

result_series_start_time = pd.Series(result_start_time)
result_series_start_hour = pd.Series(result_start_hour)

# assign results from vectorized function as features
df['Start Time of Day'] = result_series_start_time.values
df['Start Hour'] = result_series_start_hour.values

In [30]:
df.head(5)

Unnamed: 0,Duration,End DateTime,EndStation Name,Start DateTime,StartStation Name,Start Date,End Date,Start Time,End Time,Start Area,End Area,Start Day,Start Wkday,Start Time of Day,Start Hour
0,660,2019-01-02 17:47:00,"Bricklayers Arms, Borough",2019-01-02 17:36:00,"Stamford Street, South Bank",2019-01-02,2019-01-02,17:36:00,17:47:00,South Bank,South Bank,Wednesday,Weekday,Evening,17
1,180,2019-01-06 18:14:00,"Bricklayers Arms, Borough",2019-01-06 18:11:00,"Empire Square, The Borough",2019-01-06,2019-01-06,18:11:00,18:14:00,The Borough,The Borough,Sunday,Weekend,Evening,18
2,960,2019-01-02 14:49:00,"Waterloo Station 1, Waterloo",2019-01-02 14:33:00,"Sedding Street, Sloane Square",2019-01-02,2019-01-02,14:33:00,14:49:00,Sloane Square,Sloane Square,Wednesday,Weekday,Afternoon,14
3,120,2019-01-04 12:54:00,"Empire Square, The Borough",2019-01-04 12:52:00,"Bricklayers Arms, Borough",2019-01-04,2019-01-04,12:52:00,12:54:00,Borough,Borough,Friday,Weekday,Afternoon,12
4,120,2019-01-05 16:03:00,"Empire Square, The Borough",2019-01-05 16:01:00,"Bricklayers Arms, Borough",2019-01-05,2019-01-05,16:01:00,16:03:00,Borough,Borough,Saturday,Weekend,Afternoon,16


It seems like 