# Function to Automate Web Scraping of Historical Yellow Cab Rides in NYC

In [None]:
import requests
from pathlib import Path

def DownloadOneFileRawData(year:int, month:int) -> Path:
    """Download one month of yellow-taxi trip data into ``../data/raw``.

    Args:
        year: Year of the file to fetch.
        month: Month number; zero-padded to two digits in the URL and file name.

    Returns:
        Path of the parquet file written to disk.

    Raises:
        Exception: If the server does not answer with HTTP 200.
    """
    URL = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet"
    response = requests.get(URL)

    if response.status_code != 200:
        raise Exception(f"{URL} is not available")

    # Return a real Path (as annotated) and let write_bytes open/close the file.
    path = Path(f"../data/raw/rides_{year}-{month:02d}.parquet")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(response.content)
    return path

In [None]:
# Download the January 2022 file; the cell output is the path returned by the function.
DownloadOneFileRawData(2022, 1)

In [None]:
import pandas as pd

# Load the raw January 2022 rides from disk and preview the first ten rows.
raw_rides_file = "../data/raw/rides_2022-01.parquet"
rides = pd.read_parquet(raw_rides_file)

rides.head(10)

In [None]:
# Keep only the pickup timestamp and pickup location ID, under friendlier names.
# Renaming in the same expression returns a fresh frame instead of mutating a
# column subset in place (which can trigger SettingWithCopyWarning).
rides = rides[["tpep_pickup_datetime", "PULocationID"]].rename(
    columns={"tpep_pickup_datetime": "PickupDatetime", "PULocationID": "PickupLocation"}
)

rides

# Data Validation - Checking that all Pickups are actually from the Month they are supposed to be

In [None]:
# Summary statistics of the pickup timestamps, used to spot out-of-month values.
rides["PickupDatetime"].describe()

In [None]:
# Keep only pickups inside January 2022. The upper bound is exclusive so that a
# pickup at exactly 2022-02-01 00:00:00 (which belongs to February) is dropped.
in_january = (rides["PickupDatetime"] >= "2022-01-01") & (rides["PickupDatetime"] < "2022-02-01")
rides = rides[in_january]

rides["PickupDatetime"].describe()

In [None]:
#Writing the Transformed and Validated Data to Disk

In [None]:
# Persist the validated rides as a parquet file in the transformed-data folder.
rides.to_parquet("../data/transformed/rides_2022-01.parquet")

# Data Transformation - Transforming Raw Data into Time-Series Data

In [None]:
import pandas as pd

# Reload the validated January 2022 rides and preview the first ten rows.
validated_rides_file = "../data/transformed/rides_2022-01.parquet"
newrides = pd.read_parquet(validated_rides_file)

newrides.head(10)

In [None]:
# Truncate each pickup timestamp down to the start of its hour.
# "h" is the hourly alias; the uppercase "H" spelling is deprecated since pandas 2.2.
newrides["PickupHour"] = newrides["PickupDatetime"].dt.floor("h")

newrides

In [None]:
# Count the rides of every (hour, location) pair; the count column is called "NumOfRides".
aggrides = (
    newrides
    .groupby(["PickupHour", "PickupLocation"])
    .size()
    .reset_index(name = "NumOfRides")
)
aggrides

In [None]:
#Checking for 0 Rides Rows and Adding Empty Hours when No Rides Occurred

from tqdm import tqdm

def AddMissingSlots(aggrides:pd.DataFrame) -> pd.DataFrame:
    """Fill in the hours without rides so every location has a complete hourly series.

    Args:
        aggrides: Frame with columns "PickupHour", "PickupLocation" and "NumOfRides".

    Returns:
        Frame with columns "PickupHour", "NumOfRides" and "PickupLocationID" holding,
        for each location, one row per hour between the earliest and the latest
        "PickupHour" found in ``aggrides``; hours absent from the input get 0 rides.
    """
    locations = aggrides["PickupLocation"].unique()
    # "h" is the hourly alias; the uppercase "H" spelling is deprecated since pandas 2.2.
    full_range = pd.date_range(aggrides["PickupHour"].min(), aggrides["PickupHour"].max(), freq = "h")

    # Collect the per-location frames and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop re-copies all previous rows each pass.
    per_location = []

    for locid in tqdm(locations):

        #Keep only Rides for this Location ID
        aggrides_i = aggrides.loc[aggrides["PickupLocation"] == locid, ["PickupHour", "NumOfRides"]]

        #Adding Missing Dates with 0 in a Series
        aggrides_i = aggrides_i.set_index("PickupHour")
        aggrides_i.index = pd.DatetimeIndex(aggrides_i.index)
        aggrides_i = aggrides_i.reindex(full_range, fill_value = 0)

        #Add Back Location ID Columns
        aggrides_i["PickupLocationID"] = locid

        per_location.append(aggrides_i)

    output = pd.concat(per_location) if per_location else pd.DataFrame()

    #Move the PickupHour from Index to Column
    output = output.reset_index().rename(columns = {"index":"PickupHour"})

    return output

In [None]:
#Creating the New Complete Time-Series DF (one row per hour and location, 0 rides where nothing was recorded)

AggRides = AddMissingSlots(aggrides)

AggRides

# Plotting Data

In [None]:
from typing import Optional, List
import plotly.express as px

def PlotRides(rides:pd.DataFrame, locations:Optional[List[int]] = None):
    """Show a line chart of ride counts over time, one line per pickup location.

    rides     -- frame with "PickupHour", "NumOfRides" and "PickupLocationID" columns
    locations -- location IDs to keep; when None or empty, every location is plotted
    """
    if locations:
        selected = rides[rides["PickupLocationID"].isin(locations)]
    else:
        selected = rides

    line_options = dict(
        x = "PickupHour",
        y = "NumOfRides",
        color = "PickupLocationID",
        template = "none",
    )

    px.line(selected, **line_options).show()

In [None]:
# The following import and setting pick the Plotly renderer explicitly so the figure is displayed
import plotly.io as pio
pio.renderers.default = "iframe" # alternatives: 'colab', 'iframe_connected', 'sphinx_gallery'

# Plot the hourly series of location 43 only
PlotRides(AggRides, locations = [43])

In [None]:
#Saving the Complete Hourly Time-Series to Disk

AggRides.to_parquet("../data/transformed/TransformedData_2022-01.parquet")

# Transforming the Time-Series into Training Data (Tabular Data)

In [None]:
#The Features are the Ride Counts of X Consecutive Rows, each Row counting as one Feature
#The Target is the Row Immediately After the Rows selected as Features
#For Every new Target to Predict we Slide the Window of Rows Forward by 1
#So, the Features on Each Iteration are always X Rows.
#(The Previous Target becomes a Feature when Predicting the Next Target and we Drop the First Row Used)

#EXAMPLE:
#Rolling 12 Features, we Take the first 12 Rows to predict the 13th Row as a Target
#Features Rows Indexes = 0 - 11
#Target Row Index = 12
#NEXT ITERATION
#We Take the 12 Rows from Index 1 to 12 to predict the 14th Row as a Target
#Features Rows Indexes = 1 - 12
#Target Row Index = 13

In [None]:
# Reload the complete hourly time series saved to disk
TransformedData = pd.read_parquet("../data/transformed/TransformedData_2022-01.parquet")
TransformedData

In [None]:
#Picking a Single Location (ID 43) for now

is_location_43 = TransformedData["PickupLocationID"] == 43
TS_OneLocation = TransformedData[is_location_43].reset_index(drop = True)
TS_OneLocation

In [None]:
def GetCutoffIndeces(data:pd.DataFrame, nFeatures:int, SlidingFactor:int) -> list:
    """Compute the row positions of every sliding window over ``data``.

    Each window is a list ``[first, step, last]`` meant for positional slicing:
    rows ``first:step`` are the features and rows ``step:last`` (a single row)
    are the target.

    Args:
        data: Time-ordered rows; only its length is used.
        nFeatures: Number of consecutive rows used as features in each window.
        SlidingFactor: How many rows the window moves forward between samples.

    Returns:
        List of ``[first, step, last]`` lists; empty when ``data`` has fewer
        than ``nFeatures + 1`` rows.

    Raises:
        ValueError: If ``SlidingFactor`` is not positive (the window would never advance).
    """
    if SlidingFactor <= 0:
        raise ValueError("SlidingFactor must be a positive integer")

    # `last` is an exclusive slice bound, so it may equal len(data); stopping at
    # len(data) - 1 would silently drop the window whose target is the final row.
    StopPosition = len(data)

    #[FirstIdx, StepIdx, LastIdx]
    return [
        [first, first + nFeatures, first + nFeatures + 1]
        for first in range(0, StopPosition - nFeatures, SlidingFactor)
    ]

In [None]:
# Window of 24 rows as features, advancing one row between consecutive samples
nFeatures = 24
SlidingFactor = 1

Indeces = GetCutoffIndeces(TS_OneLocation, nFeatures, SlidingFactor)

# Preview the first five [first, step, last] windows
Indeces[:5]

In [None]:
#Implementing the Slicing using the Generated Indeces

import numpy as np
import warnings
warnings.filterwarnings("ignore")  # NOTE: silences every warning for the rest of the session

nSamples = len(Indeces)
# np.empty allocates uninitialised arrays; every cell is filled in the loop below.
X = np.empty(shape = (nSamples, nFeatures), dtype = np.float32)
Y = np.empty(shape = (nSamples,), dtype = np.float32)

# Pull the two columns out once instead of re-slicing the DataFrame on every iteration.
ride_counts = TS_OneLocation["NumOfRides"].to_numpy()
pickup_hours = TS_OneLocation["PickupHour"]

PickupHours = []

for i, (first, step, _last) in enumerate(Indeces):
    X[i,:] = ride_counts[first:step]
    # The target window is the single row at `step`; assigning the scalar avoids
    # NumPy's deprecated conversion of a 1-element array to a scalar.
    Y[i] = ride_counts[step]
    PickupHours.append(pickup_hours.iloc[step])

In [None]:
#Constructing a Features DataFrame with the Columns named as the Number of Rides X Hours Before the Target's Hour
#(columns run from "Rides {nFeatures} Hours Before" down to "Rides 1 Hours Before")

FeaturesOneLocationDF = pd.DataFrame(X, columns = [f"Rides {i+1} Hours Before" for i in reversed(range(nFeatures))])
FeaturesOneLocationDF

In [None]:
#Constructing a Targets DataFrame with the Column named as the Number of Rides AT the Target's Hour

TargetsOneLocationDF = pd.DataFrame(Y, columns = ["Target Rides Next Hour"])
TargetsOneLocationDF

In [None]:
#Defining a Function that Does the Trick for all Locations and Not Just One
#(Transforms all Time-Series for Each of ALL Locations into Tabular Format as Above)

def TransformALL(tsData:pd.DataFrame, nFeatures:int, SlidingFactor:int) -> "tuple[pd.DataFrame, pd.Series]":
    """Turn the time series of every location into tabular features and targets.

    Args:
        tsData: Frame with exactly the columns "PickupHour", "NumOfRides" and "PickupLocationID".
        nFeatures: Number of consecutive ride counts used as features per sample.
        SlidingFactor: Number of rows the window advances between consecutive samples.

    Returns:
        A ``(Features, Targets)`` pair. ``Features`` has one "Rides N Hours Before"
        column per lag (largest N first) plus "PickupHour" (the hour of the target
        row) and "PickupLocationID"; ``Targets`` is the "Target Rides Next Hour"
        series, row-aligned with ``Features``.
    """
    assert set(tsData.columns) == {"PickupHour", "NumOfRides", "PickupLocationID"}

    locationIDs = tsData["PickupLocationID"].unique()
    FeatureColumns = [f"Rides {i+1} Hours Before" for i in reversed(range(nFeatures))]

    # Collect per-location frames and concatenate once at the end; concatenating
    # inside the loop re-copies every previously accumulated row on each pass.
    FeatureFrames = []
    TargetFrames = []

    for locid in tqdm(locationIDs):
        #Keep only Time-Series Data for this Location
        tsDataOneLocation = tsData.loc[tsData["PickupLocationID"] == locid, ["PickupHour", "NumOfRides"]].sort_values(by = ["PickupHour"])

        #Pre-Compute Cutoff Indeces to Split DataFrame Rows
        indeces = GetCutoffIndeces(tsDataOneLocation, nFeatures, SlidingFactor)

        #Slice and Transpose Data into NumPy Arrays for Features and Target
        nSamples = len(indeces)
        X = np.empty(shape=(nSamples, nFeatures), dtype=np.float32)
        Y = np.empty(shape=(nSamples,), dtype=np.float32)
        PickupHours = []

        # Extract the columns once; slicing plain arrays avoids rebuilding a DataFrame per window.
        RideCounts = tsDataOneLocation["NumOfRides"].to_numpy()
        HourColumn = tsDataOneLocation["PickupHour"]

        for i, (first, step, _last) in enumerate(indeces):
            X[i,:] = RideCounts[first:step]
            # The target is the single row at `step`; assigning the scalar avoids
            # NumPy's deprecated conversion of a 1-element array to a scalar.
            Y[i] = RideCounts[step]
            PickupHours.append(HourColumn.iloc[step])

        #NumPy -> Pandas
        FeaturesOneLocationDF = pd.DataFrame(X, columns = FeatureColumns)
        FeaturesOneLocationDF["PickupHour"] = PickupHours
        FeaturesOneLocationDF["PickupLocationID"] = locid

        TargetsOneLocationDF = pd.DataFrame(Y, columns = ["Target Rides Next Hour"])

        FeatureFrames.append(FeaturesOneLocationDF)
        TargetFrames.append(TargetsOneLocationDF)

    # No locations at all: return empty, correctly labelled results instead of failing.
    if not FeatureFrames:
        EmptyFeatures = pd.DataFrame(columns = FeatureColumns + ["PickupHour", "PickupLocationID"])
        EmptyTargets = pd.Series(name = "Target Rides Next Hour", dtype = np.float32)
        return EmptyFeatures, EmptyTargets

    #Concatenate Results (ignore_index renumbers the rows 0..n-1)
    Features = pd.concat(FeatureFrames, ignore_index = True)
    Targets = pd.concat(TargetFrames, ignore_index = True)

    return Features, Targets["Target Rides Next Hour"]

In [None]:
# 24*7 rows of ride counts as features per sample; the window advances 24 rows between samples
Features, Targets = TransformALL(TransformedData, nFeatures = 24*7*1, SlidingFactor = 24) #One Week of Features

In [None]:
# Display the resulting features table
Features