In [1]:
import math
import bs4
import re
import os
import requests
import datetime
import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

In [2]:
# Web page that find_taxi_csv_urls() scrapes for "Yellow Taxi Trip Records" links.
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
# File name of the Uber rides sample CSV.
UBER_CSV = "uber_rides_sample.csv"
# Directory names -- presumably where the weather and taxi data files are
# kept; confirm against the code that reads/writes them.
WEATHER_DIR = 'weather_files'
TAXI_DIR = 'taxi_files'

# Two (latitude, longitude) points describing the New York bounding box.
# NOTE(review): looks like (south-west corner, north-east corner) -- confirm.
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
# Radius that calculate_distance() multiplies the central angle by.
# NOTE(review): the value matches Earth's equatorial radius in kilometres, so
# distances presumably come out in km. The name is a misspelling of "RADIUS";
# it is kept as-is because other code refers to it by this name.
EARTH_REDIUS = 6378.137

# SQLite database URL -- presumably passed to sqlalchemy.create_engine; confirm.
DATABASE_URL = "sqlite:///project.db"
# NOTE(review): presumably the file holding the SQL schema and the directory
# for saved query files -- confirm against their users.
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [11]:
# Convert an angle from degrees to radians
def rad(d):
    """Convert an angle from degrees to radians.

    Args:
        d: Angle in degrees.

    Returns:
        The angle in radians, computed as ``d * pi / 180``.
    """
    scaled_by_pi = d * math.pi
    return scaled_by_pi / 180.0


# Calculate the great-circle (haversine) distance between two coordinates
def calculate_distance(from_coord, to_coord):
    """Return the great-circle (haversine) distance between two coordinates.

    Args:
        from_coord: ``(latitude, longitude)`` pair in degrees.
        to_coord: ``(latitude, longitude)`` pair in degrees.

    Returns:
        The central angle between the two points (in radians) multiplied by
        ``EARTH_REDIUS``, i.e. the distance in the units of that constant.
    """
    lat1, lng1 = from_coord
    lat2, lng2 = to_coord
    rad_lat1 = rad(lat1)
    rad_lat2 = rad(lat2)
    half_dlat = (rad_lat1 - rad_lat2) / 2
    half_dlng = (rad(lng1) - rad(lng2)) / 2

    # Haversine of the central angle.
    haversine = (math.sin(half_dlat) ** 2
                 + math.cos(rad_lat1) * math.cos(rad_lat2) * math.sin(half_dlng) ** 2)

    # Floating-point rounding can push the term marginally above 1 for
    # (near-)antipodal points, which would make math.asin raise ValueError.
    # A plain comparison is used (rather than min()) so NaN inputs still
    # propagate as NaN instead of being turned into a number.
    if haversine > 1.0:
        haversine = 1.0

    return 2 * math.asin(math.sqrt(haversine)) * EARTH_REDIUS
    

# Add the trip distance column to the dataframe
def add_distance_column(dataframe):
    """Add a ``trip_distance`` column to ``dataframe`` in place.

    For every row, the value is ``calculate_distance`` applied to the row's
    pickup ``(latitude, longitude)`` and dropoff ``(latitude, longitude)``.

    Args:
        dataframe: Frame with ``pickup_latitude``, ``pickup_longitude``,
            ``dropoff_latitude`` and ``dropoff_longitude`` columns.

    Returns:
        None. The new column is assigned on the frame that was passed in.
    """
    def _row_distance(row):
        # Build the two (latitude, longitude) pairs for this trip.
        pickup = (row['pickup_latitude'], row['pickup_longitude'])
        dropoff = (row['dropoff_latitude'], row['dropoff_longitude'])
        return calculate_distance(pickup, dropoff)

    dataframe['trip_distance'] = dataframe.apply(_row_distance, axis=1)

In [14]:
### Part 1: Data Preprocessing

In [15]:
# Get the download links of yellow taxi from 2009-01 to 2015-06
def find_taxi_csv_urls(timeout=30):
    """Scrape ``TAXI_URL`` for yellow-taxi trip-record download links.

    Fetches the page, collects every ``<a>`` tag whose title is
    "Yellow Taxi Trip Records", and keeps the ``href`` values that mention
    a year from 2009 to 2014 or a month from 2015-01 to 2015-06.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list: Matching ``href`` strings, in the order they appear on the page.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server responds with an HTTP error status.
    """
    # Bound the wait so a stalled server cannot hang the notebook forever.
    response = requests.get(TAXI_URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # returning an empty list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors that lack an href attribute, which would
    # otherwise raise KeyError below.
    url_tags = soup.find_all("a", title="Yellow Taxi Trip Records", href=True)

    pattern = re.compile("2009|2010|2011|2012|2013|2014|2015-0[1-6]")
    return [tag["href"] for tag in url_tags if pattern.search(tag["href"])]