## Project Setup

In [2]:
# All import statements needed for the project are collected in the first code cell of Part 1 below.


## Part 1: Data Preprocessing

In [None]:
import os
import re
import time
import urllib.request
import zipfile

import requests
from bs4 import BeautifulSoup

In [None]:
def url_date_ge(a: str, b: str) -> bool:
  """Return whether year-month string ``a`` is on or after year-month string ``b``.

  Both arguments are hyphen-separated 'YEAR-MONTH' strings such as '2009-01'.
  The fields are obtained by splitting on the hyphen instead of slicing fixed
  character positions, so a month written without zero padding ('2009-1') is
  read as month 1. Slicing the last two characters would yield '-1' and parse
  as a negative month, producing wrong comparisons.

  Args:
    a: Year-month string to test.
    b: Year-month string to compare against.

  Returns:
    True if (year, month) of ``a`` is greater than or equal to that of ``b``.

  Raises:
    ValueError: If either string lacks a hyphen-separated year and month, or
      if a field is not an integer.
  """
  # Only the first two fields are used, so a trailing '-DD' is ignored.
  year_a, month_a = a.split('-')[:2]
  year_b, month_b = b.split('-')[:2]
  return (int(year_a), int(month_a)) >= (int(year_b), int(month_b))

def url_date_le(a: str, b: str) -> bool:
  """Return whether year-month string ``a`` is on or before year-month string ``b``.

  Both arguments are hyphen-separated 'YEAR-MONTH' strings such as '2015-06'.
  The fields are obtained by splitting on the hyphen instead of slicing fixed
  character positions, so a month written without zero padding ('2015-6') is
  read as month 6. Slicing the last two characters would yield '-6' and parse
  as a negative month, producing wrong comparisons.

  Args:
    a: Year-month string to test.
    b: Year-month string to compare against.

  Returns:
    True if (year, month) of ``a`` is less than or equal to that of ``b``.

  Raises:
    ValueError: If either string lacks a hyphen-separated year and month, or
      if a field is not an integer.
  """
  # Only the first two fields are used, so a trailing '-DD' is ignored.
  year_a, month_a = a.split('-')[:2]
  year_b, month_b = b.split('-')[:2]
  return (int(year_a), int(month_a)) <= (int(year_b), int(month_b))

In [None]:
# URLs of the remote data sources used in this notebook.
# uber_url is a Google Drive single-file share link and weather_url is a Google
# Drive folder share link; neither is used in the cells shown here.
# taxi_url is the NYC TLC trip-record page that the download functions below
# scrape for the parquet and shapefile links.
uber_url = 'https://drive.google.com/file/d/1F7D82w1D5151GXCR6BTEk7mNQ_YnPNDk/view?usp=sharing'
weather_url = 'https://drive.google.com/drive/folders/1I_Cj3RFHRGcQjb5Gas06buqRbKodIwKC?usp=sharing'
taxi_url = 'https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page'

In [None]:
def download_yellow_taxi_parquet(taxi_url: str,
                                 from_date: str = '2009-01', to_date: str = '2015-06',
                                 taxi_par_save_dir: str = 'yellow_taxi',
                                 max_waiting_time: float = 10 * 60) -> None:
  """Download the yellow taxi parquet files linked from taxi_url.

  The page at ``taxi_url`` is scraped for elements whose ``title`` attribute is
  'Yellow Taxi Trip Records'. Every link whose URL contains a 'YEAR-MONTH'
  token between ``from_date`` and ``to_date`` (inclusive) is saved into
  ``taxi_par_save_dir``. Files that already exist there are skipped, so the
  function can be re-run to resume.

  Each file is first written to a temporary '<name>.part' path and renamed
  only after the transfer completes. An interrupted download therefore never
  leaves a truncated file that a later run would mistake for a finished one.

  Args:
    taxi_url: URL of the page listing the trip-record files.
    from_date: First month to download, as 'YYYY-MM'.
    to_date: Last month to download, as 'YYYY-MM'.
    taxi_par_save_dir: Directory the parquet files are saved into; created
      if missing.
    max_waiting_time: Time budget in seconds for the download/retry loop.
      The budget is checked between passes, so a pass that is already
      running is not cut short.

  Returns:
    None. Progress and the final outcome are reported with ``print``.

  Raises:
    requests.HTTPError: If the listing page responds with an error status.
  """
  # Fetch and parse the listing page. The timeout stops a stalled server from
  # hanging the cell forever, and an error page is reported instead of being
  # parsed as if it held no links.
  response = requests.get(taxi_url, timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')

  # Whitespace around an href would otherwise leak into both the request URL
  # and the saved file name.
  data_urls = [tag['href'].strip()
               for tag in soup.find_all(attrs={'title': 'Yellow Taxi Trip Records'})]

  # Keep only URLs whose embedded year-month lies in the requested range.
  # Links without such a token are skipped instead of raising on a None match.
  date_pattern = re.compile(r'(\d+-\d+)')
  inrange_urls = []
  for _url in data_urls:
    match = date_pattern.search(_url)
    if match is None:
      continue
    url_date = match.group(0)
    if url_date_ge(url_date, from_date) and url_date_le(url_date, to_date):
      inrange_urls.append(_url)

  # Create the output folder, including any missing parent directories.
  os.makedirs(taxi_par_save_dir, exist_ok=True)

  retry_pause = 1.0  # seconds to wait after a failed pass before retrying
  done = False
  st = time.monotonic()
  while not done and time.monotonic() - st < max_waiting_time:
    try:
      for _url in inrange_urls:
        save_file = os.path.join(taxi_par_save_dir, _url[_url.index('yellow'):])
        if not os.path.exists(save_file):
          print(f'downloading to {save_file}...')
          # Download to a temporary name, then rename once complete.
          partial_file = save_file + '.part'
          urllib.request.urlretrieve(_url, partial_file)
          os.replace(partial_file, save_file)
          print('finished.')
        else:
          print(f'{save_file} already downloaded.')

      done = True
      print('All downloads finished.')
    except Exception as exc:
      # Catching Exception (not a bare except) keeps KeyboardInterrupt usable
      # to stop the loop. The error is shown so repeated failures are visible.
      print(f'download failed ({exc!r}); retrying...')
      time.sleep(retry_pause)

  if not done:
    print('Not all files downloaded. Might be insufficient max waiting time. You may re-run this function.')

In [None]:
# Fetch the yellow taxi parquet files, using the function's default date range and output folder
download_yellow_taxi_parquet(taxi_url=taxi_url)

In [None]:
def download_unzip_shapefile(taxi_url: str, shapefile_save_dir: str = 'assets') -> None:
  """Download and unzip the Taxi Zone Shapefile linked from taxi_url.

  The page at ``taxi_url`` is searched for the text 'Taxi Zone Shapefile' and
  the ``href`` of the enclosing element is downloaded to
  ``<shapefile_save_dir>/taxi_zones.zip``. The archive is then extracted into
  the current working directory, unless 'taxi_zones.shp' is already there.

  Extraction uses the standard-library ``zipfile`` module instead of the
  IPython ``!unzip`` shell escape. The function is therefore plain Python,
  needs no external ``unzip`` program, and is unaffected by spaces in the
  archive path.

  Args:
    taxi_url: URL of the page that links to the shapefile archive.
    shapefile_save_dir: Directory the zip archive is saved into; created if
      missing.

  Returns:
    None. Completion is reported with ``print``.

  Raises:
    requests.HTTPError: If the page responds with an error status.
    ValueError: If the page has no 'Taxi Zone Shapefile' text.
  """
  # Fetch and parse the page. The timeout stops a stalled server from hanging
  # the cell, and an error status is raised instead of parsed.
  response = requests.get(taxi_url, timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')

  link_text = soup.find(string='Taxi Zone Shapefile')
  if link_text is None:
    raise ValueError(f"No 'Taxi Zone Shapefile' link found on {taxi_url}")
  # Whitespace around the href would otherwise end up in the request URL.
  shapefile_url = link_text.parent['href'].strip()

  os.makedirs(shapefile_save_dir, exist_ok=True)
  save_file = os.path.join(shapefile_save_dir, 'taxi_zones.zip')
  if not os.path.exists(save_file):
    # Download to a temporary name and rename once complete, so an interrupted
    # transfer is not mistaken for a finished archive on the next run.
    partial_file = save_file + '.part'
    urllib.request.urlretrieve(shapefile_url, partial_file)
    os.replace(partial_file, save_file)
  if not os.path.exists(os.path.join('.', 'taxi_zones.shp')):
    with zipfile.ZipFile(save_file) as archive:
      archive.extractall('.')
  print('Downloaded shapefile.')

In [None]:
# Fetch the Taxi Zone Shapefile archive linked from the TLC page and extract it
download_unzip_shapefile(taxi_url=taxi_url)