In [1]:
'''
pip install requests
pip install beautifulsoup4
pip install lxml
pip install tqdm
'''
import os
import re
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from lxml import html
from tqdm import tqdm

In [2]:
# URL of the web page whose links are scanned for downloadable files
url = 'https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page'
# Fetch the page. requests applies no timeout by default, so an explicit one
# keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)

In [3]:
# Date fragments to look for in the link URLs; a link is selected when its
# URL contains any one of these values
search_values = [
    '2023-06',
    '2022-06',
]
# Directory that receives the downloaded files
destination_directory = 'page_data/'

In [4]:
# Download every file linked from the fetched page whose URL contains one of
# the values in `search_values`, saving each into `destination_directory`.
# Reads `response` (the page request), `url`, `search_values` and
# `destination_directory` from the earlier cells.
if response.status_code == 200:
    # Parse the content of the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <a> elements with the href attribute
    links = soup.find_all('a', href=True)

    # Create the destination folder if it doesn't exist
    os.makedirs(destination_directory, exist_ok=True)

    # Absolute URLs already handled, so a link repeated on the page is
    # fetched only once
    seen_urls = set()

    # Iterate through the links and download the files that meet the criteria
    for link in links:
        # urljoin returns an absolute href unchanged and resolves a relative
        # one against the page URL; strip() drops stray whitespace in the href
        url_absolute = urljoin(url, link['href'].strip())

        # Plain substring test: an empty `search_values` selects nothing
        # (an empty regex alternation would have matched every link)
        if not any(value in url_absolute for value in search_values):
            continue
        if url_absolute in seen_urls:
            continue
        seen_urls.add(url_absolute)

        # Take the file name from the URL path only, so a query string or
        # fragment never ends up in the name on disk
        file_name = os.path.basename(urlsplit(url_absolute).path)
        if not file_name:
            # URL path ends with '/', so there is no file to save
            continue

        save_path = os.path.join(destination_directory, file_name)
        # Write to a temporary name first so an interrupted download never
        # leaves a truncated file under the final name
        partial_path = save_path + '.part'

        try:
            # A distinct name keeps the page `response` intact, so this cell
            # can be re-run; the context manager releases the connection
            with requests.get(url_absolute, stream=True, timeout=(10, 60)) as file_response:
                if file_response.status_code != 200:
                    print("The file could not be downloaded:", file_name)
                    continue

                # total=None gives an open-ended progress bar when the server
                # does not send Content-Length
                total_size = int(file_response.headers.get('content-length', 0)) or None
                with tqdm(total=total_size, unit='B', unit_scale=True, unit_divisor=1024) as pbar:
                    with open(partial_path, 'wb') as file:
                        for data in file_response.iter_content(chunk_size=1024):
                            file.write(data)
                            pbar.update(len(data))

            # Only a fully written file is moved into place
            os.replace(partial_path, save_path)
            print("Downloaded file:", file_name)
        except (requests.RequestException, OSError) as error:
            # One failed download should not abort the remaining ones
            print("The file could not be downloaded:", file_name, f"({error})")
            if os.path.exists(partial_path):
                os.remove(partial_path)

else:
    print("Page access error:", response.status_code)

100%|██████████| 52.5M/52.5M [00:12<00:00, 4.26MB/s]


Downloaded file: yellow_tripdata_2023-06.parquet


100%|██████████| 1.49M/1.49M [00:01<00:00, 927kB/s] 


Downloaded file: green_tripdata_2023-06.parquet


100%|██████████| 13.4M/13.4M [00:05<00:00, 2.76MB/s]


Downloaded file: fhv_tripdata_2023-06.parquet


100%|██████████| 476M/476M [01:53<00:00, 4.41MB/s] 


Downloaded file: fhvhv_tripdata_2023-06.parquet


100%|██████████| 52.8M/52.8M [00:12<00:00, 4.46MB/s]  


Downloaded file: yellow_tripdata_2022-06.parquet


100%|██████████| 1.46M/1.46M [00:01<00:00, 883kB/s] 


Downloaded file: green_tripdata_2022-06.parquet


100%|██████████| 11.7M/11.7M [00:03<00:00, 3.44MB/s]


Downloaded file: fhv_tripdata_2022-06.parquet


100%|██████████| 437M/437M [02:21<00:00, 3.23MB/s] 

Downloaded file: fhvhv_tripdata_2022-06.parquet



