In [1]:
import re
import csv
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

# Landing page that lists every published TLC trip-record file.
URL = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# 1) Download the page.
headers = {"User-Agent": "Mozilla/5.0 (LinkScraper/1.0)"}
response = requests.get(URL, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx: otherwise an error page would be parsed as if it
# were the real listing, yielding zero links and an empty CSV export.
response.raise_for_status()
html = response.text

# 2) Parse the HTML.
soup = BeautifulSoup(html, "lxml")

# 3) Collect ONLY the .parquet links, resolved to absolute URLs.
parquet_links = []
for a in soup.select("a[href]"):
    # Strip the raw attribute first: stray whitespace around the value would
    # otherwise survive urljoin() and defeat the ".parquet" suffix test.
    href = urljoin(URL, a["href"].strip())
    low = href.lower()  # case-insensitive filtering; the original URL is kept
    if "trip-data" in low and low.endswith(".parquet"):
        parquet_links.append(href)

# 4) Derive service / year / month from each file name (optional but handy).
pat = re.compile(r"(yellow|green|fhv|fhvhv)_tripdata_(\d{4})-(\d{2})\.parquet$", re.I)
rows = []
# De-duplicate the links and walk them in sorted order.
for link in sorted(set(parquet_links)):
    # Only the last path component (the file name) is matched.
    hit = pat.search(link.rsplit("/", 1)[-1])
    if hit is None:
        # Unrecognised file name: keep the URL, leave the metadata empty.
        record = {"service": None, "year": None, "month": None, "url": link}
    else:
        kind, yyyy, mm = hit.groups()
        record = {"service": kind.lower(), "year": int(yyyy), "month": int(mm), "url": link}
    rows.append(record)

# 5) CSV export.
def _csv_order(row):
    """Return the sort key (service, year, month, url) for one row.

    Missing values (None) are replaced by "" / 0 so that rows whose file
    name was not recognised still compare cleanly and sort first.
    """
    return (row["service"] or "", row["year"] or 0, row["month"] or 0, row["url"])


rows.sort(key=_csv_order)
columns = ["service", "year", "month", "url"]
with open("nyc_tlc_parquet_links.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.DictWriter(out, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)

print(f"{len(rows)} liens .parquet trouvés")


508 liens .parquet trouvés
