## CricSheetWebScraper

In [2]:
!pip install selenium



In [9]:
import os
import requests
from zipfile import ZipFile
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Absolute path of the current working directory at the time this cell runs;
# main() stores its downloads in the "data" folder underneath it.
NOTEBOOK_DIR = Path().resolve()

# Launch Selenium browser
def launch_browser():
    """Start a headless Chrome WebDriver session and return it.

    The caller owns the returned driver and is responsible for calling
    ``quit()`` on it once it is no longer needed.
    """
    chrome_options = webdriver.ChromeOptions()
    # Headless mode: Chrome runs in the background without a visible window.
    chrome_options.add_argument("--headless")
    return webdriver.Chrome(options=chrome_options)

# Scrape zip links using Selenium for json
def get_match_links_selenium(url="https://cricsheet.org/matches/"):
    """Collect one JSON zip download link per tracked match format.

    Opens ``url`` in a headless browser, inspects every ``<a>`` element and
    keeps the hrefs that end in ``json.zip``.  A link is assigned to a format
    when the format key ("test", "odi" or "t20") occurs anywhere in the
    lower-cased href; if several links match the same key, the last one on
    the page wins.

    Args:
        url: Page to scrape for download links.

    Returns:
        A dict with the keys "test", "odi" and "t20".  Each value is the
        matching href, or ``None`` when no link matched that key.
    """
    match_links = {"test": None, "odi": None, "t20": None}

    driver = launch_browser()
    try:
        driver.get(url)
        for link in driver.find_elements(By.TAG_NAME, "a"):
            href = link.get_attribute("href")
            if not href or not href.endswith("json.zip"):
                continue
            # Lower-case once per link instead of once per format key.
            lowered = href.lower()
            for match_type in match_links:
                if match_type in lowered:
                    # Later anchors overwrite earlier ones (last match wins).
                    match_links[match_type] = href
    finally:
        # Always shut the browser down, even when navigation or scraping
        # raised, so no headless Chrome process is left running.
        driver.quit()

    return match_links

# Download zip file
def download_zip(url, save_dir):
    """Download the file at ``url`` into ``save_dir`` and return its path.

    The local file name is the last ``/``-separated segment of ``url``.
    ``save_dir`` is created (including parents) when it does not exist.

    Args:
        url: Address of the zip archive to download.
        save_dir: Directory (``Path`` or str) that receives the file.

    Returns:
        ``Path`` of the downloaded file.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection problems or a timeout.
    """
    save_dir = Path(save_dir)
    # Without this, open() below fails on a fresh checkout with no data dir.
    save_dir.mkdir(parents=True, exist_ok=True)

    filename = url.split("/")[-1]
    zip_path = save_dir / filename

    print(f"Downloading {filename}...")
    # stream=True avoids holding the whole archive in memory; the timeout
    # keeps a stalled connection from hanging the notebook forever.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail before creating the file so an HTML error page is never
        # saved under a .zip name.
        response.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    return zip_path

# Extract zip file
def extract_zip(zip_path, extract_to):
    """Extract ``zip_path`` into a sub-folder of ``extract_to``.

    The sub-folder is named after the archive without its extension
    (``tests_json.zip`` -> ``<extract_to>/tests_json``).  Missing parent
    directories are created, and an already existing folder is reused.

    Args:
        zip_path: Location of the archive (``Path`` or str).
        extract_to: Directory under which the sub-folder is created
            (``Path`` or str).

    Raises:
        zipfile.BadZipFile: ``zip_path`` is not a valid zip archive.
        FileNotFoundError: ``zip_path`` does not exist.
    """
    zip_path = Path(zip_path)
    folder_name = Path(extract_to) / zip_path.stem

    print(f"Extracting {zip_path.name}...")
    with ZipFile(zip_path, "r") as zip_ref:
        # Created only after the archive opened successfully, so an invalid
        # zip does not leave an empty folder behind.  parents=True lets the
        # call succeed when extract_to itself does not exist yet.
        folder_name.mkdir(parents=True, exist_ok=True)
        zip_ref.extractall(folder_name)

# delete the zip files
def delete_zip_files(folder_path):
    """Remove every ``*.zip`` file located directly inside ``folder_path``.

    Prints one line per file (success or failure).  A failure to delete one
    file is reported and does not stop the remaining deletions.  When the
    folder holds no zip files, a single notice is printed instead.
    """
    archives = [*folder_path.glob("*.zip")]
    if archives:
        for archive in archives:
            try:
                archive.unlink()
                print(f"Deleted: {archive.name}")
            except Exception as err:
                print(f"Error deleting {archive.name}: {err}")
        return

    print("No zip files found to delete.")

# Main orchestrator
def main():
    """Scrape, download and extract the match archives, then clean up.

    Steps:
      1. Look up the download link for each match format.
      2. Download each archive that was found into ``NOTEBOOK_DIR / "data"``
         and extract it there.
      3. Delete the zip files from that data folder.

    Note that the cleanup step removes every ``*.zip`` file in the data
    folder, not only the archives downloaded during this run.
    """
    data_dir = NOTEBOOK_DIR / "data"
    # The data folder is not guaranteed to exist (e.g. on a first run);
    # create it up front so downloading and extracting cannot fail on it.
    data_dir.mkdir(parents=True, exist_ok=True)

    match_links = get_match_links_selenium()
    for match_type, url in match_links.items():
        if not url:
            print(f"No zip found for {match_type}")
            continue
        zip_path = download_zip(url, data_dir)
        extract_zip(zip_path, data_dir)

    delete_zip_files(data_dir)

# Run the workflow: scrape the links, download and extract the archives,
# then delete the zip files. Executes as soon as this cell is run.
main()

Downloading tests_json.zip...
Extracting tests_json.zip...
Downloading odis_json.zip...
Extracting odis_json.zip...
Downloading it20s_json.zip...
Extracting it20s_json.zip...
Deleted: it20s_json.zip
Deleted: odis_json.zip
Deleted: tests_json.zip
