In [None]:
import os
import requests
import re

# URL prefix shared by every Title 15 chapter PDF; the chapter number, an optional
# subpart letter, and ".pdf" are appended to it to form the full URL.
BASE_URL = "https://www.govinfo.gov/content/pkg/USCODE-2023-title15/pdf/USCODE-2023-title15-chap"
# Folder under the /content/drive mount point where the PDFs are written.
DOWNLOAD_DIR = "/content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha"

# Inclusive range of chapter numbers to attempt (END_CHAPTER itself is tried).
START_CHAPTER = 1
END_CHAPTER = 122

def download_pdfs():
    """
    Fetch every chapter PDF in the configured chapter range.

    For each chapter whose main PDF downloads successfully, the lettered
    subparts of that chapter are probed and downloaded as well.
    """
    # The destination folder has to exist before anything is written into it.
    if not os.path.exists(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)

    for chapter_no in range(START_CHAPTER, END_CHAPTER + 1):
        # Skip the subpart probe when the base chapter could not be fetched.
        if not download_file(chapter_no):
            continue
        check_and_download_subparts(chapter_no)

def check_and_download_subparts(chapter, timeout=30):
    """
    Check for and download lettered subparts (e.g. 2A, 2B, ...) of a chapter.

    Suffixes are probed in order starting at 'A' with a HEAD request. Probing
    stops at the first suffix that does not answer with status 200, at the
    first request error, or after 'Z'.

    Args:
        chapter: Chapter number whose subparts are probed.
        timeout: Seconds passed to the HEAD request as its timeout, so a
            stalled connection cannot hang the whole run.
    """
    # Bounded to A-Z: incrementing the character code without a limit would
    # run past 'Z' into non-letter suffixes ('[', '\\', ...) and would never
    # terminate if the server answered 200 for every URL.
    for subpart_suffix in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        chapter_url = f"{BASE_URL}{chapter}{subpart_suffix}.pdf"
        pdf_name = f"USCODE-2023-title15-chap{chapter}{subpart_suffix}.pdf"

        try:
            response = requests.head(chapter_url, timeout=timeout)
        except Exception as e:
            print(f"Error checking subpart {chapter}{subpart_suffix}: {e}")
            break

        if response.status_code != 200:
            # First missing letter ends the scan for this chapter.
            print(f"No more subparts found for Chapter {chapter}.")
            break

        print(f"Found subpart: {pdf_name}. Downloading...")
        download_file(chapter, subpart=subpart_suffix)

def download_file(chapter, subpart=None, timeout=30):
    """
    Download a single chapter or subpart PDF into DOWNLOAD_DIR.

    The body is streamed to a temporary "<name>.part" file and only renamed to
    the final name once fully written, so a failed or interrupted download
    never leaves a truncated PDF under the final file name.

    Args:
        chapter: Chapter number used to build the URL and file name.
        subpart: Optional subpart letter appended after the chapter number.
        timeout: Seconds passed to the GET request as its timeout.

    Returns:
        True if the file was saved, False if the server did not answer 200 or
        any error occurred (the reason is printed).
    """
    suffix = subpart if subpart else ""
    chapter_url = f"{BASE_URL}{chapter}{suffix}.pdf"
    pdf_name = f"USCODE-2023-title15-chap{chapter}{suffix}.pdf"

    pdf_path = os.path.join(DOWNLOAD_DIR, pdf_name)
    tmp_path = f"{pdf_path}.part"

    try:
        print(f"Attempting to download: {pdf_name}...")
        # The context manager releases the streamed connection on every path.
        with requests.get(chapter_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Not found: {pdf_name} (Status Code: {response.status_code})")
                return False
            with open(tmp_path, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=65536):
                    pdf_file.write(chunk)
        # Publish the finished file under its final name.
        os.replace(tmp_path, pdf_path)
        print(f"Saved: {pdf_path}")
        return True  # File downloaded successfully
    except Exception as e:
        print(f"Error downloading {pdf_name}: {e}")
        return False
    finally:
        # Also runs on KeyboardInterrupt: drop any partially written file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Entry point: announce the run, download the configured Title 15 chapter range
# (with subparts), then report completion.
if __name__ == "__main__":
    print("Starting to download US Code Title 15 PDFs...")
    download_pdfs()
    print("All downloads completed.")


Starting to download US Code Title 15 PDFs...
Attempting to download: USCODE-2023-title15-chap1.pdf...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha/USCODE-2023-title15-chap1.pdf
No more subparts found for Chapter 1.
Attempting to download: USCODE-2023-title15-chap2.pdf...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha/USCODE-2023-title15-chap2.pdf
Found subpart: USCODE-2023-title15-chap2A.pdf. Downloading...
Attempting to download: USCODE-2023-title15-chap2A.pdf...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha/USCODE-2023-title15-chap2A.pdf
Found subpart: USCODE-2023-title15-chap2B.pdf. Downloading...
Attempting to download: USCODE-2023-title15-chap2B.pdf...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha/USCODE-2023-title15-chap2B.pdf
Found subpart: USCODE-2023-title15-chap2C.pdf. Downloading...
Attempting to download: USCODE-2023-title1

In [None]:
# Mount Google Drive at /content/drive. The download cells in this notebook write
# to folders under that mount point, so this has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import requests

# URL prefix up to (but not including) the title number; the title number, "/pdf/",
# and the PDF file name are appended to it to form the full URL.
BASE_URL = "https://www.govinfo.gov/content/pkg/USCODE-2023-title"
# Root folder under the /content/drive mount point; one "Title_<n>" subfolder is created per title.
DOWNLOAD_DIR = "/content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata"

# Titles are numbered 1..TOTAL_TITLES (inclusive).
TOTAL_TITLES = 54
# (first, last) chapter numbers to try for every title, both inclusive.
CHAPTER_RANGE = 1, 150  # Assume titles may have up to 150 chapters

def download_pdfs():
    """
    Walk every title and chapter number and save each chapter PDF that exists,
    together with its lettered subparts, into a per-title folder.
    """
    if not os.path.exists(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)

    first_chapter = CHAPTER_RANGE[0]
    last_chapter = CHAPTER_RANGE[1]

    for title in map(str, range(1, TOTAL_TITLES + 1)):
        print(f"Processing Title {title}...")

        # Each title gets its own "Title_<n>" folder under the download root.
        title_dir = os.path.join(DOWNLOAD_DIR, f"Title_{title}")
        if not os.path.exists(title_dir):
            os.makedirs(title_dir)

        for chapter in range(first_chapter, last_chapter + 1):
            # Subparts are only probed for chapters whose base PDF was found.
            if not verify_and_download_file(title, chapter, title_dir):
                continue
            check_and_download_subparts(title, chapter, title_dir)

def check_and_download_subparts(title, chapter, title_dir):
    """
    Check for and download lettered subparts (e.g. 2A, 2B, ...) of a chapter.

    Suffixes are probed in order starting at 'A'; probing stops at the first
    suffix for which url_exists() is false, or after 'Z'.

    Args:
        title: Title number as a string, used in the URL and file name.
        chapter: Chapter number whose subparts are probed.
        title_dir: Folder the subpart PDFs are written into.
    """
    # Bounded to A-Z: incrementing the character code without a limit would
    # run past 'Z' into non-letter suffixes ('[', '\\', ...) and would never
    # terminate if the server answered 200 for every URL.
    for subpart_suffix in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        pdf_name = f"USCODE-2023-title{title}-chap{chapter}{subpart_suffix}.pdf"
        chapter_url = f"{BASE_URL}{title}/pdf/{pdf_name}"

        if not url_exists(chapter_url):
            # First missing letter ends the scan for this chapter.
            print(f"No more subparts for Title {title} Chapter {chapter}.")
            break

        print(f"Found subpart: {pdf_name}. Downloading...")
        download_file(chapter_url, os.path.join(title_dir, pdf_name))

def verify_and_download_file(title, chapter, title_dir):
    """
    Download a chapter PDF when the existence check for its URL succeeds.

    Returns True when the URL passed the check (a download was then
    attempted), and False when it did not.
    """
    pdf_name = f"USCODE-2023-title{title}-chap{chapter}.pdf"
    chapter_url = f"{BASE_URL}{title}/pdf/{pdf_name}"

    if url_exists(chapter_url):
        destination = os.path.join(title_dir, pdf_name)
        print(f"Found: {pdf_name}. Downloading...")
        download_file(chapter_url, destination)
        return True

    return False

def url_exists(url, timeout=30):
    """
    Check if a URL exists using a HEAD request.

    Args:
        url: URL to probe.
        timeout: Seconds passed to the HEAD request as its timeout. Without
            one, a stalled connection could block the crawl indefinitely.

    Returns:
        True only when the response status code is 200. False for any other
        status, or when the request raises (the error is printed).
    """
    try:
        response = requests.head(url, timeout=timeout)
    except Exception as e:
        print(f"Error checking URL {url}: {e}")
        return False
    return response.status_code == 200

def download_file(url, path, timeout=30):
    """
    Download a file from a given URL to the given path.

    The body is streamed to a temporary "<path>.part" file and only renamed to
    `path` once fully written, so a failed or interrupted download never
    leaves a truncated file under the final name. Errors are printed, not
    raised.

    Args:
        url: URL to fetch.
        path: Destination file path.
        timeout: Seconds passed to the GET request as its timeout.
    """
    tmp_path = f"{path}.part"
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=65536):
                    file.write(chunk)
        # Publish the finished file under its final name.
        os.replace(tmp_path, path)
        print(f"Saved: {path}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")
    finally:
        # Also runs on KeyboardInterrupt: drop any partially written file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Entry point: announce the run, download every title's chapters (with subparts),
# then report completion.
if __name__ == "__main__":
    print("Starting to download all US Code Title PDFs...")
    download_pdfs()
    print("All downloads completed.")


Starting to download all US Code Title PDFs...
Processing Title 1...
Found: USCODE-2023-title1-chap1.pdf. Downloading...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata/Title_1/USCODE-2023-title1-chap1.pdf
No more subparts for Title 1 Chapter 1.
Found: USCODE-2023-title1-chap2.pdf. Downloading...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata/Title_1/USCODE-2023-title1-chap2.pdf
No more subparts for Title 1 Chapter 2.
Found: USCODE-2023-title1-chap3.pdf. Downloading...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata/Title_1/USCODE-2023-title1-chap3.pdf
No more subparts for Title 1 Chapter 3.
Processing Title 2...
Found: USCODE-2023-title2-chap1.pdf. Downloading...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata/Title_2/USCODE-2023-title2-chap1.pdf
No more subparts for Title 2 Chapter 1.
Found: USCODE-2023-title2

In [None]:
import os
import requests

# URL prefix up to (but not including) the title number; the title number, "/pdf/",
# and the PDF file name are appended to it to form the full URL.
BASE_URL = "https://www.govinfo.gov/content/pkg/USCODE-2023-title"
# Root folder under the /content/drive mount point; one "Title_<n>" subfolder is created per title.
DOWNLOAD_DIR = "/content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata"

# Titles are numbered 1..TOTAL_TITLES (inclusive).
TOTAL_TITLES = 54
# (first, last) chapter numbers to try for every title, both inclusive.
CHAPTER_RANGE = 1, 150  # Assume titles may have up to 150 chapters

def download_pdfs():
    """
    Walk every title and chapter number and save each chapter PDF that exists,
    together with its lettered subparts, into a per-title folder.
    """
    if not os.path.exists(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)

    first_chapter = CHAPTER_RANGE[0]
    last_chapter = CHAPTER_RANGE[1]

    for title in map(str, range(1, TOTAL_TITLES + 1)):
        print(f"Processing Title {title}...")

        # Each title gets its own "Title_<n>" folder under the download root.
        title_dir = os.path.join(DOWNLOAD_DIR, f"Title_{title}")
        if not os.path.exists(title_dir):
            os.makedirs(title_dir)

        for chapter in range(first_chapter, last_chapter + 1):
            # Subparts are only probed for chapters whose base PDF was found.
            if not verify_and_download_file(title, chapter, title_dir):
                continue
            check_and_download_subparts(title, chapter, title_dir)

def check_and_download_subparts(title, chapter, title_dir):
    """
    Check for and download lettered subparts (e.g. 2A, 2B, ...) of a chapter.

    Suffixes are probed in order starting at 'A'; probing stops at the first
    suffix for which url_exists() is false, or after 'Z'.

    Args:
        title: Title number as a string, used in the URL and file name.
        chapter: Chapter number whose subparts are probed.
        title_dir: Folder the subpart PDFs are written into.
    """
    # Bounded to A-Z: incrementing the character code without a limit would
    # run past 'Z' into non-letter suffixes ('[', '\\', ...) and would never
    # terminate if the server answered 200 for every URL.
    for subpart_suffix in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        pdf_name = f"USCODE-2023-title{title}-chap{chapter}{subpart_suffix}.pdf"
        chapter_url = f"{BASE_URL}{title}/pdf/{pdf_name}"

        if not url_exists(chapter_url):
            # First missing letter ends the scan for this chapter.
            print(f"No more subparts for Title {title} Chapter {chapter}.")
            break

        print(f"Found subpart: {pdf_name}. Downloading...")
        download_file(chapter_url, os.path.join(title_dir, pdf_name))

def verify_and_download_file(title, chapter, title_dir):
    """
    Download a chapter PDF when the existence check for its URL succeeds.

    Returns True when the URL passed the check (a download was then
    attempted), and False when it did not.
    """
    pdf_name = f"USCODE-2023-title{title}-chap{chapter}.pdf"
    chapter_url = f"{BASE_URL}{title}/pdf/{pdf_name}"

    if url_exists(chapter_url):
        destination = os.path.join(title_dir, pdf_name)
        print(f"Found: {pdf_name}. Downloading...")
        download_file(chapter_url, destination)
        return True

    return False

def url_exists(url, timeout=30):
    """
    Check if a URL exists using a HEAD request.

    Args:
        url: URL to probe.
        timeout: Seconds passed to the HEAD request as its timeout. Without
            one, a stalled connection could block the crawl indefinitely.

    Returns:
        True only when the response status code is 200. False for any other
        status, or when the request raises (the error is printed).
    """
    try:
        response = requests.head(url, timeout=timeout)
    except Exception as e:
        print(f"Error checking URL {url}: {e}")
        return False
    return response.status_code == 200

def download_file(url, path, timeout=30):
    """
    Download a file from a given URL to the given path.

    The body is streamed to a temporary "<path>.part" file and only renamed to
    `path` once fully written, so a failed or interrupted download never
    leaves a truncated file under the final name. Errors are printed, not
    raised.

    Args:
        url: URL to fetch.
        path: Destination file path.
        timeout: Seconds passed to the GET request as its timeout.
    """
    tmp_path = f"{path}.part"
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=65536):
                    file.write(chunk)
        # Publish the finished file under its final name.
        os.replace(tmp_path, path)
        print(f"Saved: {path}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")
    finally:
        # Also runs on KeyboardInterrupt: drop any partially written file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Entry point: announce the run, download every title's chapters (with subparts),
# then report completion.
if __name__ == "__main__":
    print("Starting to download all US Code Title PDFs...")
    download_pdfs()
    print("All downloads completed.")


In [3]:
import os
import requests

# URL prefix with a {year} placeholder, ending just before the title number; it is
# filled in with str.format(year=...) and then extended with the title and file name.
BASE_URL_TEMPLATE = "https://www.govinfo.gov/content/pkg/USCODE-{year}-title"

# Years to download: 2015 through 2022 (the range end, 2023, is exclusive).
YEARS = range(2015, 2023)

# Titles are numbered 1..TOTAL_TITLES (inclusive).
TOTAL_TITLES = 54
# (first, last) chapter numbers to try for every title, both inclusive.
CHAPTER_RANGE = 1, 150  # Assume titles may have up to 150 chapters

# Root folder under the /content/drive mount point; downloads go into "<year>/Title_<n>" beneath it.
BASE_DOWNLOAD_DIR = "/content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata"

def download_pdfs_for_year(year):
    """
    Fetch all chapter PDFs (and their lettered subparts) for one year,
    storing them under "<year>/Title_<n>" inside the base download folder.
    """
    print(f"Starting downloads for the year {year}...")

    year_dir = os.path.join(BASE_DOWNLOAD_DIR, str(year))
    if not os.path.exists(year_dir):
        os.makedirs(year_dir)

    first_chapter = CHAPTER_RANGE[0]
    last_chapter = CHAPTER_RANGE[1]

    for title in map(str, range(1, TOTAL_TITLES + 1)):
        print(f"Processing Title {title} for the year {year}...")

        # One folder per title inside the year folder.
        title_dir = os.path.join(year_dir, f"Title_{title}")
        if not os.path.exists(title_dir):
            os.makedirs(title_dir)

        for chapter in range(first_chapter, last_chapter + 1):
            # Subparts are only probed for chapters whose base PDF was found.
            if not verify_and_download_file(year, title, chapter, title_dir):
                continue
            check_and_download_subparts(year, title, chapter, title_dir)

def check_and_download_subparts(year, title, chapter, title_dir):
    """
    Check for and download lettered subparts (e.g. 2A, 2B, ...) of a chapter
    for a given year and title.

    Suffixes are probed in order starting at 'A'; probing stops at the first
    suffix for which url_exists() is false, or after 'Z'.

    Args:
        year: Year used in the URL and file name.
        title: Title number as a string, used in the URL and file name.
        chapter: Chapter number whose subparts are probed.
        title_dir: Folder the subpart PDFs are written into.
    """
    # The year-specific prefix is the same for every suffix, so build it once.
    url_prefix = f"{BASE_URL_TEMPLATE.format(year=year)}{title}/pdf/"

    # Bounded to A-Z: incrementing the character code without a limit would
    # run past 'Z' into non-letter suffixes ('[', '\\', ...) and would never
    # terminate if the server answered 200 for every URL.
    for subpart_suffix in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        pdf_name = f"USCODE-{year}-title{title}-chap{chapter}{subpart_suffix}.pdf"
        chapter_url = f"{url_prefix}{pdf_name}"

        if not url_exists(chapter_url):
            # First missing letter ends the scan for this chapter.
            print(f"No more subparts for Title {title} Chapter {chapter} in year {year}.")
            break

        print(f"Found subpart: {pdf_name}. Downloading...")
        download_file(chapter_url, os.path.join(title_dir, pdf_name))

def verify_and_download_file(year, title, chapter, title_dir):
    """
    Download a chapter PDF for the given year when the existence check for
    its URL succeeds.

    Returns True when the URL passed the check (a download was then
    attempted), and False when it did not.
    """
    pdf_name = f"USCODE-{year}-title{title}-chap{chapter}.pdf"
    chapter_url = f"{BASE_URL_TEMPLATE.format(year=year)}{title}/pdf/{pdf_name}"

    if url_exists(chapter_url):
        destination = os.path.join(title_dir, pdf_name)
        print(f"Found: {pdf_name}. Downloading...")
        download_file(chapter_url, destination)
        return True

    return False

def url_exists(url, timeout=30):
    """
    Check if a URL exists using a HEAD request.

    Args:
        url: URL to probe.
        timeout: Seconds passed to the HEAD request as its timeout. Without
            one, a stalled connection could block the crawl indefinitely.

    Returns:
        True only when the response status code is 200. False for any other
        status, or when the request raises (the error is printed).
    """
    try:
        response = requests.head(url, timeout=timeout)
    except Exception as e:
        print(f"Error checking URL {url}: {e}")
        return False
    return response.status_code == 200

def download_file(url, path, timeout=30):
    """
    Download a file from a given URL to the given path.

    The body is streamed to a temporary "<path>.part" file and only renamed to
    `path` once fully written, so a failed or interrupted download never
    leaves a truncated file under the final name. Errors are printed, not
    raised.

    Args:
        url: URL to fetch.
        path: Destination file path.
        timeout: Seconds passed to the GET request as its timeout.
    """
    tmp_path = f"{path}.part"
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=65536):
                    file.write(chunk)
        # Publish the finished file under its final name.
        os.replace(tmp_path, path)
        print(f"Saved: {path}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")
    finally:
        # Also runs on KeyboardInterrupt: drop any partially written file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Entry point: run the per-year download once for each year in YEARS, in order,
# then report completion.
if __name__ == "__main__":
    print("Starting to download US Code Title PDFs for multiple years...")
    for year in YEARS:
        download_pdfs_for_year(year)
    print("All downloads completed.")


Starting to download US Code Title PDFs for multiple years...
Starting downloads for the year 2015...
Processing Title 1 for the year 2015...
Found: USCODE-2015-title1-chap1.pdf. Downloading...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata/2015/Title_1/USCODE-2015-title1-chap1.pdf
No more subparts for Title 1 Chapter 1 in year 2015.
Found: USCODE-2015-title1-chap2.pdf. Downloading...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata/2015/Title_1/USCODE-2015-title1-chap2.pdf
No more subparts for Title 1 Chapter 2 in year 2015.
Found: USCODE-2015-title1-chap3.pdf. Downloading...
Saved: /content/drive/Shareddrives/DATA298B_Final/Data/Raw/Final Data Repo/Namratha_alldata/2015/Title_1/USCODE-2015-title1-chap3.pdf
No more subparts for Title 1 Chapter 3 in year 2015.
Processing Title 2 for the year 2015...
Found: USCODE-2015-title2-chap1.pdf. Downloading...
Saved: /content/drive/Shareddrives/DATA298B_Final

KeyboardInterrupt: 

In [1]:
# Mount Google Drive at /content/drive. The download cells in this notebook write
# to folders under that mount point, so this has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
