In [7]:
import os
import requests
import time
from datetime import datetime

def get_file_size_in_mb(file_path):
    """
    Report how large a file is, expressed in megabytes.

    Args:
        file_path (str): Location of the file to measure.

    Returns:
        float: The file size in megabytes, where 1 MB is 1024 * 1024 bytes.
    """
    bytes_per_megabyte = 1024 * 1024
    # os.stat reports the size in bytes; scale it down to megabytes.
    size_in_bytes = os.stat(file_path).st_size
    return size_in_bytes / bytes_per_megabyte

def save_page_content(ocr_url, file_path, timeout=30):
    """
    Download page content from an OCR URL and append it to a specified file.

    The text is downloaded before the file is opened, and nothing is written
    unless the server answers with a success status, so HTTP error pages are
    never appended to the output file. Request failures are reported on
    stdout and otherwise swallowed.

    Args:
        ocr_url (str): The URL from which to fetch the OCR text.
        file_path (str): The file path where the OCR text will be appended.
        timeout (float): Seconds to wait for the server before giving up.
    """
    print(f"Fetching OCR text from {ocr_url}...")
    try:
        response = requests.get(ocr_url, timeout=timeout)
        # Turn 4xx/5xx answers into RequestException so they are reported
        # below instead of being saved as if they were OCR text.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch OCR text from {ocr_url}: {e}")
        return

    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(response.text + '\n')
    print("OCR text successfully saved.")

def get_ocr_url(page_url, timeout=30):
    """
    Retrieve the OCR URL from a page URL.

    Args:
        page_url (str): The URL of the page from which to fetch the OCR URL.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str: The value of the 'text' key in the page's JSON data, or None
        when that key is absent.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(page_url, timeout=timeout)
    # Surface HTTP errors as such rather than as an opaque JSON decode error.
    response.raise_for_status()
    return response.json().get('text')

def get_list_of_page_url(issue_url, timeout=30):
    """
    Fetch all page URLs from an issue URL and extract the issue's publication year.

    Args:
        issue_url (str): The URL of the issue from which to extract page URLs and publication year.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        tuple: A tuple containing the issue's 'pages' entry and the year
        parsed from its 'date_issued' field (expected as YYYY-MM-DD).

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
        ValueError: If the body is not valid JSON or the date is malformed.
        KeyError: If 'pages' or 'date_issued' is missing from the issue data.
    """
    response = requests.get(issue_url, timeout=timeout)
    # Surface HTTP errors as such rather than as an opaque JSON decode error.
    response.raise_for_status()
    issue_data = response.json()

    list_of_page_url = issue_data['pages']
    issue_date_year = datetime.strptime(issue_data['date_issued'], '%Y-%m-%d').year
    return list_of_page_url, issue_date_year

def get_list_of_issue_url(batch_url, timeout=30):
    """
    Retrieve all issue URLs within a batch.

    Args:
        batch_url (str): The URL of the batch from which to extract issue URLs.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list: The batch's 'issues' entry.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If 'issues' is missing from the batch data.
    """
    response = requests.get(batch_url, timeout=timeout)
    # Surface HTTP errors as such rather than as an opaque JSON decode error.
    response.raise_for_status()
    return response.json()['issues']

def get_most_recent_paginated_directory(main_url, timeout=30):
    """
    Retrieve the most recent directory by navigating through paginated responses from a main URL.

    Starting at main_url, follows each response's 'next' link until a
    response has no such link, and returns that final response.

    Args:
        main_url (str): The main URL used to start fetching the most recent directory.
        timeout (float): Seconds to wait for each request before giving up.

    Returns:
        dict: The data of the most recent directory.

    Raises:
        requests.exceptions.RequestException: If any request fails, times
            out, or the server answers with an error status.
        ValueError: If a response body is not valid JSON.
    """
    print("Retrieving the most recent paginated directory, please wait... This would take less than 3 mins.")
    response = requests.get(main_url, timeout=timeout)
    response.raise_for_status()
    main_data = response.json()
    dir_count = 1

    # .get() also ends the walk when 'next' is present but null/empty,
    # instead of issuing a request for an invalid URL.
    next_url = main_data.get('next')
    while next_url:
        response = requests.get(next_url, timeout=timeout)
        response.raise_for_status()
        main_data = response.json()
        dir_count += 1
        next_url = main_data.get('next')

    print(f"Retrieved the most recent paginated directory, which is the {dir_count}th directory.")
    return main_data

def safe_json_parse(response):
    """
    Safely parse the JSON response, returning None if parsing fails.

    The Content-Type header is matched on its media type only, ignoring case
    and any parameters, so a header such as 'application/json; charset=utf-8'
    is accepted.

    Args:
        response: A response object exposing a 'headers' mapping and a
            'json()' method (for example a requests.Response).

    Returns:
        The decoded JSON payload, or None when the response is not declared
        as JSON or its body cannot be decoded.
    """
    content_type = response.headers.get('Content-Type') or ''
    # Drop parameters such as '; charset=utf-8' before comparing.
    media_type = content_type.split(';', 1)[0].strip().lower()
    if media_type != 'application/json':
        return None
    try:
        return response.json()
    except ValueError:
        # Body was declared as JSON but is not decodable.
        return None

def _iter_pages(batches):
    """Yield every page entry reachable from batches, skipping batches and issues that fail to load."""
    for batch in batches:
        batch_url = None  # bound up front so the error message never hits an unbound name
        try:
            batch_url = batch['url']
            list_of_issue_url = get_list_of_issue_url(batch_url)
        except Exception as e:
            print(f"Error fetching issues from batch {batch_url}: {e}")
            continue

        for issue in list_of_issue_url:
            issue_url = None
            try:
                issue_url = issue['url']
                # The publication year is returned as well but is not needed here.
                list_of_page_url, _ = get_list_of_page_url(issue_url)
            except Exception as e:
                print(f"Error fetching pages from issue {issue_url}: {e}")
                continue

            yield from list_of_page_url


def ibms_interview_task(main_url, max_file_size, directory, file_name):
    """
    Executes a task to fetch OCR text from a series of web pages, save it to a file,
    and halt when a maximum file size is reached.

    The output file is created (or truncated) first. Batches listed at
    main_url are processed, then the listing's 'previous' links are followed
    page by page. The task stops when the file reaches max_file_size, when a
    listing has no 'previous' link, or when a 'previous' page cannot be
    fetched.

    Args:
        main_url (str): URL of the JSON batch listing to start from.
        max_file_size (float): Size limit for the output file, in megabytes.
        directory (str): Directory for the output file; created if missing.
        file_name (str): Name of the output file inside directory.

    Returns:
        tuple: (total_time, page_count), where total_time is the elapsed
        seconds of the harvesting loop and page_count is the number of pages
        handed to save_page_content. If the directory, the file, or the
        initial listing cannot be prepared, (0.0, 0) is returned so callers
        can always unpack the result.
    """
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the task
    nothing_done = (0.0, 0)

    try:
        os.makedirs(directory, exist_ok=True)
    except Exception as e:
        print(f"Error creating directory {directory}: {e}")
        return nothing_done

    file_path = os.path.join(directory, file_name)

    try:
        # Start from an empty file, using the same encoding the pages are appended with.
        with open(file_path, 'w', encoding='utf-8'):
            pass
        print(f"File '{file_name}' has been created.")
    except Exception as e:
        print(f"Error creating file {file_path}: {e}")
        return nothing_done

    try:
        response = requests.get(main_url, timeout=request_timeout)
        response.raise_for_status()
        main_data = response.json()
    except Exception as e:
        print(f"Error fetching the main URL {main_url}: {e}")
        return nothing_done

    page_count = 0
    start_time = time.time()

    try:
        while True:
            limit_reached = False

            for page in _iter_pages(main_data['batches']):
                try:
                    file_size = get_file_size_in_mb(file_path)
                except Exception as e:
                    print(f"Error checking file size for {file_path}: {e}")
                    continue

                # Check the limit before fetching so no request is wasted once the file is full.
                if file_size >= max_file_size:
                    print(f"Reached file size limit of {max_file_size} MB.")
                    limit_reached = True
                    break

                page_url = None
                try:
                    page_url = page['url']
                    ocr_url = get_ocr_url(page_url)
                    time.sleep(1)  # Be polite to the server
                except Exception as e:
                    print(f"Error fetching OCR URL {page_url}: {e}")
                    continue

                try:
                    save_page_content(ocr_url, file_path)
                    page_count += 1
                except Exception as e:
                    print(f"Error saving page content from {ocr_url}: {e}")
                    continue

            if limit_reached:
                print("Task completed due to file size limit.")
                break

            previous_url = main_data.get('previous')
            if not previous_url:
                # Without a link there is nothing further to fetch; retrying would loop forever.
                print("No previous page of batches to fetch; stopping.")
                break

            try:
                response = requests.get(previous_url, timeout=request_timeout)
                response.raise_for_status()
                main_data = response.json()
            except Exception as e:
                # Stop rather than retry the same failing URL in a tight loop.
                print(f"Error fetching previous batches: {e}")
                break
    except Exception as e:
        print(f"Unexpected error: {e}")

    end_time = time.time()
    total_time = end_time - start_time
    print(f"Task completed in {total_time:.2f} seconds. Total pages processed: {page_count}")

    return total_time, page_count

# Driver configuration for the harvesting task.
# JSON batch listing that the task fetches first and reads 'batches' from.
main_url = "https://chroniclingamerica.loc.gov/batches.json"
# Output size limit in megabytes (compared against get_file_size_in_mb).
max_file_size = 50
# Output directory; created by the task if it does not exist.
directory = "D:/2_nlp"
# Name of the output file inside the directory above.
file_name = "extracted_contents.txt"
# Run the task and capture the execution time and page count
total_time, page_count = ibms_interview_task(main_url, max_file_size, directory, file_name)

File 'extracted_contents.txt' has been created.
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1918-02-15/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1918-02-22/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1918-03-22/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1918-04-26/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1918-05-03/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1918-09-13/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lcc

Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1919-10-24/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1919-11-07/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1919-11-14/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1919-11-21/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1919-11-28/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1919-12-05/ed-1.json: Expecting value: line 1 column 1 (char 0)
Error fetching pages from issue https://chroniclingamerica.loc.gov/lccn/sn83045201/1919-12-19/ed-1.json: Expecting val

KeyboardInterrupt: 