In [1]:
import csv
import os
import threading
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor

import IPython
import requests
from bs4 import BeautifulSoup

In [2]:
# Article sitemap files to harvest: the unsuffixed file first, followed by the
# numbered continuations -1 through -7.
sitemap_base = "https://www.encyclopedia.com/sites/default/files/sitemaps/sitemap-articles"
sitemap_suffixes = [""] + [f"-{index}" for index in range(1, 8)]
sitemap_urls = [f"{sitemap_base}{suffix}.xml" for suffix in sitemap_suffixes]

In [3]:
# Function to extract URLs from XML response
def extract_urls_from_xml(xml_content):
    """Return the URLs held in the ``<loc>`` elements of a sitemap document.

    Only grandchildren of the root element are inspected (root -> entry ->
    field), which is where a sitemap keeps each entry's ``<loc>``.

    Args:
        xml_content: The sitemap XML, as ``str`` or ``bytes``.

    Returns:
        list[str]: The text of every ``<loc>`` field in document order, with
        surrounding whitespace removed. Empty ``<loc>`` elements are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not
            well-formed XML.
    """
    urls = []
    root = ET.fromstring(xml_content)
    for entry in root:
        for field in entry:
            # ElementTree spells namespaced tags as "{namespace}name". Compare
            # the local name exactly so tags that merely end in "loc" are not
            # mistaken for a location.
            local_name = field.tag.rsplit("}", 1)[-1]
            if local_name != "loc":
                continue

            # `text` is None for an empty element, and pretty-printed sitemaps
            # pad the URL with newlines/indentation.
            location = (field.text or "").strip()
            if location:
                urls.append(location)
    return urls

In [4]:
# Destination CSV for the URLs harvested from the sitemaps.
# NOTE(review): this is a user-specific absolute path; adjust it before
# running the notebook on another machine.
export_path = (
    "C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/"
    "Encyclopedia_Urls.csv"
)

In [None]:
# Create a CSV file holding one article URL per row, gathered from every sitemap.
csv_header = ["URL"]
# Seconds to wait on a sitemap request; without a timeout a stalled
# connection would block the cell indefinitely.
sitemap_timeout = 30

with open(export_path, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(csv_header)

    # Loop through sitemap URLs. A failure on one sitemap is reported and
    # skipped so the remaining sitemaps are still exported.
    for sitemap_url in sitemap_urls:
        try:
            response = requests.get(sitemap_url, timeout=sitemap_timeout)
        except requests.RequestException as error:
            print("Error:", error, "for", sitemap_url)
            continue

        if response.status_code != 200:
            print("Error:", response.status_code, "for", sitemap_url)
            continue

        try:
            urls = extract_urls_from_xml(response.content)
        except ET.ParseError as error:
            print("Error:", error, "for", sitemap_url)
            continue

        writer.writerows([url] for url in urls)
        # Report a count rather than echoing every URL, which would flood the
        # notebook output.
        print(len(urls), "URLs found in", sitemap_url)

In [None]:
# Report where the URL list was written.
export_report = ("URLs exported to", export_path)
print(*export_report)

In [7]:
# Locations used by the scraping stage: the folder that receives the text
# chunks, and the URL list (the CSV at `export_path`) that drives it.
text_output_folder = "C:/Users/jjbor/Documents/Machine Learning/Training_Data"
sitemap_urls_csv = export_path

In [8]:
# Download every URL listed in the CSV, extract the article text and export it
# as overlapping text chunks.

# Seconds to wait for an article page; without a timeout a stalled connection
# would tie up a worker thread indefinitely.
request_timeout = 30

# Initialize the increment counter
# Use a mutable object to hold the count variable
count = [0]
# Worker threads share `count`; it is only read/updated while holding this
# lock so two articles can never receive the same number.
count_lock = threading.Lock()


def split_into_chunks(text, max_length=1024, overlap=96):
    """Split ``text`` into windows of at most ``max_length`` characters.

    Consecutive windows start ``max_length - overlap`` characters apart, so
    each one repeats the last ``overlap`` characters of the previous window.
    The final window may be shorter than ``max_length``; no part of the text
    is dropped.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        list[str]: The chunks in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_length``
            (the window would never advance).
    """
    step = max_length - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_length")

    chunks = []
    start_idx = 0
    while start_idx < len(text):
        end_idx = start_idx + max_length
        chunks.append(text[start_idx:end_idx])
        if end_idx >= len(text):
            # This window already reaches the end of the text.
            break
        start_idx += step
    return chunks


def process_url(row):
    """Fetch one article page and write its text to numbered chunk files.

    Args:
        row: A CSV row whose first field is the article URL.

    Side effects:
        Writes ``<article number>_<chunk index>.txt`` files into
        ``text_output_folder``, advances ``count[0]``, prints progress and
        clears the cell output afterwards. Request, parsing and file errors
        are caught and printed rather than raised.
    """
    url = row[0]

    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=request_timeout)
        # Raise an exception for failed requests
        response.raise_for_status()

        html_content = response.content

        # Extract plain text from the HTML element
        soup = BeautifulSoup(html_content, "html.parser")
        div_element = soup.find("div", class_="doccontentwrapper collapse show")
        if div_element:
            extracted_text = div_element.get_text()

            # Print the extracted text
            print("Extracted text from", url, ":")
            print(extracted_text)
            print()

            # Break up the text into 1024-character chunks with a 96-character overlap
            chunks = split_into_chunks(extracted_text, max_length=1024, overlap=96)

            # Reserve this article's number atomically before writing, so
            # concurrent workers cannot reuse it and overwrite these files.
            with count_lock:
                article_number = count[0]
                count[0] += 1

            for i, chunk in enumerate(chunks):
                # Export the chunk to a text file with incrementing numbering
                output_filename = str(article_number) + f"_{i}.txt"
                output_path = os.path.join(text_output_folder, output_filename)
                # Explicit UTF-8: the platform default encoding cannot
                # represent every character that may appear in article text.
                with open(output_path, "w", encoding="utf-8") as output_file:
                    output_file.write(chunk)

                print("Chunk exported to", output_path)
        else:
            print("No matching HTML element found in", url)
    except Exception as e:
        # Report request/extraction failures and carry on with the next URL
        print("Error processing", url + ":", str(e))

    # Clear the cell output (deferred until new output arrives) so the log
    # does not keep accumulating.
    IPython.display.clear_output(wait=True)


# The chunk files are written here; create the folder if it does not exist yet.
os.makedirs(text_output_folder, exist_ok=True)

# Specify if URLs should be limited and by how much
limit_urls, max_urls = False, 10

# Read the CSV file with exported URLs
with open(sitemap_urls_csv, "r", newline="") as file:
    csv_reader = csv.reader(file)
    # Skip header row (tolerating a completely empty file)
    next(csv_reader, None)

    # Create a ThreadPoolExecutor with max_workers set to 5
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Submit URL processing tasks to the ThreadPoolExecutor
        for i, row in enumerate(csv_reader):
            if limit_urls and i >= max_urls:
                break
            if not row:
                # Blank line in the CSV: there is no URL to fetch.
                continue
            executor.submit(process_url, row)

Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_43.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_44.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_45.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_46.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_47.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_48.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_49.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_50.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\376388_51.txt
Chunk exported to C:/Users/jjbor/Documents/GitHub/ThinkTAI/ThinkTAI-GPT2/Training_Data\3763

In [9]:
# Final status line for the scraping run.
completion_message = "Extraction and export completed."
print(completion_message)

Extraction and export completed.
