In [18]:
import requests
from bs4 import BeautifulSoup
import os
import bibtexparser
from glob import glob

def extract_text_from_url(url, timeout=30):
    """Download a page and return the text of its ``div.html-p`` elements.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block the caller
        indefinitely.

    Returns
    -------
    str or None
        The text of every ``<div class="html-p">`` found on the page, one
        chunk per div separated by newlines, with leading and trailing
        whitespace stripped. This is an empty string when the page has no
        such div. ``None`` is returned when the request raises or the
        server answers with a status other than 200; a message is printed
        in both cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, malformed URLs, ... are reported the
        # same way as a bad status code so that a single dead link cannot
        # abort a whole batch of downloads.
        print("Failed to fetch URL:", url, f"({exc})")
        return None

    if response.status_code != 200:
        print("Failed to fetch URL:", url)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('div', class_='html-p')

    # Join once instead of growing a string with += inside a loop.
    return "".join(
        paragraph.get_text(separator='\n') + '\n' for paragraph in paragraphs
    ).strip()

def save_text_to_file(text, filename):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content."""
    with open(filename, mode='w', encoding='utf-8') as handle:
        handle.write(text)


In [19]:
def extract_and_save_text_from_bib(bib_file):
    """Fetch and store the page text for every URL-bearing entry of a BibTeX file.

    For each entry that has a non-empty ``url`` field, the page text is
    obtained via ``extract_text_from_url`` and written to
    ``<bib_file without extension>_texts/<entry ID>.txt``. The output
    directory is created on the first successful extraction. A success or
    failure message is printed per processed entry; entries without a URL
    are skipped silently.

    Parameters
    ----------
    bib_file : str
        Path to the ``.bib`` file to process.
    """
    with open(bib_file, 'r', encoding='utf-8') as bibfile:
        bib_database = bibtexparser.load(bibfile)

    # The output directory depends only on the .bib path, so derive it once
    # instead of once per entry.
    output_dir = os.path.splitext(bib_file)[0] + "_texts"

    for entry in bib_database.entries:
        url = entry.get('url')
        if not url:
            # Missing or empty URL field: nothing to fetch.
            continue

        text = extract_text_from_url(url)
        if not text:
            print(f"Failed to extract text for {entry['ID']}.")
            continue

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{entry['ID']}.txt")  # Using entry ID as filename
        save_text_to_file(text, output_path)
        print(f"Text extracted and saved successfully for {entry['ID']}!")

In [20]:
# Search for all .bib files in the current directory. glob() makes no
# promise about result order, so sort to get a reproducible run and log.
bib_files = sorted(glob("*.bib"))

# Extract and save text from each .bib file
for bib_file in bib_files:
    extract_and_save_text_from_bib(bib_file)

Text extracted and saved successfully for app11073019!
Text extracted and saved successfully for app11114813!
Text extracted and saved successfully for app11114825!
Text extracted and saved successfully for app11156844!
Text extracted and saved successfully for app11167262!
Text extracted and saved successfully for app11167338!
Text extracted and saved successfully for app11167375!
Text extracted and saved successfully for app11167509!
Text extracted and saved successfully for app11167559!
Text extracted and saved successfully for app11167671!
Text extracted and saved successfully for app11178243!
Text extracted and saved successfully for app11188292!
Text extracted and saved successfully for app11219822!
Text extracted and saved successfully for app112110318!
Text extracted and saved successfully for app112110331!
Text extracted and saved successfully for app112210630!
Text extracted and saved successfully for app112210684!
Text extracted and saved successfully for app112210826!
Text 