Importing necessary libraries

In [16]:
import requests
from bs4 import BeautifulSoup
import json

Defining the functions that are going to be used to retrieve the words and their synonyms

In [17]:

# Function to clean and extract the word
def extract_word(entry):
    """Extract the cleaned headword that precedes the ``N.`` marker of an entry.

    The text before the first ``N.`` is cleaned in this order: ``[...]`` spans
    are removed, every character that is neither a letter nor whitespace is
    dropped, one leading and one trailing space are trimmed, a lowercase
    first letter triggers ``str.capitalize`` (which also lowercases the rest),
    and anything before the first uppercase letter is discarded.

    Args:
        entry: Raw text of one thesaurus entry paragraph.

    Returns:
        The cleaned headword, or ``None`` when the entry contains no ``N.``
        marker or when nothing is left of the headword after cleaning.
    """
    parts = entry.split('N.')
    if len(parts) < 2:
        return None

    word = parts[0].strip()

    # Remove [...] spans, brackets included.
    while '[' in word and ']' in word:
        start = word.find('[')
        end = word.find(']') + 1
        if start >= end:
            break  # first ']' precedes first '[': mismatched, avoid looping forever
        word = word[:start] + word[end:]

    # Keep letters and whitespace only (drops numbering and punctuation).
    word = ''.join(ch for ch in word if ch.isalpha() or ch.isspace())

    # Single pass: each non-overlapping pair of spaces becomes one space.
    word = word.replace("  ", " ")

    # Trim at most one leading and one trailing space. startswith/endswith are
    # safe on an empty string, unlike indexing word[0] / word[-1].
    if word.startswith(" "):
        word = word[1:]
    if word.endswith(" "):
        word = word[:-1]

    # Nothing usable is left (e.g. the part before "N." was only a number).
    if not word:
        return None

    if word[0].islower():
        word = word.capitalize()

    # Discard everything before the first uppercase letter.
    for index, ch in enumerate(word):
        if ch.isupper():
            word = word[index:]
            break

    # If a CRLF survived, keep only the text that follows the first one.
    if '\r\n' in word:
        word = word.split('\r\n')[1]

    return word or None

def _remove_enclosed(text, opener, closer):
    """Delete every ``opener ... closer`` span from ``text``, delimiters included."""
    while True:
        start = text.find(opener)
        end = text.find(closer)
        # Stop when a delimiter is missing or the first closer precedes the
        # first opener (mismatched pair).
        if start == -1 or end == -1 or end < start:
            return text
        text = text[:start] + text[end + 1:]


def extract_synonyms(entry):
    """Extract the cleaned list of synonyms that follows the ``N.`` marker.

    Only the text between the first and second ``N.`` is considered. If that
    text contains an em dash, the segment after the first em dash (up to the
    next one, if any) is split on commas and semicolons. Without an em dash
    the whole text is kept as a single candidate.

    Each candidate is then cleaned: the markers ``adj.``, ``Adj.``, ``V.``,
    ``&c.`` and ``v.`` are removed, digits are dropped, ``[...]`` and
    ``{...}`` spans are deleted, and whitespace (including line breaks) is
    collapsed to single spaces. Candidates that end up empty are discarded.

    Args:
        entry: Raw text of one thesaurus entry paragraph.

    Returns:
        A list of cleaned synonym strings; empty when the entry has no
        ``N.`` marker.
    """
    parts = entry.split('N.')
    if len(parts) < 2:
        return []

    segments = parts[1].split('—')
    if len(segments) > 1:
        # Treat ';' like ',' so both act as separators.
        candidates = segments[1].replace(';', ',').split(',')
    else:
        candidates = segments

    cleaned = []
    for text in candidates:
        # NOTE: plain substring removal, so "v." is also stripped from
        # longer abbreviations that end in it (e.g. "adv." becomes "ad").
        for marker in ("adj.", "Adj.", "V.", "&c.", "v."):
            text = text.replace(marker, "")

        # Drop cross-reference numbers.
        text = ''.join(ch for ch in text if not ch.isdigit())

        text = _remove_enclosed(text, '[', ']')
        text = _remove_enclosed(text, '{', '}')

        # Collapse line breaks and the gaps left by the removals above.
        text = ' '.join(text.split())
        if text:
            cleaned.append(text)

    return cleaned

Downloading and parsing the source page, and initializing the empty thesaurus dictionary and the heading-tracking state

In [18]:
# URL of Roget's Thesaurus Classification
url = 'https://www.gutenberg.org/cache/epub/22/pg22-images.html'

# Download page content. Without a timeout, requests waits indefinitely on an
# unresponsive server and the cell would hang instead of failing.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise an error if the download failed

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Nested result dictionary, keyed by heading text at each level
thesaurus = {}

# Most recently seen heading at each level; None means "no heading open"
current_h2, current_h3, current_h4, current_h5 = None, None, None, None

# Becomes True once the first heading of the thesaurus proper has been reached
start_processing = False

Iterating over each class, subclass, category, and subcategory heading to find the words and their synonyms

In [19]:
# Reset the traversal state in this cell so that re-running it on its own
# rebuilds the hierarchy from scratch. Otherwise start_processing would still
# be True from the previous run and headings that precede the first class
# would be added to the result.
thesaurus = {}
current_h2 = current_h3 = current_h4 = current_h5 = None
start_processing = False

for tag in soup.find_all(['h2', 'h3', 'h4', 'h5', 'p']):
    if tag.name == 'h2':
        current_h2 = tag.get_text(strip=True).split('\n')[0]
        # Start collecting at the "CLASS I ..." heading; earlier h2 headings are skipped
        if "CLASS IWORDS EXPRESSING ABSTRACT RELATIONS" in current_h2:
            start_processing = True
        if not start_processing:
            continue  # Skip all processing until the flag is True
        thesaurus[current_h2] = {}
        # A new top-level heading closes every deeper level
        current_h3 = current_h4 = current_h5 = None
    elif start_processing:  # Only process other tags once the first class was seen
        if tag.name == 'h3':
            current_h3 = tag.get_text(strip=True)
            thesaurus[current_h2][current_h3] = {}
            current_h4 = current_h5 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            thesaurus[current_h2][current_h3][current_h4] = {}
            current_h5 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            thesaurus[current_h2][current_h3][current_h4][current_h5] = {}
        elif tag.name == 'p' and 'p2' in tag.get('class', []):
            word = extract_word(tag.text)
            if word:
                # Descend to the deepest heading that is currently open and
                # store the entry there.
                node = thesaurus[current_h2]
                for heading in (current_h3, current_h4, current_h5):
                    if heading is None:
                        break
                    node = node[heading]
                node[word] = extract_synonyms(tag.text)

Saving the dictionary to a JSON file

In [20]:
# Serialize the nested thesaurus dictionary and write it to disk as
# indented, UTF-8 encoded JSON (non-ASCII characters are kept as-is).
json_filename = 'rogets_thesaurus_hierarchy.json'
with open(json_filename, mode='w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(thesaurus, indent=4, ensure_ascii=False))

print(f"Thesaurus hierarchy saved to {json_filename}")

Thesaurus hierarchy saved to rogets_thesaurus_hierarchy.json
