In [9]:
import requests
from bs4 import BeautifulSoup
import csv
from tqdm import tqdm

# Scrape the SANS glossary page and export every term/definition pair to
# 'sans_glossary_terms.csv' (two columns: Term, Definition).
#
# Parsing model: every <p> that contains a <strong> tag starts a new term
# (the <strong> text is the term); the text of that <p> and of any following
# <p> without a <strong> is collected as the term's definition.
# NOTE(review): any <strong> inside a paragraph starts a new term, so bold
# emphasis inside a definition would be mistaken for a term -- confirm
# against the live page markup.

# Fetch the webpage content
url = "https://www.sans.org/security-resources/glossary-of-terms/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page into a bogus CSV.
response.raise_for_status()

# Parse the webpage content
soup = BeautifulSoup(response.text, 'html.parser')

# Find all <p> tags
p_tags = soup.find_all('p')

# Accumulated [term, definition] rows
terms_and_definitions = []

# State for the term currently being collected
current_term = None
current_definition = []

# Loop through each <p> tag and extract the terms and definitions
for p in tqdm(p_tags, desc="Processing rows"):
    # A <strong> tag marks the start of a new term
    strong_tag = p.find('strong')

    if strong_tag:
        # Flush the previous term; terms without any definition are dropped
        if current_term and current_definition:
            terms_and_definitions.append([current_term, ' '.join(current_definition)])

        # Use an explicit ' ' separator: with strip=True and the default
        # empty separator, text around inline tags is glued together.
        current_term = strong_tag.get_text(' ', strip=True)
        current_definition = []

    # Paragraphs seen before the first term are not part of any definition
    if not current_term:
        continue

    p_text = p.get_text(' ', strip=True)

    # A paragraph that is nothing but the term itself carries no definition
    if p_text == current_term:
        continue

    # When term and definition share one <p>, the paragraph text starts with
    # the term; cut it off so the definition is not prefixed by its own term.
    if strong_tag and p_text.startswith(current_term):
        p_text = p_text[len(current_term):].strip()

    # Skip empty paragraphs so they do not produce blank definitions
    if p_text:
        current_definition.append(p_text)

# After the loop, save the last term and definition if present
if current_term and current_definition:
    terms_and_definitions.append([current_term, ' '.join(current_definition)])

# Write the extracted terms and definitions to a CSV file
# (newline='' lets the csv module control line endings itself)
with open('sans_glossary_terms.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Term', 'Definition'])
    writer.writerows(terms_and_definitions)

print(f"Extracted {len(terms_and_definitions)} terms and definitions and saved them to 'sans_glossary_terms.csv'.")


Processing rows: 100%|████████████████████████████████████████████████████████████| 484/484 [00:00<00:00, 28307.09it/s]

Extracted 459 terms and definitions and saved them to 'sans_glossary_terms.csv'.



