In [1]:
import json
import re
from bs4 import BeautifulSoup
import html

In [2]:
def load_data(filepath):
    """Load JSON data from a file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized content, exactly as produced by ``json.load``.
    """
    # Decode as UTF-8 explicitly: without it, open() falls back to the
    # platform's locale encoding, which can fail on (or silently mangle)
    # non-ASCII characters stored in the JSON file.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_data(data, filepath):
    """Save data to a JSON file.

    Args:
        data: JSON-serializable object to write.
        filepath: Destination path; an existing file is overwritten.

    The output is pretty-printed with a four-space indent.
    """
    # Pin the file encoding to UTF-8 so the written file does not depend on
    # the platform's locale encoding.
    with open(filepath, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def clean_text(text):
    """Return the plain-text content of an HTML fragment, whitespace-normalized.

    Steps, in order:
      1. Parse ``text`` with BeautifulSoup's "lxml" parser and keep only the
         text, discarding the tags.
      2. Decode any HTML entities still present via ``html.unescape``.
      3. Collapse every run of whitespace into a single space and trim both
         ends.
    """
    without_tags = BeautifulSoup(text, "lxml").text
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()

def remove_reference_marks(text):
    """Strip bracketed numeric markers such as ``[3]`` or ``[42]`` from ``text``.

    Only square brackets that enclose one or more digits are removed; the
    surrounding characters, including whitespace, are left untouched.
    """
    numeric_marker = re.compile(r'\[\d+\]')
    return numeric_marker.sub('', text)

def filter_text(text):
    """Reduce ``text`` to lowercase words made of ASCII letters and digits.

    Numeric reference marks are removed first. Every remaining character that
    is not an ASCII letter, a digit, or whitespace is then replaced by a
    space, whitespace runs are collapsed to single spaces, the ends are
    trimmed, and the result is lowercased.
    """
    # Reference marks go first so their digits are removed rather than kept.
    without_marks = remove_reference_marks(text)
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', without_marks)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip().lower()

def preprocess_summaries(data):
    """Clean every article summary in ``data`` in place.

    ``data`` maps each topic to a list of article dicts. For every article
    the 'summary' value is passed through ``clean_text`` and then
    ``filter_text``, and the result is stored back under 'summary'.
    Nothing is returned; the dicts inside ``data`` are mutated.
    """
    for articles in data.values():
        for article in articles:
            # Strip HTML/entities first, then reduce to lowercase alphanumerics.
            article['summary'] = filter_text(clean_text(article['summary']))


In [3]:
def main():
    """Run the pipeline: read the scraped articles, clean their summaries, write the result."""
    source_path = 'wiki_scraped_articles_final.json'
    target_path = 'cleaned_wiki_articles.json'

    articles_by_topic = load_data(source_path)
    preprocess_summaries(articles_by_topic)  # mutates articles_by_topic in place
    save_data(articles_by_topic, target_path)

In [4]:
# Entry-point guard: run the load -> clean -> save pipeline defined in main()
# only when this code executes as the main module.
if __name__ == "__main__":
    main()

  text = BeautifulSoup(text, "lxml").text


In [5]:
def load_data(filepath):
    """Load JSON data from a file.

    A function with this name is also defined earlier in the notebook; once
    this cell runs, this definition is the one in effect.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized content, exactly as produced by ``json.load``.
    """
    # Decode as UTF-8 explicitly rather than relying on the platform's
    # locale encoding, which may not be able to decode non-ASCII text.
    with open(filepath, 'r', encoding='utf-8') as file_handle:
        return json.load(file_handle)

def present_documents(data, max_docs=10, summary_chars=200):
    """Print a per-topic preview of the documents in ``data``.

    For each topic a header line with the topic name and total document count
    is printed, followed by the title, URL and truncated summary of the first
    ``max_docs`` documents.

    Args:
        data: Mapping of topic name to a list of document dicts. The keys
            'title', 'url' and 'summary' are read from each dict; a
            placeholder string is printed for any key that is absent.
        max_docs: Maximum number of documents shown per topic. Defaults to 10;
            pass ``None`` to show every document.
        summary_chars: Maximum number of summary characters shown. Defaults
            to 200; pass ``None`` to show the whole summary.

    Returns:
        None. All output goes to standard output.
    """
    for topic, documents in data.items():
        print(f"\nTopic: {topic}, Number of Documents: {len(documents)}\n")
        # Slicing with None as the upper bound yields the full list/string,
        # which is what makes None mean "no limit" for both parameters.
        for doc in documents[:max_docs]:
            title = doc.get('title', 'Title Unavailable')
            url = doc.get('url', 'URL Unavailable')
            summary = doc.get('summary', 'Summary Unavailable')[:summary_chars]
            print(f"Title: {title}\nURL: {url}\nSummary: {summary}\n---")

# Read the cleaned JSON file and print a preview of its documents per topic.
data_path = 'cleaned_wiki_articles.json'  # same filename main() writes its output to
scraped_info = load_data(data_path)
present_documents(scraped_info)


Topic: Health, Number of Documents: 6099

Title: Health insurance
URL: https://en.wikipedia.org/wiki/Health_insurance
Summary: health insurance or medical insurance also known as medical aid in south africa is a type of insurance that covers the whole or a part of the risk of a person incurring medical expenses as with other 
---
Title: Health technology
URL: https://en.wikipedia.org/wiki/Health_technology
Summary: health technology is defined by the world health organization as the application of organized knowledge and skills in the form of devices medicines vaccines procedures and systems developed to solve a
---
Title: Sexual and reproductive health
URL: https://en.wikipedia.org/wiki/Sexual_and_reproductive_health
Summary: sexual and reproductive health srh is a field of research health care and social activism that explores the health of an individual s reproductive system and sexual well being during all stages of the
---
Title: Biotechnology
URL: https://en.wikipedia.org/wiki/B