In [5]:
import json
import re
from bs4 import BeautifulSoup
import html

In [6]:
def load_data(filepath):
    """Load and return the JSON document stored at ``filepath``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized JSON content, as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: without an encoding argument open() falls back
    # to the locale's preferred encoding, which is not UTF-8 on every platform
    # and would then fail or mis-decode non-ASCII text.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_data(data, filepath):
    """Serialize ``data`` as JSON into ``filepath``.

    The file is opened in write mode, so existing content is replaced. Output
    is indented with four spaces.

    Args:
        data: JSON-serializable object to write.
        filepath: Destination path of the JSON file.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` contains values ``json.dump`` cannot serialize.
    """
    # Write with an explicit encoding rather than the platform default so the
    # file is produced the same way everywhere it is run.
    with open(filepath, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def clean_text(text):
    """Reduce ``text`` to plain text with single-space separation.

    The input is parsed with BeautifulSoup (lxml parser) and only its text
    content is kept; HTML entities still present afterwards are unescaped;
    finally every whitespace run becomes one space and the ends are trimmed.
    """
    plain = BeautifulSoup(text, "lxml").text
    decoded = html.unescape(plain)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()

def remove_reference_marks(text):
    """Strip numeric bracketed markers such as ``[12]`` from ``text``."""
    # Only brackets that contain one or more digits are matched; nothing is
    # inserted in their place, so surrounding whitespace is left as it was.
    numeric_marker = re.compile(r'\[\d+\]')
    return numeric_marker.sub('', text)

def filter_text(text):
    """Normalize ``text`` to lowercase ASCII letters and digits separated by single spaces.

    Reference marks are removed first; every remaining character that is not
    an ASCII letter, a digit, or whitespace is replaced by a space; whitespace
    runs are then collapsed and the result is trimmed and lowercased.
    """
    without_marks = remove_reference_marks(text)
    # Replace (rather than delete) disallowed characters so that words joined
    # only by punctuation do not get glued together.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', without_marks)
    single_spaced = re.sub(r'\s+', ' ', alnum_only).strip()
    return single_spaced.lower()

def preprocess_summaries(data):
    """Clean and normalize every article summary in ``data``, in place.

    Each string summary is passed through ``clean_text`` and then
    ``filter_text``, and the result is stored back under the article's
    ``'summary'`` key.

    Articles whose ``'summary'`` is missing or is not a string (for example
    ``None``) are left untouched instead of raising, so a single incomplete
    record does not abort the whole run.

    Args:
        data: Mapping of topic to a list of article dicts.

    Returns:
        None. ``data`` is modified in place.
    """
    for articles in data.values():
        for article in articles:
            summary = article.get('summary')
            if not isinstance(summary, str):
                # Nothing to clean; keep the record exactly as it was.
                continue
            # Strip markup/entities first, then reduce to lowercase alphanumerics.
            article['summary'] = filter_text(clean_text(summary))


In [7]:
def main(input_path='wiki_scraped_articles_final.json',
         output_path='cleaned_wiki_articles.json'):
    """Load the articles, clean their summaries, and save the result.

    Args:
        input_path: JSON file to read. Defaults to the file name this
            function previously had hard-coded.
        output_path: JSON file to write the cleaned data to. Defaults to the
            file name this function previously had hard-coded.
    """
    data = load_data(input_path)
    preprocess_summaries(data)  # mutates ``data`` in place
    save_data(data, output_path)

In [9]:
# Run the load -> clean -> save pipeline only when this code is executed as
# the top-level module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

  text = BeautifulSoup(text, "lxml").text


In [10]:
def load_data(filepath):
    """Load and return the JSON document stored at ``filepath``.

    NOTE(review): a function with this same name is defined in an earlier
    cell; this definition replaces it once the cell runs. Consider keeping
    only one of the two.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized JSON content, as produced by ``json.load``.
    """
    # Explicit UTF-8: the default encoding of open() depends on the platform
    # locale and may not be able to decode non-ASCII text in the file.
    with open(filepath, 'r', encoding='utf-8') as file_handle:
        return json.load(file_handle)

def present_documents(data, max_docs=10, summary_chars=200):
    """Print a short preview of the documents stored under each topic.

    For every topic a header with the total document count is printed,
    followed by the title, URL and truncated summary of the first
    ``max_docs`` documents.

    Args:
        data: Mapping of topic to a list of document dicts. The ``'title'``,
            ``'url'`` and ``'summary'`` keys are optional; a placeholder is
            printed for any that are absent.
        max_docs: Maximum number of documents shown per topic (default 10).
        summary_chars: Maximum number of summary characters shown per
            document (default 200).
    """
    for topic, documents in data.items():
        print(f"\nTopic: {topic}, Number of Documents: {len(documents)}\n")
        for doc in documents[:max_docs]:
            title = doc.get('title', 'Title Unavailable')
            url = doc.get('url', 'URL Unavailable')
            summary = doc.get('summary', 'Summary Unavailable')
            if summary is None:
                # The key exists but holds null; slicing None would raise,
                # so fall back to the same placeholder as a missing key.
                summary = 'Summary Unavailable'
            summary = summary[:summary_chars]
            print(f"Title: {title}\nURL: {url}\nSummary: {summary}\n---")

# Read the cleaned articles file (the same file name main() writes by default)
# and print a per-topic preview of its documents.
data_path = 'cleaned_wiki_articles.json'
scraped_info = load_data(data_path)
present_documents(scraped_info)


Topic: Health, Number of Documents: 5559

Title: Telehealth
URL: https://en.wikipedia.org/wiki/Telehealth
Summary: telehealth is the distribution of health related services and information via electronic information and telecommunication technologies it allows long distance patient and clinician contact care advic
---
Title: Meihua Group
URL: https://en.wikipedia.org/wiki/Meihua_Group
Summary: meihua group chinese is a premier global supplier of amino acid nutrition and wellness solutions established in 2002 and headquartered in lhasa tibet autonomous region meihua group finalized its share
---
Title: Health equity
URL: https://en.wikipedia.org/wiki/Health_equity
Summary: health equity arises from access to the social determinants of health specifically from wealth power and prestige individuals who have consistently been deprived of these three determinants are signif
---
Title: Chronic condition
URL: https://en.wikipedia.org/wiki/Chronic_condition
Summary: a chronic condition also k