In [1]:
import aiohttp
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import os


In [2]:
nest_asyncio.apply()

API_BASE_URL = "https://en.wikipedia.org/w/api.php"
ARTICLES_PER_TOPIC = 5500
MAX_DEPTH = 6
CHECKPOINT_FILE = 'wiki_scraping_checkpoint.json'
ARTICLES_BEFORE_CHECKPOINT = 1000


In [3]:
# Topics and corresponding subtopics
topics_dictionary = {
    "Health": [
        "Common diseases", "global health statistics", "mental health trends",
        "preventive healthcare", "vaccination campaigns", "healthcare systems",
        "nutrition and wellness", "epidemiology", "medical research", "public health policies",
        "pandemic responses", "mental health therapies", "telemedicine", "healthcare inequalities",
        "aging population health", "childhood obesity", "alternative medicine", "health insurance policies",
        "disease eradication programs", "public health emergencies"
        "Public Health", "Epidemics", "Pandemics", "Health Policy",
        "Global Health", "Mental Health", "Women's Health", "Men's Health",
        "Child Health", "Adolescent Health", "Geriatric Care", "Preventative Care",
        "Health Insurance", "Medical Ethics", "Health Technology", "Telehealth",
        "Health Research", "Clinical Trials", "Vaccine Development", "Drug Development",
        "Medical Imaging", "Surgical Innovations", "Health Education", "Nursing",
        "Dentistry", "Alternative Medicine", "Fitness", "Nutrition",
        "Diseases Prevention", "Chronic Conditions", "Infectious Diseases", "Cancer Research",
        "Cardiology", "Neurology", "Dermatology", "Endocrinology", "Gastroenterology",
        "Psychiatry", "Sports Medicine", "Occupational Health", "Environmental Health"
    ],
    "Environment": [
        "Global warming", "endangered species", "deforestation rates",
        "recycling initiatives", "conservation efforts", "climate change impacts",
        "sustainable practices", "environmental legislation", "green technologies", "biodiversity",
        "ocean pollution", "air quality indices", "urban sustainability", "environmental activism",
        "clean energy technologies", "wildlife conservation", "sustainable agriculture", "water resource management",
        "coral reef protection", "environmental impact assessments"
        "Climate Action", "Sustainable Living", "Renewable Energy", "Solar Power",
        "Wind Energy", "Eco-Friendly Products", "Green Building", "Sustainable Agriculture",
        "Wildlife Conservation", "Marine Conservation", "Pollution Control", "Waste Management",
        "Recycling Programs", "Urban Green Spaces", "National Parks", "Conservation Projects",
        "Environmental Justice", "Air Quality", "Water Quality", "Environmental Impact",
        "Natural Resources", "Ecosystem Services", "Land Management", "Deforestation",
        "Desertification", "Biodiversity Loss", "Climate Adaptation", "Environmental Education",
        "Green Policy", "Environmental Advocacy", "Eco-Tourism", "Sustainable Transport",
        "Environmental Law", "Green Technology", "Carbon Footprint", "Sustainability Goals",
        "Climate Research", "Oceanography", "Green Investing", "Environmental NGOs"
    ],
    "Technology": [
        "Information Technology", "Cloud Computing", "Data Science", "Machine Learning",
        "Artificial Intelligence", "Cybersecurity", "Blockchain", "Cryptocurrency",
        "Fintech", "Biotech", "Medtech", "Robotic Process Automation",
        "Internet of Things (IoT)", "Smart Homes", "Wearable Devices", "Mobile Technology",
        "5G Technology", "Quantum Computing", "Augmented Reality", "Virtual Reality",
        "Gaming Technology", "Software Development", "App Development", "UI/UX Design",
        "Web Development", "Digital Transformation", "Tech Startups", "Silicon Valley",
        "Tech Regulation", "Privacy Technology", "Tech Ethics", "Open Source",
        "E-commerce", "Social Media", "Streaming Technology", "Tech Investments",
        "Tech Patents", "Tech Education", "Technology Trends", "Future Technologies"
    ],
    "Economy": [
        "Global Economy", "Economic Growth", "Financial Markets", "Investment Banking",
        "Consumer Behavior", "Economic Crises", "Public Finance", "Corporate Finance",
        "Economic Theories", "Supply Chain", "Financial Regulation", "Market Analysis",
        "Economic Indicators", "Fiscal Policy", "Monetary Policy", "Economic Sectors",
        "Emerging Markets", "International Trade", "Economic Sanctions", "Business Cycles",
        "Economic Forecasting", "Financial Instruments", "Economic History", "Economic Models",
        "Labor Market", "Real Estate", "Inflation Dynamics", "Venture Capital", "Stock Exchanges",
        "Debt Markets", "Forex Markets", "Financial Technologies", "Economic Policy", "Energy Economics",
        "Environmental Economics", "Health Economics", "Technology and Economy", "Trade Wars", "Manufacturing Sector"
        "Inflation", "Recession", "Forex", "Tariffs",
        "Imports", "Exports", "Taxation", "Subsidies",
        "GDP", "Retail", "Commerce", "Trade",
        "Investing", "Stocks", "Bonds", "Cryptocurrencies",
        "Budgeting", "Savings", "Loans", "Venture Capital",
        "Bankruptcy", "Pensions", "Wealth", "Poverty",
        "Fiscal Policies", "Monetary Policies", "Economic Growth", "Economic Decline",
        "Market Trends", "Economic Crises", "Financial Markets", "Commodities"
    ],
    "Entertainment": [
        "Film Production", "Music Production", "Theater Production", "Art Exhibitions",
        "Entertainment Law", "Media Studies", "Celebrity Culture", "New Media",
        "Art History", "Video Production", "Festival Management", "Music Festivals",
        "Film Festivals", "Broadcast Media", "Digital Entertainment", "Literature",
        "Comedy", "Opera", "Ballet", "Circus",
        "Magic Shows", "Concert Tours", "Graphic Novels", "Animation Studios",
        "Radio Broadcasting", "Photography", "Documentary Film", "Reality Television",
        "Satellite Television", "Cable Television", "Streaming Services", "Video Gaming Industry",
        "Publishing Industry", "News Media", "Celebrity News", "Artistic Performance", "Visual Arts",
        "Dance Industry", "Entertainment Marketing", "Cultural Industry"
        "Cinema", "Theatre", "Concerts", "Albums",
        "Streaming", "Television", "Documentaries", "Sitcoms",
        "Reality Shows", "Talk Shows", "Awards", "Galleries",
        "Festivals", "Nightlife", "Clubs", "Dance",
        "Acting", "Directing", "Producing", "Screenwriting",
        "Playwriting", "Animations", "Comics", "Manga",
        "Video Games", "Esports", "Streaming Platforms", "Music Tours",
        "Celebrity Gossip", "Fashion Shows", "Media Events"
    ],
    "Sports": [
        "Major sporting events", "sports analytics", "Olympic games",
        "World Cup soccer", "NBA highlights", "MLB seasons", "tennis grand slams",
        "golf championships", "athletics records", "extreme sports", "sports medicine",
        "fitness trends", "e-sports developments", "sports law",
        "youth sports development", "sports psychology", "paralympic competitions", "sports marketing",
        "international sports federations", "sports anti-doping measures",
        "Football", "Basketball", "Baseball", "Soccer",
        "Cricket", "Rugby", "Tennis", "Golf",
        "Swimming", "Track", "Field", "Boxing",
        "MMA", "Cycling", "Skiing", "Snowboarding",
        "Skateboarding", "Surfing", "Diving", "Archery",
        "Biathlon", "Bowling", "Cheerleading", "Curling",
        "Darts", "Fencing", "Gymnastics", "Handball",
        "Hockey", "Lacrosse", "Rowing", "Sailing"
    ],
    "Politics": [
        "Political Science", "Constitutional Law", "Political Philosophy", "Social Justice",
        "Human Rights Law", "International Organizations", "Political History", "Political Ethics",
        "Political Communication", "Political Economy", "Electoral Systems", "Political Ideologies",
        "Public Administration", "Federal Government", "State Government", "Local Government",
        "Political Activism", "Foreign Affairs", "Defense Policy", "Immigration Policy",
        "Environmental Policy", "Education Policy", "Healthcare Policy", "Welfare Policy",
        "Urban Policy", "Rural Development", "Public Sector", "Political Consulting", "Geopolitics",
        "Global Governance", "Civil Society", "Public Opinion", "Political Campaigns", "Voting Behavior",
        "Electoral Reforms", "Diplomatic Missions", "Policy Analysis", "Public Law", "Regulatory Frameworks"
        "Democracy", "Dictatorship", "Federal", "State",
        "Local", "Elections", "Senates", "Parliaments",
        "Constitutions", "Courts", "Laws", "Bills",
        "Rights", "Liberties", "Policies", "Campaigns",
        "Debates", "Diplomacy", "Sanctions", "Treaties",
        "Unions", "NGOs", "Activism", "Protests",
        "Rallies", "Voting", "Governance", "Regulations",
        "Legislations", "Amendments", "Lobbying", "Corruption"
    ],
    "Education": [
        "Primary Education", "Secondary Education", "Higher Education", "Adult Education",
        "Vocational Training", "Online Learning", "E-Learning", "Blended Learning",
        "Educational Technology", "Curriculum Development", "Pedagogy", "Instructional Design",
        "Special Education", "Gifted Education", "Language Learning", "Literacy",
        "Numeracy", "STEM Education", "Arts Education", "Civic Education",
        "Teacher Training", "Teaching Methods", "Classroom Management", "Education Policy",
        "Education Reform", "International Baccalaureate", "Student Loans", "Scholarships",
        "School Safety", "Bullying Prevention", "Extracurricular Activities", "School Counseling",
        "Education Standards", "Learning Disabilities", "Test Preparation", "Academic Competitions",
        "Study Skills", "Career Counseling", "Life Skills", "Cultural Competence in Education"
    ],
    "Travel": [
        "Top tourist destinations", "airline industry data", "travel trends",
        "luxury travel", "budget travel", "cultural tourism", "eco-tourism",
        "space travel", "travel safety", "travel insurance", "hotel industry",
        "destination reviews", "travel guides", "festival tourism",
        "adventure travel", "culinary tourism", "heritage tourism", "sustainable travel",
        "digital nomad lifestyle", "tourism marketing",
        "Adventure Travel", "Budget Travel", "Luxury Travel", "Cultural Tourism",
        "Heritage Tourism", "Wildlife Tourism", "Sports Tourism", "Medical Tourism",
        "Wellness Tourism", "Festival Tourism", "Culinary Tourism", "Wine Tourism",
        "Eco-Tourism", "Sustainable Travel", "Volunteer Travel", "Educational Tourism",
        "Space Tourism", "Historical Sites", "World Heritage Sites", "Tourist Attractions",
        "Travel Safety", "Travel Insurance", "Air Travel", "Cruise Travel",
        "Train Travel", "Road Trips", "Solo Travel", "Group Travel",
        "Family Travel", "Business Travel", "Global Destinations", "Travel Guides",
        "Travel Tips", "Travel Gear", "Travel Planning", "Travel Apps",
        "Hospitality Industry", "Tour Operators", "Travel Agencies", "Travel Trends"
    ],
    "Food": [
        "Food Preparation", "Food Preservation", "Food and Culture", "Food Justice",
        "Food Security", "Food Policy", "Food Service", "Food Retail",
        "Organic Products", "Food Labeling", "Farm-to-Table", "Culinary Techniques",
        "Food Criticism", "Food Education", "Food Festivals", "Food Marketing",
        "Food Import", "Food Export", "Agricultural Practices", "Food and Nutrition",
        "Gourmet Food", "Ethnic Foods", "Dietary Supplements", "Functional Foods",
        "Food Innovations", "Food Safety Standards", "Food Packaging", "Food Logistics",
        "Food Industry Trends", "Nutrition Science", "Culinary Schools", "Food Photography",
        "Food Writing", "Food Blogs", "Diet Trends", "Meal Planning", "Cooking Shows",
        "Food Processing", "Beverage Industry", "Wine and Spirits",
        "Beverages", "Cereals", "Dairy", "Fruits",
        "Vegetables", "Meats", "Seafood", "Confectionery",
        "Snacks", "Desserts", "Breads", "Pastas",
        "Chefs", "Recipes", "Diets", "Nutrition",
        "Vitamins", "Proteins", "Fats", "Carbohydrates",
        "Organics", "Veganism", "Gluten-Free", "Keto",
        "Paleo", "Fast Food", "Fine Dining", "Casual Dining",
        "Food Chains", "Local Eats", "Street Food", "Food Security"
    ]
}


In [4]:
topic_keywords = {
    "Health": [
        "health", "disease", "medical", "treatment", "vaccine", "epidemiology",
        "healthcare", "nutrition", "pandemic", "therapy", "telemedicine",
        "aging", "obesity", "insurance", "mental", "preventive"
    ],
    "Environment": [
        "climate", "biodiversity", "conservation", "recycling", "sustainable",
        "pollution", "ecology", "green", "energy", "wildlife", "ocean",
        "forest", "deforestation", "sustainability", "environmental"
    ],
    "Technology": [
        "technology", "software", "data", "AI", "cybersecurity", "blockchain",
        "robotics", "internet", "cloud", "quantum", "virtual", "gaming",
        "digital", "fintech", "biotech", "innovation", "computing"
    ],
    "Economy": [
        "economy", "finance", "market", "banking", "investment", "trade",
        "fiscal", "monetary", "inflation", "stock", "debt", "forex",
        "economic", "corporate", "financial", "growth"
    ],
    "Entertainment": [
        "film", "music", "theater", "media", "celebrity", "festival",
        "broadcast", "digital", "production", "entertainment", "streaming",
        "literature", "art", "dance", "opera", "cinema"
    ],
    "Sports": [
        "sport", "olympic", "soccer", "basketball", "baseball", "tennis",
        "golf", "athletics", "e-sports", "fitness", "championship", "league",
        "tournament", "athlete", "sports"
    ],
    "Politics": [
        "political", "constitution", "democracy", "election", "policy",
        "government", "law", "rights", "legislation", "campaign", "senate",
        "parliament", "diplomacy", "federal", "state", "local"
    ],
    "Education": [
        "education", "curriculum", "learning", "school", "college",
        "university", "academic", "teacher", "student", "teaching",
        "pedagogy", "scholarship", "vocational", "online learning", "e-learning"
    ],
    "Travel": [
        "travel", "tourism", "destination", "airline", "hotel", "guide",
        "festival", "cultural", "heritage", "eco-tourism", "sustainable travel",
        "adventure", "luxury", "budget", "safety", "insurance"
    ],
    "Food": [
        "food", "culinary", "nutrition", "organic", "diet", "cooking",
        "gourmet", "restaurant", "agriculture", "farming", "safety",
        "packaging", "beverage", "wine", "dining", "eating"
    ]
}


In [5]:
def load_checkpoint():
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as file:
            data = json.load(file)
            return data['articles'], set(data['urls'])
    return {}, set()

def save_checkpoint(documented_articles, unique_article_urls):
    checkpoint_data = {'articles': documented_articles, 'urls': list(unique_article_urls)}
    with open(CHECKPOINT_FILE, 'w') as file:
        json.dump(checkpoint_data, file, indent=4)

async def retrieve_articles(session, search_topic, continue_params=None):
    search_params = {
        "action": "query",
        "list": "search",
        "srsearch": search_topic,
        "format": "json",
        "srlimit": 200
    }
    if continue_params:
        search_params.update(continue_params)

    async with session.get(API_BASE_URL, params=search_params) as response:
        response_data = await response.json()
        search_continue = response_data.get('continue')
        articles = response_data["query"]["search"] if 'query' in response_data else []
        return articles, search_continue

async def retrieve_revision_id(session, article_title):
    revision_params = {
        "action": "query",
        "prop": "revisions",
        "titles": article_title,
        "rvprop": "ids",
        "format": "json"
    }
    async with session.get(API_BASE_URL, params=revision_params) as response:
        response_data = await response.json()
        page_data = next(iter(response_data["query"]["pages"].values()))
        return page_data["revisions"][0]["revid"]

async def extract_article_summary(session, url):
    async with session.get(url) as response:
        page_content = BeautifulSoup(await response.text(), 'html.parser')
        return ' '.join(p.text for p in page_content.select('p'))

def is_relevant(summary, topic):
    keywords = topic_keywords[topic]
    relevant_count = sum(1 for keyword in keywords if keyword in summary.lower())
    return relevant_count >= 3  # You can adjust this threshold based on your needs

async def explore_deep_links(session, url, depth, visited_urls, articles_accumulator, documented_articles, unique_article_urls, main_topic):
    if depth > MAX_DEPTH or len(articles_accumulator) >= ARTICLES_PER_TOPIC:
        return
    async with session.get(url) as response:
        soup = BeautifulSoup(await response.text(), 'html.parser')
        for link in soup.find_all('a', href=True):
            link_url = urljoin(url, link['href'])
            if link_url not in visited_urls and '/wiki/' in urlparse(link_url).path and not any(x in link_url for x in [':', '#', 'Main_Page']):
                visited_urls.add(link_url)
                summary = await extract_article_summary(session, link_url)
                if summary and is_relevant(summary, main_topic):
                    article_title = link_url.split('/')[-1].replace('_', ' ')
                    revision_id = await retrieve_revision_id(session, article_title)
                    articles_accumulator.append({
                        'revision_id': revision_id,
                        'title': article_title,
                        'summary': summary,
                        'url': link_url,
                        'topic': main_topic
                    })
                    if len(articles_accumulator) % ARTICLES_BEFORE_CHECKPOINT == 0:
                        save_checkpoint(documented_articles, unique_article_urls)
                    if len(articles_accumulator) >= ARTICLES_PER_TOPIC:
                        break
                await explore_deep_links(session, link_url, depth + 1, visited_urls, articles_accumulator, documented_articles, unique_article_urls, main_topic)

async def collect_articles_for_subtopic(session, main_topic, subtopic, articles_accumulator, unique_article_urls, documented_articles):
    continue_params = None
    while True:
        articles_found, continue_params = await retrieve_articles(session, subtopic, continue_params)
        for article in articles_found:
            article_url = f"https://en.wikipedia.org/wiki/{article['title'].replace(' ', '_')}"
            if article_url in unique_article_urls or len(articles_accumulator) >= ARTICLES_PER_TOPIC:
                continue
            unique_article_urls.add(article_url)
            summary = await extract_article_summary(session, article_url)
            if summary and is_relevant(summary, main_topic):
                revision_id = await retrieve_revision_id(session, article['title'])
                articles_accumulator.append({
                    'revision_id': revision_id,
                    'title': article['title'],
                    'summary': summary,
                    'url': article_url,
                    'topic': main_topic
                })
                if len(articles_accumulator) % ARTICLES_BEFORE_CHECKPOINT == 0:
                    save_checkpoint(documented_articles, unique_article_urls)
        if not continue_params or len(articles_accumulator) >= ARTICLES_PER_TOPIC:
            break

async def execute_scraping():
    async with aiohttp.ClientSession() as session:
        documented_articles, unique_article_urls = load_checkpoint()
        collection_tasks = []
        for topic, subtopics in topics_dictionary.items():
            documented_articles.setdefault(topic, [])
            for subtopic in subtopics:
                task = asyncio.create_task(
                    collect_articles_for_subtopic(
                        session, topic, subtopic, documented_articles[topic], unique_article_urls, documented_articles
                    )
                )
                collection_tasks.append(task)
                if len(documented_articles[topic]) >= ARTICLES_PER_TOPIC:
                    break
        await asyncio.gather(*collection_tasks)
        save_checkpoint(documented_articles, unique_article_urls)

        with open('wiki_scraped_articles_final.json', 'w') as file:
            json.dump(documented_articles, file, indent=4)

In [6]:
if __name__ == "__main__":
    asyncio.run(execute_scraping())

In [7]:
def load_data(filepath):
    with open(filepath, 'r') as file_handle:
        return json.load(file_handle)

def present_documents(data):
    for topic, documents in data.items():
        print(f"\nTopic: {topic}, Number of Documents: {len(documents)}\n")
        for doc in documents[:10]:
            title = doc.get('title', 'Title Unavailable')
            url = doc.get('url', 'URL Unavailable')
            summary = doc.get('summary', 'Summary Unavailable')[:200]
            print(f"Title: {title}\nURL: {url}\nSummary: {summary}\n---")

data_path = 'wiki_scraped_articles_final.json'
scraped_info = load_data(data_path)
present_documents(scraped_info)


Topic: Health, Number of Documents: 5545

Title: Medical research
URL: https://en.wikipedia.org/wiki/Medical_research
Summary: Medical research (or biomedical research), also known as health research, refers to the process of using scientific methods with the aim to produce knowledge about human diseases, the prevention and t
---
Title: Global health
URL: https://en.wikipedia.org/wiki/Global_health
Summary: 
 Global health is the health of populations in a worldwide context;[1] it has been defined as "the area of study, research, and practice that places a priority on improving health and achieving equit
---
Title: Community mental health service
URL: https://en.wikipedia.org/wiki/Community_mental_health_service
Summary: 
 Community mental health services (CMHS), also known as community mental health teams (CMHT) in the United Kingdom, support or treat people with mental disorders (mental illness or mental health diff
---
Title: Preventive healthcare
URL: https://en.wikipedia.org/wiki