In [1]:
import requests
import json
import os
import time
from tqdm import tqdm

# Define the save directory
SAVE_DIR = os.path.join("Data", "Wikipedia_data")
os.makedirs(SAVE_DIR, exist_ok=True)  # Ensure directory exists

# List of restaurant-related Wikipedia articles to retrieve
TOPIC_TITLES = [
    "Restaurant",
    "History of restaurants",
    "Food trends",
    "Fusion cuisine",
    "Molecular gastronomy",
    "Nouvelle cuisine",
    "Vegetarianism",
    "Veganism",
    "Gluten-free diet",
    "Ketogenic diet",
    "Paleo diet",
    "Restaurant rating",
    "Food critic",
    "Michelin Guide",
    "Zagat Survey",
    "James Beard Award",
    "World's 50 Best Restaurants",
    "Gault Millau",
    "Food delivery",
    "Uber Eats",
    "Ghost kitchen",
    "Automated restaurant",
    "Farm-to-table",
    "Zero-waste movement",
    "Sustainable Restaurant Association",
    "Slow Food",
    "Social media marketing",
    "Food photography",
    "Influencer marketing",
    "TikTok food trends"
]

# Wikipedia API endpoint
WIKI_API_URL = "https://en.wikipedia.org/w/api.php"

# Function to fetch article details from Wikipedia API
def fetch_wikipedia_articles(titles):
    articles = []
    
    for title in tqdm(titles, desc="Fetching Wikipedia Articles"):
        params = {
            "action": "query",
            "format": "json",
            "prop": "extracts|info",
            "exintro": True,
            "explaintext": True,
            "inprop": "url",
            "titles": title
        }

        try:
            response = requests.get(WIKI_API_URL, params=params)
            data = response.json()

            if "query" in data and "pages" in data["query"]:
                for page_id, page in data["query"]["pages"].items():
                    if int(page_id) > 0:  # Valid Wikipedia page
                        articles.append({
                            "title": page.get("title", ""),
                            "url": page.get("fullurl", ""),
                            "summary": page.get("extract", "")
                        })

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {title}: {e}")

        # Sleep to avoid API rate limiting
        time.sleep(1)

    return articles

# Function to save articles as JSON in the specified directory
def save_to_json(data, filename="wikipedia_restaurant_knowledge.json"):
    file_path = os.path.join(SAVE_DIR, filename)
    
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

    print(f"Saved {len(data)} articles to {file_path}")

# Run script
if __name__ == "__main__":
    wikipedia_articles = fetch_wikipedia_articles(TOPIC_TITLES)
    save_to_json(wikipedia_articles)


Fetching Wikipedia Articles: 100%|██████████| 30/30 [00:35<00:00,  1.17s/it]

Saved 29 articles to Data/Wikipedia_data/wikipedia_restaurant_knowledge.json





In [2]:
import requests
import json
import os
import time
from tqdm import tqdm

# Define the save directory
SAVE_DIR = os.path.join("Data", "Wikipedia_data2")
os.makedirs(SAVE_DIR, exist_ok=True)  # Ensure directory exists

# Wikipedia API endpoint
WIKI_API_URL = "https://en.wikipedia.org/w/api.php"

# Seed categories to retrieve multiple articles
TOPIC_CATEGORIES = [
    "Category:Restaurants",
    "Category:Restaurant chains",
    "Category:Food and drink awards",
    "Category:Food trends",
    "Category:Restaurant terminology",
    "Category:History of food and drink",
    "Category:Sustainable food system",
    "Category:Food delivery",
    "Category:Michelin Guide",
    "Category:Food critics",
    "Category:Culinary trends",
    "Category:Celebrity chefs",
    "Category:Restaurant guides",
    "Category:Restaurant review websites",
    "Category:Fast food"
]

# Function to get articles from a Wikipedia category
def get_articles_from_category(category, max_articles=50):
    """Fetch articles from a specific Wikipedia category."""
    articles = []
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": category,
        "cmlimit": max_articles
    }

    try:
        response = requests.get(WIKI_API_URL, params=params)
        data = response.json()
        
        if "query" in data and "categorymembers" in data["query"]:
            for item in data["query"]["categorymembers"]:
                if "Category:" not in item["title"]:  # Exclude subcategories
                    articles.append(item["title"])

    except requests.exceptions.RequestException as e:
        print(f"Error fetching category {category}: {e}")

    time.sleep(1)  # Respect API rate limits
    return articles

# Function to fetch article details from Wikipedia API
def fetch_wikipedia_articles(titles):
    """Retrieve Wikipedia summaries for a list of article titles."""
    articles = []
    
    for title in tqdm(titles, desc="Fetching Wikipedia Articles"):
        params = {
            "action": "query",
            "format": "json",
            "prop": "extracts|info",
            "exintro": True,
            "explaintext": True,
            "inprop": "url",
            "titles": title
        }

        try:
            response = requests.get(WIKI_API_URL, params=params)
            data = response.json()

            if "query" in data and "pages" in data["query"]:
                for page_id, page in data["query"]["pages"].items():
                    if int(page_id) > 0:  # Valid Wikipedia page
                        articles.append({
                            "title": page.get("title", ""),
                            "url": page.get("fullurl", ""),
                            "summary": page.get("extract", "")
                        })

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {title}: {e}")

        time.sleep(1)  # Respect API rate limits

    return articles

# Function to save articles as JSON in the specified directory
def save_to_json(data, filename="wikipedia_restaurant_knowledge.json"):
    file_path = os.path.join(SAVE_DIR, filename)
    
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

    print(f"Saved {len(data)} articles to {file_path}")

# Run script
if __name__ == "__main__":
    # Collect articles from multiple categories
    all_article_titles = set()  # Use a set to avoid duplicates

    for category in tqdm(TOPIC_CATEGORIES, desc="Fetching Categories"):
        articles = get_articles_from_category(category, max_articles=100)
        all_article_titles.update(articles)

    print(f"Total articles collected: {len(all_article_titles)}")

    # Fetch Wikipedia summaries
    wikipedia_articles = fetch_wikipedia_articles(list(all_article_titles))

    # Save to JSON
    save_to_json(wikipedia_articles)


Fetching Categories: 100%|██████████| 15/15 [00:17<00:00,  1.15s/it]


Total articles collected: 366


Fetching Wikipedia Articles: 100%|██████████| 366/366 [07:04<00:00,  1.16s/it]

Saved 366 articles to Data/Wikipedia_data2/wikipedia_restaurant_knowledge.json



