# Scraping Component
This notebook automatically discovers and scrapes all South Asian cuisine-related pages
from Wikipedia and saves them as a JSON corpus for use in the CuisineRAG system.

**Output:** `data/raw/south_asian_corpus.json`

## 1. Install & Import Libraries

In [1]:
import wikipediaapi
import json
import os
import re
import time
from urllib.parse import unquote

# Shared English-Wikipedia client used by the discovery and scraping cells.
# The user agent carries a contact address (replace with your real student email).
wiki = wikipediaapi.Wikipedia(
    user_agent='CuisineRAG/1.0 (mingyi.jin@student.manchester.ac.uk)',
    language='en',
)

print('Libraries loaded successfully!')


Libraries loaded successfully!


## 2. Discover All South Asian Food Pages
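We start from six seed pages (the main South Asian cuisine article plus the Indian,
Pakistani, Bangladeshi, Sri Lankan, and Nepalese cuisine articles) and automatically
extract every linked page whose title looks food-related.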

In [2]:
# Seed articles whose outgoing links are scanned for candidate pages.
SEED_PAGES = [
    'South_Asian_cuisine', 'Indian_cuisine', 'Pakistani_cuisine',
    'Bangladeshi_cuisine', 'Sri_Lankan_cuisine', 'Nepalese_cuisine',
]

# A lower-cased title containing any of these substrings is rejected outright.
EXCLUDE_TITLE_KEYWORDS = {
    'template',
    'list of',
    'index of',
    'disambiguation',
    'category',
    'outline of',
    'timeline of',
    'history of',
    'geography of',
    'demographics',
}

# Substrings that mark a title as food-related; the same list is counted in
# page text to compute the food density.
FOOD_KEYWORDS = [
    # general terms
    'cuisine', 'food', 'dish', 'recipe', 'cooking',
    # staples and preparations
    'curry', 'rice', 'bread', 'spice', 'biryani', 'masala',
    'dal', 'roti', 'naan', 'chutney', 'kebab', 'samosa',
    # courses and drinks
    'dessert', 'sweet', 'snack', 'drink', 'beverage', 'soup', 'salad',
    # ingredients
    'chicken', 'lamb', 'mutton', 'lentil', 'vegetable', 'paneer',
]

# Substrings that signal a South Asian connection in a page's title,
# summary or opening text.
SOUTH_ASIA_KEYWORDS = [
    'south asia',
    'india', 'indian',
    'pakistan', 'pakistani',
    'bangladesh', 'bangladeshi',
    'sri lanka', 'sri lankan',
    'nepal', 'nepalese',
    'kashmir', 'bengal', 'punjab', 'tamil',
    'mughlai', 'awadhi', 'hyderabadi',
]


def normalize_page_title(title: str) -> str:
    """Turn a Wikipedia page key or link path into a plain, comparable title.

    Percent-escapes are decoded, any '/wiki/' fragment is removed, underscores
    become spaces, and whitespace is trimmed and collapsed to single spaces.
    """
    decoded = unquote(title)
    without_prefix = decoded.replace('/wiki/', '')
    spaced = without_prefix.replace('_', ' ').strip()
    return re.sub(r'\s+', ' ', spaced)


def is_food_title_candidate(page_title: str) -> bool:
    """Decide from the title alone whether a linked page is worth scraping.

    The title is normalized and lower-cased, then rejected if it contains a
    colon or any substring from EXCLUDE_TITLE_KEYWORDS. Otherwise it is
    accepted only when it contains at least one FOOD_KEYWORDS substring.
    """
    title = normalize_page_title(page_title).lower()

    # Titles with a colon are dropped before any keyword check.
    if ':' in title:
        return False

    for blocked in EXCLUDE_TITLE_KEYWORDS:
        if blocked in title:
            return False

    for keyword in FOOD_KEYWORDS:
        if keyword in title:
            return True
    return False


def discover_links_from_seed(seed_title: str) -> set[str]:
    """Return the titles linked from a seed article that pass the title filter.

    The seed is looked up through the shared `wiki` client. A missing seed is
    reported on stdout and yields an empty set.
    """
    seed_page = wiki.page(seed_title)
    if seed_page.exists():
        return {
            candidate
            for candidate in seed_page.links.keys()
            if is_food_title_candidate(candidate)
        }

    print(f'  ! Seed missing: {seed_title}')
    return set()


print('Discovering pages from seed pages...')
discovery_log = []
all_candidate_pages = set()

# Visit the seeds one at a time, merging their food-like links into a single
# de-duplicated set and recording how many candidates each seed contributed.
for seed in SEED_PAGES:
    print(f'  Scanning: {seed}')
    links = discover_links_from_seed(seed)
    all_candidate_pages |= links
    discovery_log.append(dict(seed=seed, discovered_candidates=len(links)))
    time.sleep(0.5)  # pause before moving on to the next seed

print(f'\nTotal candidate pages found: {len(all_candidate_pages)}')


Discovering pages from seed pages...
  Scanning: South_Asian_cuisine
  Scanning: Indian_cuisine
  Scanning: Pakistani_cuisine
  Scanning: Bangladeshi_cuisine
  Scanning: Sri_Lankan_cuisine
  Scanning: Nepalese_cuisine

Total candidate pages found: 701


## 3. Filter Relevant Pages
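Not all discovered links are relevant. The cell below fetches each candidate page and
keeps it only if it is long enough, has a sufficient density of food keywords, and
mentions a South Asian region in its title, summary, or opening text.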

In [3]:
def scrape_wikipedia_page(page_title: str) -> dict | None:
    """
    Scrape a Wikipedia page and return structured data.
    Returns None for missing pages.
    """
    page = wiki.page(page_title)
    if not page.exists():
        return None

    text = f'# {page.title}\n\n'
    text += (page.summary or '') + '\n\n'

    for section in page.sections:
        text += f'## {section.title}\n'
        text += (section.text or '') + '\n\n'
        for subsection in section.sections:
            text += f'### {subsection.title}\n'
            text += (subsection.text or '') + '\n\n'

    return {
        'title': page.title,
        'source_title': page_title,
        'url': page.fullurl,
        'summary': page.summary,
        'text': text.strip(),
        'num_characters': len(text),
    }


def content_food_density(text: str) -> float:
    """Return food-keyword occurrences per whitespace-separated token.

    Every FOOD_KEYWORDS entry is counted as a substring of the lower-cased
    text. The token count is floored at one so empty text yields 0.0.
    """
    haystack = text.lower()

    keyword_hits = 0
    for keyword in FOOD_KEYWORDS:
        keyword_hits += haystack.count(keyword)

    n_tokens = len(haystack.split()) or 1
    return keyword_hits / n_tokens


def is_south_asia_relevant(title: str, summary: str, text: str) -> bool:
    """Report whether the page shows at least one South Asia signal.

    Only the title, the summary and the first 2000 characters of the body
    are searched, case-insensitively, for SOUTH_ASIA_KEYWORDS substrings.
    """
    window = f"{title}\n{summary}\n{text[:2000]}".lower()
    for keyword in SOUTH_ASIA_KEYWORDS:
        if keyword in window:
            return True
    return False


# Content-level acceptance thresholds applied by the scraping loop.
MIN_CHARS = 400
MIN_FOOD_DENSITY = 0.003

# Accumulators filled in while crawling.
seen_urls = set()
corpus = []

# Audit trail: the settings used for this run, the per-seed discovery counts,
# and one entry per candidate page under 'accepted', 'skipped' or 'errors'.
crawl_log = dict(
    config=dict(
        seeds=SEED_PAGES,
        min_chars=MIN_CHARS,
        min_food_density=MIN_FOOD_DENSITY,
        south_asia_keywords=SOUTH_ASIA_KEYWORDS,
    ),
    discovery=discovery_log,
    accepted=[],
    skipped=[],
    errors=[],
)

# Crawl every discovered candidate in sorted title order, apply the content
# filters, and record the outcome of each page in crawl_log.
food_pages = sorted(all_candidate_pages)
print(f'Scraping {len(food_pages)} candidate pages...\n')

for i, page_title in enumerate(food_pages, start=1):
    try:
        result = scrape_wikipedia_page(page_title)
        if not result:
            # scrape_wikipedia_page returns None when the page does not exist.
            # NOTE(review): this `continue` also bypasses the time.sleep(0.3)
            # at the bottom of the loop body.
            crawl_log['skipped'].append({'title': page_title, 'reason': 'page_not_found'})
            print(f'[{i}/{len(food_pages)}] - Skipped: {page_title} (not found)')
            continue

        # Computed before the filters so it can be stored on the record and
        # reported in the skip log.
        density = content_food_density(result['text'])
        result['food_density'] = round(density, 6)

        # The checks run in order and the first one that fails decides the
        # logged reason. URLs enter seen_urls only on acceptance, so a page
        # that was skipped earlier is re-evaluated (not flagged as a
        # duplicate) when it is reached again through another title.
        if result['url'] in seen_urls:
            crawl_log['skipped'].append({'title': result['title'], 'reason': 'duplicate_url', 'url': result['url']})
            print(f'[{i}/{len(food_pages)}] - Skipped: {result["title"]} (duplicate URL)')
        elif result['num_characters'] < MIN_CHARS:
            crawl_log['skipped'].append({'title': result['title'], 'reason': 'too_short', 'num_characters': result['num_characters']})
            print(f'[{i}/{len(food_pages)}] - Skipped: {result["title"]} (too short)')
        elif density < MIN_FOOD_DENSITY:
            crawl_log['skipped'].append({'title': result['title'], 'reason': 'low_food_density', 'food_density': round(density, 6)})
            print(f'[{i}/{len(food_pages)}] - Skipped: {result["title"]} (low food density {density:.4f})')
        elif not is_south_asia_relevant(result['title'], result.get('summary', ''), result['text']):
            crawl_log['skipped'].append({'title': result['title'], 'reason': 'out_of_scope_south_asia'})
            print(f'[{i}/{len(food_pages)}] - Skipped: {result["title"]} (out of South Asia scope)')
        else:
            # Passed every filter: keep the full record and log a compact entry.
            seen_urls.add(result['url'])
            corpus.append(result)
            crawl_log['accepted'].append({'title': result['title'], 'url': result['url'], 'num_characters': result['num_characters']})
            print(f'[{i}/{len(food_pages)}] + Accepted: {result["title"]} ({result["num_characters"]} chars)')

    except Exception as e:
        # Best-effort crawl: record the failure for this page and keep going.
        crawl_log['errors'].append({'title': page_title, 'error': str(e)})
        print(f'[{i}/{len(food_pages)}] ! Error: {page_title} -> {e}')

    # Pause between pages (presumably to rate-limit requests to Wikipedia).
    time.sleep(0.3)

print(f'\nAccepted pages: {len(corpus)}')
print(f'Skipped pages : {len(crawl_log["skipped"])}')
print(f'Errors        : {len(crawl_log["errors"])}')


Scraping 701 candidate pages...

[1/701] - Skipped: Georgian cuisine (out of South Asia scope)
[2/701] - Skipped: Acadian cuisine (out of South Asia scope)
[3/701] + Accepted: Acehnese cuisine (4228 chars)
[4/701] - Skipped: Achari paneer tikka (not found)
[5/701] + Accepted: Naan (4825 chars)
[6/701] + Accepted: Afghan cuisine (10900 chars)
[7/701] - Skipped: Afghan cuisine (duplicate URL)
[8/701] + Accepted: African cuisine (24871 chars)
[9/701] - Skipped: Ainu cuisine (out of South Asia scope)
[10/701] - Skipped: Albanian cuisine (out of South Asia scope)
[11/701] - Skipped: Alcoholic beverage (out of South Asia scope)
[12/701] - Skipped: Alcoholic beverage (out of South Asia scope)
[13/701] - Skipped: Algerian cuisine (out of South Asia scope)
[14/701] - Skipped: American Chinese cuisine (out of South Asia scope)
[15/701] - Skipped: American Jewish cuisine (out of South Asia scope)
[16/701] - Skipped: American cuisine (out of South Asia scope)
[17/701] - Skipped: Ancient Egyptian c

## 4. Scrape Each Page
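The scraping loop above fetches the full text of every discovered food page and formats
it with markdown headers for use in chunking. The cell below writes the accepted pages to
`data/raw/south_asian_corpus.json` and the crawl log to `data/raw/crawl_log.json`.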

In [4]:
# Output locations; the directory is created when it does not exist yet.
corpus_path = 'data/raw/south_asian_corpus.json'
log_path = 'data/raw/crawl_log.json'
os.makedirs('data/raw', exist_ok=True)

# Both artefacts use the same JSON settings: UTF-8 file, non-ASCII characters
# written as-is, two-space indentation.
for out_path, payload in ((corpus_path, corpus), (log_path, crawl_log)):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

print(f'Corpus saved to: {corpus_path}')
print(f'Crawl log saved to: {log_path}')
print(f'Total pages: {len(corpus)}')
print(f'Total characters: {sum(p["num_characters"] for p in corpus):,}')


Corpus saved to: data/raw/south_asian_corpus.json
Crawl log saved to: data/raw/crawl_log.json
Total pages: 257
Total characters: 2,711,165


## 5. Save Corpus to JSON
All accepted pages have been saved to `data/raw/south_asian_corpus.json` by the cell above.
This file will be loaded by `chunking.ipynb`. The cell below prints a summary of the
corpus and of the crawl log.

In [5]:
print('=== CORPUS SUMMARY ===')
if corpus:
    # Headline numbers for the accepted pages.
    total_chars = sum(p['num_characters'] for p in corpus)
    print(f'Total pages accepted : {len(corpus)}')
    print(f'Total characters     : {total_chars:,}')
    print(f'Avg page size        : {total_chars // len(corpus):,} chars')
    print(f'Largest page         : {max(corpus, key=lambda x: x["num_characters"])["title"]}')
    print(f'Smallest page        : {min(corpus, key=lambda x: x["num_characters"])["title"]}')

    # First ten accepted pages, in crawl order.
    print('\n=== SAMPLE PAGE TITLES ===')
    for page in corpus[:10]:
        print(f'  - {page["title"]} ({page["num_characters"]:,} chars, density={page.get("food_density", 0):.4f})')

    # Opening 500 characters of the first accepted page.
    print('\n=== SAMPLE TEXT PREVIEW (first page) ===')
    print(corpus[0]['text'][:500])
else:
    print('No pages accepted. Consider lowering thresholds.')

# Outcome counts are printed even when nothing was accepted.
print('\n=== CRAWL LOG SUMMARY ===')
print(f"Accepted entries : {len(crawl_log['accepted'])}")
print(f"Skipped entries  : {len(crawl_log['skipped'])}")
print(f"Error entries    : {len(crawl_log['errors'])}")


=== CORPUS SUMMARY ===
Total pages accepted : 257
Total characters     : 2,711,165
Avg page size        : 10,549 chars
Largest page         : Malaysian cuisine
Smallest page        : Thadal

=== SAMPLE PAGE TITLES ===
  - Acehnese cuisine (4,228 chars, density=0.1291)
  - Naan (4,825 chars, density=0.0863)
  - Afghan cuisine (10,900 chars, density=0.0826)
  - African cuisine (24,871 chars, density=0.0553)
  - Andhra cuisine (26,914 chars, density=0.0786)
  - Bihari cuisine (12,208 chars, density=0.0784)
  - Anglo-Indian cuisine (9,930 chars, density=0.0814)
  - Arunachali cuisine (1,232 chars, density=0.0829)
  - Asian cuisine (8,520 chars, density=0.0802)
  - Assamese cuisine (17,374 chars, density=0.0497)

=== SAMPLE TEXT PREVIEW (first page) ===
# Acehnese cuisine

Acehnese cuisine is the cuisine of the Acehnese people of Aceh in Sumatra, Indonesia. This cuisine is popular and widely known in Indonesia. Arab, Persian, and Indian traders influenced food culture in Aceh although flavo

## 6. Preview the Corpus
Quick look at what was scraped: see the corpus summary, sample titles, and text preview printed above.

## 7. Next Step
The corpus is now saved at `data/raw/south_asian_corpus.json`.

Open `chunking.ipynb` and load this file to start chunking:
```python
with open('data/raw/south_asian_corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)
```