## Scraping WikiVoyage

In [9]:
from bs4 import BeautifulSoup
import requests
import json
import re

In [10]:
API_URL = "https://en.wikivoyage.org/w/api.php"

In [11]:
def fetch_page_extract(page_title):
    """
    Fetch the full extract (plain text) of a WikiVoyage page using MediaWiki API
    """
    params = {
        'action': 'query',
        'format': 'json',
        'titles': page_title,
        'prop': 'extracts',
        'explaintext': True,  # Get plain text without HTML
        'exsectionformat': 'wiki'  # Keep section markers
    }
    
    headers = {
        'User-Agent': 'ShivYatra Travel App (Educational Purpose)'
    }
    
    try:
        response = requests.get(API_URL, params=params, headers=headers)
        response.raise_for_status()
        
        data = response.json()
        pages = data['query']['pages']
        
        # Get the first (and only) page
        page_id = list(pages.keys())[0]
        page_data = pages[page_id]
        
        if 'extract' in page_data:
            return {
                'title': page_data['title'],
                'extract': page_data['extract']
            }
        else:
            print("No extract found for the page")
            return None
            
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
    except KeyError as e:
        print(f"Unexpected response format: {e}")
        return None

In [12]:
def parse_sections_from_extract(extract_text):
    """
    Parse sections from the plain text extract using regex to find section headers
    """
    if not extract_text:
        return {}
    
    # Split by section headers (== Section Name == or === Section Name ===)
    section_pattern = r'^=+\s*(.+?)\s*=+$'
    
    sections = {}
    current_section = "Introduction"
    current_content = []
    
    lines = extract_text.split('\n')
    
    for line in lines:
        # Check if this line is a section header
        match = re.match(section_pattern, line.strip())
        
        if match:
            # Save the previous section
            if current_content:
                sections[current_section] = '\n'.join(current_content).strip()
            
            # Start new section - extract and clean the section name
            raw_section_name = match.group(1).strip()
            # Remove any = characters that might be in the section name itself
            clean_section_name = re.sub(r'^=+\s*|\s*=+$', '', raw_section_name).strip()
            current_section = clean_section_name
            current_content = []
        else:
            # Add line to current section (skip empty lines at the start)
            if line.strip() or current_content:
                current_content.append(line)
    
    # Don't forget the last section
    if current_content:
        sections[current_section] = '\n'.join(current_content).strip()
    
    return sections

In [23]:
def scrape_wikivoyage_page(page_title):
    """
    Main function to scrape a WikiVoyage page and return structured data
    """
    print(f"Scraping WikiVoyage page: {page_title}")
    
    # Fetch the page extract
    page_data = fetch_page_extract(page_title)
    
    if not page_data:
        print("Failed to fetch page data")
        return None
    
    print(f"Successfully fetched page: {page_data['title']}")
    print(f"Extract length: {len(page_data['extract'])} characters")
    
    # Parse sections from the extract
    sections = parse_sections_from_extract(page_data['extract'])
    
    print(f"Found {len(sections)} sections")
    print("Sections:", list(sections.keys())[:10], "..." if len(sections) > 10 else "")
    
    # Structure the final data
    structured_data = {
        'title': page_data['title'],
        'source': f"https://en.wikivoyage.org/wiki/{page_title.replace(' ', '_')}",
        'sections': sections,
        'total_sections': len(sections),
        'section_names': list(sections.keys())
    }
    
    return structured_data


def preview_scraped_result(result):
    if result:
        print(f"\nSuccessfully read {result['title']}")
        print(f"Total sections: {result['total_sections']}")
    
        # Show first few sections as preview
        print("\nFirst 5 sections:")
        for i, (section_name, content) in enumerate(list(result['sections'].items())[:5]):
            preview = content[:200] + "..." if len(content) > 200 else content
            print(f"{i+1}. {section_name}: {preview}")
    else:
        print("Failed to preview the scraped page")

    
def scrape_to_json(result):
    if result:
        output_filename = f"{result['title'].lower()}.json"
    
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
    
        print(f"\nData saved to: {output_filename}")
        print(f"File size: {len(json.dumps(result, indent=2))} characters")
    
        # Display some statistics
        # section_lengths = {name: len(content) for name, content in result['sections'].items()}
        # longest_section = max(section_lengths.items(), key=lambda x: x[1])
        # shortest_section = min(section_lengths.items(), key=lambda x: x[1])
    
        # print(f"\nContent Statistics:")
        # print(f"Longest section: '{longest_section[0]}' ({longest_section[1]} chars)")
        # print(f"Shortest section: '{shortest_section[0]}' ({shortest_section[1]} chars)")
        # print(f"Average section length: {sum(section_lengths.values()) // len(section_lengths)} chars")



In [24]:
# Scrape all locations in a list 

def scrape_list(locations):
    for region in locations:
        region_data = scrape_wikivoyage_page(region)
        scrape_to_json(region_data)

In [6]:
# Scrape the India page
IND = "India"

result = scrape_wikivoyage_page(IND)

preview_scraped_result(result)

Scraping WikiVoyage page: India
Successfully fetched page: India
Extract length: 220537 characters
Found 119 sections
Sections: ['Introduction', 'Regions', 'Cities', 'Other destinations', 'Understand', 'History', 'Politics', 'Time zone', 'Geography', 'Climate'] ...

Successfully read India
Total sections: 119

First 5 sections:
1. Introduction: India (Hindi: भारत or Bhārat), the largest country in South Asia, has many of the world's highest mountains, most populated cities, and longest rivers. As one of the great civilisations of the ancient...
2. Regions: India is administratively divided into 28 states and 8 union territories. The states are broadly demarcated on linguistic lines. They vary in size; the larger ones are bigger and more diverse than som...
3. Cities: These are some of India's most notable cities. Other cities can be found under their specific regions.

1 Delhi — the capital of India and the heart of Northern India
2 Bangalore (Bengaluru) — the bea...
4. Other destinati

In [7]:
scrape_to_json(result)


Data saved to: india_wikivoyage.json
File size: 214508 characters

Content Statistics:
Longest section: 'History' (11332 chars)
Shortest section: 'Tourist information' (32 chars)
Average section length: 1740 chars


In [46]:
# Clean section names by removing = characters from section headers
def clean_section_names(sections_dict):
    """
    Clean up section names by removing = characters
    """
    cleaned_sections = {}
    
    for section_name, content in sections_dict.items():
        # Remove = characters from the beginning and end of section names
        clean_name = re.sub(r'^=+\s*|\s*=+$', '', section_name).strip()
        # If the name is now empty, keep the original
        if not clean_name:
            clean_name = section_name
            
        cleaned_sections[clean_name] = content
    
    return cleaned_sections

# Clean the scraped data
if result:
    print("Cleaning section names...")
    
    # Clean the sections
    result['sections'] = clean_section_names(result['sections'])
    result['section_names'] = list(result['sections'].keys())
    
    # Save the final cleaned version
    output_filename_final = f"{PAGE_TITLE.lower()}_wikivoyage_clean.json"
    
    with open(output_filename_final, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    
    print(f"✅ Final clean data saved to: {output_filename_final}")
    print(f"Total sections: {len(result['section_names'])}")
    print(f"All section names cleaned successfully!")

Cleaning section names...
✅ Final clean data saved to: india_wikivoyage_clean.json
Total sections: 119
All section names cleaned successfully!


In [10]:
HIM_NORTH = "Himalayan_North"
him_north = scrape_wikivoyage_page(HIM_NORTH)

preview_scraped_result(him_north)

Scraping WikiVoyage page: Himalayan_North
Successfully fetched page: Himalayan North
Extract length: 8359 characters
Found 14 sections
Sections: ['Introduction', 'States and union territories', 'Cities', 'Other destinations', 'Understand', 'Talk', 'By plane', 'By bus', 'By train', 'See'] ...

Successfully read Himalayan North
Total sections: 14

First 5 sections:
1. Introduction: The Himalayan North region comprises the two states and two union territories of India at the northernmost Himalayan mountains: Himachal Pradesh, Jammu and Kashmir, Ladakh and Uttarakhand. Eastern par...
2. States and union territories: Azad Kashmir and Gilgit-Baltistan are claimed by India to be parts of Jammu and Kashmir and Ladakh respectively, although they are administered by Pakistan, which conversely claims Jammu and Kashmir a...
3. Cities: Here are nine of the most notable cities.

1 Bharmour — famous for Chaurasi Temple and the Hindu pilgrimage of Mani Mahesh
2NOT EXISTING IMAGE Dehradun — the capital

In [11]:
scrape_to_json(him_north)


Data saved to: himalayan north.json
File size: 8715 characters

Content Statistics:
Longest section: 'Other destinations' (1259 chars)
Shortest section: 'Talk' (164 chars)
Average section length: 565 chars


In [12]:
him_north_list = ["Himachal_Pradesh", "Jammu_and_Kashmir", "Ladakh", "Uttarakhand"]

for region in him_north_list:
    region_data = scrape_wikivoyage_page(region)
    scrape_to_json(region_data)

Scraping WikiVoyage page: Himachal_Pradesh
Successfully fetched page: Himachal Pradesh
Extract length: 10724 characters
Found 17 sections
Sections: ['Introduction', 'Cities', 'Other destinations', 'Understand', 'Talk', 'By plane', 'By train', 'By bus', 'Get around', 'By motorcycle'] ...

Data saved to: himachal pradesh.json
File size: 10477 characters

Content Statistics:
Longest section: 'Do' (1860 chars)
Shortest section: 'Go next' (35 chars)
Average section length: 555 chars
Scraping WikiVoyage page: Jammu_and_Kashmir
Successfully fetched page: Jammu and Kashmir
Extract length: 12499 characters
Found 19 sections
Sections: ['Introduction', 'Cities', 'Other destinations', 'History', 'Geography', 'Talk', 'By plane', 'By train', 'By car', 'By bus'] ...

Data saved to: jammu and kashmir.json
File size: 12990 characters

Content Statistics:
Longest section: 'See' (2961 chars)
Shortest section: 'By car' (87 chars)
Average section length: 624 chars
Scraping WikiVoyage page: Ladakh
Successfu

In [6]:
UK = "Uttarakhand"

In [18]:
himachal = ["Shimla", "Dharamsala", "Bilaspur_(Himachal_Pradesh)", "Dalhousie_(India)", "Jogindernagar", "Kullu", "Manali", "Mandi", "Palampur"]

scrape_list(himachal)

Scraping WikiVoyage page: Shimla
Successfully fetched page: Shimla
Extract length: 41815 characters
Found 33 sections
Sections: ['Introduction', 'Understand', 'Tourist information', 'Orientation', 'Climate', 'By bus', 'By train', 'By road', 'By plane', 'Get around'] ...

Data saved to: shimla.json
File size: 41884 characters

Content Statistics:
Longest section: 'Mid-range' (5412 chars)
Shortest section: 'By road' (61 chars)
Average section length: 1202 chars
Scraping WikiVoyage page: Dharamsala
Successfully fetched page: Dharamsala
Extract length: 40451 characters
Found 30 sections
Sections: ['Introduction', 'Understand', 'Climate', 'By plane', 'By bus', 'By train', 'By taxi', 'Orientation', 'By taxi or rickshaw', 'By cable car'] ...

Data saved to: dharamsala.json
File size: 34915 characters

Content Statistics:
Longest section: 'Meeting the Dalai Lama' (3817 chars)
Shortest section: 'By plane' (117 chars)
Average section length: 1092 chars
Scraping WikiVoyage page: Bilaspur_(Himacha

In [None]:
jammu = [ "Jammu", "Srinagar", "Gulmarg", "Katra", "Pahalgam", "Patnitop", "Sonamarg"]

scrape_list(jammu)

Scraping WikiVoyage page: Katra
Successfully fetched page: Katra
Extract length: 5509 characters
Found 10 sections
Sections: ['Introduction', 'By plane', 'By train', 'Get around', 'Vaishno Devi Temple', 'Shiv Khori', 'Eat and drink', 'Sleep', 'Nearby', 'Go next'] 

Data saved to: katra.json
File size: 5948 characters
Scraping WikiVoyage page: Pahalgam
Successfully fetched page: Pahalgam
Extract length: 7189 characters
Found 13 sections
Sections: ['Introduction', 'By plane', 'By train', 'By hired taxi', 'By bus', 'Get around', 'See', 'Do', 'Buy', 'Eat'] ...

Data saved to: pahalgam.json
File size: 7653 characters
Scraping WikiVoyage page: Patnitop
Successfully fetched page: Patnitop
Extract length: 4970 characters
Found 11 sections
Sections: ['Introduction', 'By share taxi', 'By train', 'Get around', 'See', 'Do', 'Buy', 'Eat', 'Drink', 'Sleep'] ...

Data saved to: patnitop.json
File size: 5353 characters
Scraping WikiVoyage page: Sonamarg
Successfully fetched page: Sonamarg
Extract leng

In [26]:
ladakh = ["Leh", "Alchi", "Changthang_Western_Lakes#Q18111277", "Nubra_Valley#Q25247408", "Hanle", "Nubra_Valley#Q24911885", "Nubra_Valley#Q24911885", "Kargil", "Changthang_Western_Lakes#Q25247383", "Lamayuru"]

scrape_list(ladakh)

Scraping WikiVoyage page: Leh
Successfully fetched page: Leh
Extract length: 31967 characters
Found 34 sections
Sections: ['Introduction', 'Understand', 'By plane', 'By bus', 'From Jammu and Kashmir', 'From Himachal Pradesh', 'By road', 'By taxi', 'By thumb', 'By truck'] ...

Data saved to: leh.json
File size: 25350 characters
Scraping WikiVoyage page: Alchi
Successfully fetched page: Alchi
Extract length: 2218 characters
Found 10 sections
Sections: ['Introduction', 'Get in', 'Get around', 'See', 'Do', 'Buy', 'Eat', 'Drink', 'Sleep', 'Go next'] 

Data saved to: alchi.json
File size: 2536 characters
Scraping WikiVoyage page: Changthang_Western_Lakes#Q18111277
Successfully fetched page: Changthang Western Lakes
Extract length: 5764 characters
Found 17 sections
Sections: ['Introduction', 'Cities', 'Other destinations', 'Understand', 'Get in', 'By thumb', 'By tour', 'On foot', 'Fees and permits', 'By road'] ...

Data saved to: changthang western lakes.json
File size: 5784 characters
Scrapi

In [27]:
uttarakhand = ["Dehradun", "Almora", "Haridwar", "Mukteshwar", "Mussoorie", "Nainital", "Pithoragarh", "Rishikesh", "Badrinath", "Chakrata", "Jim_Corbett_National_Park", "Dunagiri", "Gangotri", "Kedarnath", "Munsyari", "Rajaji_National_Park", "Nanda_Devi_National_Park", "Ghangaria"]

scrape_list(uttarakhand)

Scraping WikiVoyage page: Dehradun
Successfully fetched page: Dehradun
Extract length: 25903 characters
Found 26 sections
Sections: ['Introduction', 'Understand', 'Climate', 'By plane', 'By train', 'By bus', 'By car', 'By auto rickshaw', 'By shared (fixed-route) autorickshaws', 'On foot'] ...

Data saved to: dehradun.json
File size: 26143 characters
Scraping WikiVoyage page: Almora
Successfully fetched page: Almora
Extract length: 6879 characters
Found 15 sections
Sections: ['Introduction', 'Understand', 'History', 'Get in', 'By train', 'By road', 'By plane', 'See', 'Do', 'Buy'] ...

Data saved to: almora.json
File size: 7292 characters
Scraping WikiVoyage page: Haridwar
Successfully fetched page: Haridwar
Extract length: 20163 characters
Found 21 sections
Sections: ['Introduction', 'Understand', 'Municipal information', 'Get in', 'By plane', 'By train', 'By bus', 'Get around', 'See', 'Do'] ...

Data saved to: haridwar.json
File size: 21569 characters
Scraping WikiVoyage page: Mukteshw