# Scraping Countries from Wikipedia HTML

This notebook extracts a list of countries from the saved Wikipedia HTML file and saves it as a CSV.

In [None]:
# Import required libraries
import pandas as pd
from bs4 import BeautifulSoup
import re

In [None]:
# Read the saved Wikipedia page from disk as UTF-8 text
with open('List_of_countries.html', mode='r', encoding='utf-8') as html_file:
    html_content = html_file.read()

# Build the parse tree with Python's built-in HTML parser backend
soup = BeautifulSoup(html_content, features='html.parser')
print("HTML file loaded and parsed successfully!")

In [None]:
# Collect candidate country names from the link text of the page sections.

# Link labels that come from the wiki's UI rather than from a country entry.
_NON_COUNTRY_LABELS = ('change', 'edit')


def _wiki_link_texts(section):
    """Yield the stripped text of each article link found inside *section*.

    Only ``<a>`` tags carrying both ``href`` and ``title`` attributes are
    considered. A link is skipped when its href does not contain ``/wiki/``,
    when its href contains ``action=edit``, when its text is empty after
    stripping, or when its text is one of ``_NON_COUNTRY_LABELS``.
    """
    for anchor in section.find_all('a', href=True, title=True):
        href = anchor['href']
        if '/wiki/' not in href or 'action=edit' in href:
            continue
        label = anchor.get_text().strip()
        if label and label not in _NON_COUNTRY_LABELS:
            yield label


countries = []

# First pass: the section with class 'mf-section-1'. Every accepted label is
# kept here (no length filter, repeats allowed); repeats are dropped below.
countries_section = soup.find('section', {'class': 'mf-section-1'})
if countries_section:
    countries.extend(_wiki_link_texts(countries_section))

# Second pass: every section whose class matches 'mf-section'. A set mirrors
# the list so the "already collected?" check is O(1) instead of a list scan.
seen = set(countries)
all_sections = soup.find_all('section', class_=re.compile('mf-section'))

for section in all_sections:
    for label in _wiki_link_texts(section):
        # Labels of one or two characters are filtered out in this pass only.
        if len(label) > 2 and label not in seen:
            countries.append(label)
            seen.add(label)

# Remove duplicates while preserving first-seen order
unique_countries = list(dict.fromkeys(countries))

print(f"Found {len(unique_countries)} unique countries")
print("First 10 countries:", unique_countries[:10])

In [None]:
# Wrap the collected names in a single-column DataFrame
df = pd.DataFrame(unique_countries, columns=['Country'])

# Normalise the row labels, then shift them so numbering starts at 1
df = df.reset_index(drop=True)
df.index += 1

# Preview the top of the table and report its size
print("Countries DataFrame:")
preview = df.head(10)
print(preview)
print(f"\nTotal countries: {len(df)}")

In [None]:
# Write the table to disk; the 1-based index is exported as an 'ID' column
csv_filename = 'countries_list.csv'
df.to_csv(csv_filename, index_label='ID', index=True)

# Confirm the destination file and the number of rows written
print(
    f"Countries list saved to {csv_filename}",
    f"File contains {len(df)} countries",
    sep="\n",
)

In [None]:
# Read the CSV back and show its first 15 and last 5 rows
print("Sample of saved data:")
sample_df = pd.read_csv(csv_filename)
print(sample_df.head(15), "...", sample_df.tail(5), sep="\n")