# Download Wikipedia Dataset

Use this notebook as a template for downloading other datasets.

In [None]:
import wikipediaapi
import pandas as pd
from tqdm import tqdm

# Identification sent with every request. The e-mail address is a
# placeholder: replace it with a real contact address before running.
WIKI_USER_AGENT = "WikipediaScraper/1.0 (replacewithemail@email.com)"
WIKI_LANGUAGE = "en"

# Shared API client used by the scraping helper below
wiki = wikipediaapi.Wikipedia(user_agent=WIKI_USER_AGENT, language=WIKI_LANGUAGE)

def get_category_pages(category_name, max_articles=10):
    """
    Collect article summaries from a Wikipedia category and its subcategories.

    Members are walked depth-first in the order ``categorymembers`` yields
    them: main-namespace pages are recorded, category-namespace pages are
    descended into, and every other namespace is ignored.

    Args:
        category_name: Category title without the ``Category:`` prefix.
        max_articles: Upper bound on the number of articles returned.
            Zero or a negative value yields an empty result.

    Returns:
        dict mapping article title -> ``page.summary`` with at most
        ``max_articles`` entries.
    """
    articles = {}
    if max_articles <= 0:
        return articles

    root_title = f"Category:{category_name}"
    # Titles of categories already walked. Guards against a category that
    # (directly or indirectly) lists itself as a subcategory, which would
    # otherwise recurse forever, and avoids re-walking shared subcategories.
    seen_categories = set()

    def fetch_pages(category_title, category_page):
        """Walk one category; return True once the article limit is reached."""
        if category_title in seen_categories:
            return False
        seen_categories.add(category_title)

        for page_title, page in category_page.categorymembers.items():
            if page.ns == wikipediaapi.Namespace.MAIN:
                if page_title not in articles:
                    articles[page_title] = page.summary
                    if len(articles) >= max_articles:
                        return True
            elif page.ns == wikipediaapi.Namespace.CATEGORY:
                # Propagate the stop signal upwards: a plain `return` inside
                # the nested call would only end that call, and this loop
                # would keep adding articles past the limit.
                if fetch_pages(page_title, page):
                    return True
        return False

    fetch_pages(root_title, wiki.page(root_title))
    return articles

# Categories whose articles go into the dataset (extend this list freely)
categories = [
    "Artificial intelligence",
    "Machine learning",
    "Computer vision",
    "League of Legends",
    "Legends of Runeterra",
    "Ku Lo Sa",
]

# Title -> summary for everything gathered so far. A title returned by more
# than one category keeps the summary from the most recent one.
all_articles = {}

# Pull up to 50 articles from each category, showing a progress bar
for category in tqdm(categories, desc="Downloading Wikipedia Categories"):
    articles = get_category_pages(category, max_articles=50)
    all_articles.update(articles)

# One row per article: title in the first column, summary in the second
df = pd.DataFrame(
    {
        "Article": list(all_articles.keys()),
        "Article Info": list(all_articles.values()),
    }
)

# Persist the dataset without the DataFrame index
df.to_csv("wikipedia_category_articles.csv", index=False)

print("✅ Wikipedia dataset saved as 'wikipedia_category_articles.csv'.")


Downloading Wikipedia Categories: 100%|██████████| 6/6 [00:40<00:00,  6.76s/it]

✅ Wikipedia dataset saved as 'wikipedia_category_articles.csv'.





# Clean Dataset


In [2]:
import pandas as pd
import re

def clean_text(text):
    """Normalise one value to lowercase ASCII text with single spaces.

    Args:
        text: The value to clean. Strings are cleaned directly; missing
            scalars (``None``, ``NaN``, ``pd.NA``) become ``""``; any other
            value (e.g. a number produced by CSV type inference) is
            converted with ``str()`` first.

    Returns:
        str: the lowercased text containing only ASCII letters, digits,
        whitespace and the punctuation ``. , ! ? ' " -``, with runs of
        whitespace collapsed to one space and the ends stripped. Note that
        non-ASCII letters (e.g. accented characters) are removed, not
        transliterated.
    """
    if not isinstance(text, str):
        # Only test scalars for missingness: pd.isna() on a list-like
        # returns an array, whose truth value is ambiguous.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        # Non-string values would otherwise crash on .lower()
        text = str(text)
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9.,!?\'\"\s-]", "", text)  # Keep only alphanumeric, punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

def clean_dataset(csv_path):
    """Clean the text columns of a CSV file and overwrite the same file.

    Args:
        csv_path: Path of a CSV file that has "Article" and "Article Info"
            columns. The file is rewritten in place with both columns
            passed through ``clean_text``.

    Raises:
        KeyError: if either expected column is missing from the file.
    """
    # Read every field as literal text. With pandas' defaults, cells such as
    # "NA", "NaN" or "null" are parsed as missing values (and would then be
    # blanked by the cleaning step), and all-numeric columns are inferred as
    # numbers instead of strings.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    # ✅ Apply cleaning function to both columns
    for column in ("Article", "Article Info"):
        df[column] = df[column].apply(clean_text)

    # ✅ Overwrite the original file
    df.to_csv(csv_path, index=False)
    print(f"✅ Cleaned dataset saved (overwritten) to {csv_path}")

# Example usage: clean the CSV written by the download cell, overwriting it in place
clean_dataset("wikipedia_category_articles.csv")


✅ Cleaned dataset saved (overwritten) to wikipedia_category_articles.csv
