In [1]:
# ==============================================================================
# Cell 1: Import Libraries
# ==============================================================================
import wikipediaapi
import pandas as pd
import os
import time

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# ==============================================================================
# Cell 2: Configuration - SET YOUR TOPIC HERE
# ==============================================================================
# --- CONFIGURATION ---
# To analyze a different topic, change the variable below.
# Note: Use the exact title from the Wikipedia URL (e.g. 'Data_science').
SEED_TOPIC = 'Renaissance'

# --- CRAWL PARAMETERS ---
# DEPTH: How many layers to crawl.
#   - Depth 1: Just the links on the SEED_TOPIC page.
#   - Depth 2: The SEED_TOPIC, its links, AND all of their links.
CRAWL_DEPTH = 2

# LIMIT: To keep the crawl fast, we can limit how many pages we analyze at the first level.
# Set to None to crawl all links (can be very slow!).
# A limit of 100 is a good starting point.
PAGE_LIMIT = 100

# Define where the output file will be saved
OUTPUT_FOLDER = '../data'
FILE_NAME = f'{SEED_TOPIC}_edges_depth_{CRAWL_DEPTH}.csv'
OUTPUT_PATH = os.path.join(OUTPUT_FOLDER, FILE_NAME)

# Create the output folder if it doesn't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

print("Configuration set.")
print(f"Seed Topic: {SEED_TOPIC}")
print(f"Crawl Depth: {CRAWL_DEPTH}")
print(f"Page Limit (at depth 1): {PAGE_LIMIT}")
print(f"Output will be saved to: {OUTPUT_PATH}")

# Define the path to save our data file
OUTPUT_FOLDER = '../data'
FILE_NAME = f'{SEED_TOPIC}_edges.csv'
OUTPUT_PATH = os.path.join(OUTPUT_FOLDER, FILE_NAME)

# Create the data directory if it doesn't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


print(f"Seed Topic: {SEED_TOPIC}")
print(f"Data will be saved to: {OUTPUT_PATH}")

Configuration set.
Seed Topic: Renaissance
Crawl Depth: 2
Page Limit (at depth 1): 100
Output will be saved to: ../data/Renaissance_edges_depth_2.csv
Seed Topic: Renaissance
Data will be saved to: ../data/Renaissance_edges.csv


In [3]:
# ==============================================================================
# Cell 3: Initialize the Wikipedia API
# ==============================================================================
# Initialize the Wikipedia API with a custom user agent
# This is a good practice to identify your script to Wikipedia's servers.
wiki_api = wikipediaapi.Wikipedia(
    language='en',
    user_agent='MyNetworkAnalysisProject/1.0 (pathmohd123@gmail.com)'
)

print("\nWikipedia API client initialized.")


Wikipedia API client initialized.


In [4]:
# ==============================================================================
# Cell 4: Perform Deep Data Crawl
# ==============================================================================
print(f"\n--- Starting Deep Crawl for '{SEED_TOPIC}' ---")

# Use a set to store edges to avoid duplicates
edges = set()

# Fetch the starting page
seed_page = wiki_api.page(SEED_TOPIC)

if not seed_page.exists():
    print(f"Error: Seed page '{SEED_TOPIC}' does not exist.")
else:
    print(f"Fetching links from Level 0 page: {seed_page.title}")
    
    # Get the first level of links
    level_one_links = list(seed_page.links.values())
    
    # Apply the limit if it's set
    if PAGE_LIMIT is not None:
        print(f"Applying limit: processing first {PAGE_LIMIT} links out of {len(level_one_links)}.")
        level_one_links = level_one_links[:PAGE_LIMIT]

    # Process level one links (pages linked directly from the seed topic)
    for i, linked_page in enumerate(level_one_links):
        # Add the edge from the seed page to this linked page
        if linked_page.namespace == 0: # Ensure it's a standard article
            edges.add((seed_page.title, linked_page.title))
        
        # If depth is 2, go deeper and get links from this page
        if CRAWL_DEPTH >= 2:
            print(f"  [{i+1}/{len(level_one_links)}] Fetching links from Level 1 page: {linked_page.title}...")
            try:
                # Add a small delay to be polite to Wikipedia's servers
                time.sleep(0.01)
                
                # Get all links from the current linked_page
                level_two_links = linked_page.links
                for sub_link_title in level_two_links:
                    if level_two_links[sub_link_title].namespace == 0:
                         edges.add((linked_page.title, sub_link_title))
            except Exception as e:
                print(f"    Could not process page {linked_page.title}. Error: {e}")

    print(f"\n✅ Crawl complete. Found {len(edges)} unique edges.")


--- Starting Deep Crawl for 'Renaissance' ---
Fetching links from Level 0 page: Renaissance
Applying limit: processing first 100 links out of 2098.
  [1/100] Fetching links from Level 1 page: "Polish death camp" controversy...
  [2/100] Fetching links from Level 1 page: 1948 Palestine war...
  [3/100] Fetching links from Level 1 page: 2,500-year celebration of the Persian Empire...
  [4/100] Fetching links from Level 1 page: 2022 Russian invasion of Ukraine...
  [5/100] Fetching links from Level 1 page: 20th-century Western painting...
  [6/100] Fetching links from Level 1 page: A. L. Rowse...
  [7/100] Fetching links from Level 1 page: ABCANZ Armies...
  [8/100] Fetching links from Level 1 page: ANZUK...
  [9/100] Fetching links from Level 1 page: ANZUS...
  [10/100] Fetching links from Level 1 page: AUKUS...
  [11/100] Fetching links from Level 1 page: AUSCANNZUKUS...
  [12/100] Fetching links from Level 1 page: Abel Tasman...
  [13/100] Fetching links from Level 1 page: Abolitionis

In [5]:
# ==============================================================================
# Cell 5: Save the Data to a CSV File
# ==============================================================================
if edges:
    # Convert the set of edges to a list of dictionaries for the DataFrame
    edge_list = [{'source': source, 'target': target} for source, target in edges]
    
    # Create a pandas DataFrame
    df_edges = pd.DataFrame(edge_list)
    
    # Save the DataFrame to a CSV file
    df_edges.to_csv(OUTPUT_PATH, index=False)
    
    print(f"\nData saved successfully to {OUTPUT_PATH}")
    print("\n--- First 5 links ---")
    print(df_edges.head())
else:
    print("\nNo edges were collected. Nothing to save.")


Data saved successfully to ../data/Renaissance_edges.csv

--- First 5 links ---
                     source                            target
0  Ancient Roman philosophy                 Korean philosophy
1  Ancient Greek philosophy  History of philosophy in Finland
2                Al-Andalus                             Jizya
3                Al-Andalus   Dictablanda of Dámaso Berenguer
4              Afrofuturism               Compton Crook Award


In [6]:
print(df_edges.shape)

(67051, 2)
