In [1]:
# ==============================================================================
# Cell 1: Import Libraries
# ==============================================================================
# Standard library
import os

# Third-party
import pandas as pd
import wikipediaapi

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# ==============================================================================
# Cell 2: Configuration - SET YOUR TOPIC HERE
# ==============================================================================
# --- CONFIGURATION ---
# To analyze a different topic, change the variable below.
# Note: Use the exact title from the Wikipedia URL (e.g., 'Artificial_intelligence', 'Data_science').
SEED_TOPIC = 'Artificial_intelligence'


def safe_file_stem(title):
    """Return `title` with path separators replaced so it is usable in a file name.

    Some Wikipedia titles contain a forward slash (e.g. 'AC/DC'). Embedded
    verbatim in the output file name, that slash is interpreted as a directory
    separator, so the resulting path points into a sub-folder that is never
    created. Backslashes are replaced as well because they act as separators
    on Windows.

    Args:
        title: Wikipedia page title in the form used for SEED_TOPIC.

    Returns:
        The title with every forward slash and backslash replaced by an
        underscore. Titles without those characters are returned unchanged.
    """
    return title.replace('/', '_').replace('\\', '_')


# Define the path to save our data file
OUTPUT_FOLDER = '../data'
# Only the file name is sanitised; SEED_TOPIC itself is left untouched.
FILE_NAME = f'{safe_file_stem(SEED_TOPIC)}_edges.csv'
OUTPUT_PATH = os.path.join(OUTPUT_FOLDER, FILE_NAME)

# Create the data directory if it doesn't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


print(f"Seed Topic: {SEED_TOPIC}")
print(f"Data will be saved to: {OUTPUT_PATH}")

Seed Topic: Artificial_intelligence
Data will be saved to: ../data/Artificial_intelligence_edges.csv


In [3]:
# ==============================================================================
# Cell 3: Initialize the Wikipedia API
# ==============================================================================
# Client settings: the English-language edition, plus a custom user agent so
# the script identifies itself to Wikipedia's servers.
client_settings = {
    'language': 'en',
    'user_agent': 'MyNetworkAnalysisProject/1.0 (pathmohd123@gmail.com)',
}
wiki_api = wikipediaapi.Wikipedia(**client_settings)

print("\nWikipedia API client initialized.")


Wikipedia API client initialized.


In [4]:
# ==============================================================================
# Cell 4: Fetch Links from the Seed Article
# ==============================================================================
print(f"\nFetching Wikipedia page for '{SEED_TOPIC}'...")

# Look up the seed article. `edge_list` starts out empty so that it is defined
# even when the article turns out not to exist.
page = wiki_api.page(SEED_TOPIC)
edge_list = []

if page.exists():
    print(f"Page found: '{page.title}'")

    # Links of the seed article, keyed by the linked page's title.
    links = page.links

    # One (source, target) pair per linked title. Every link is kept.
    # NOTE(review): a plain `":" not in title` filter for special pages would
    # also drop ordinary articles such as '2001: A Space Odyssey'.
    # NOTE(review): the source is recorded as SEED_TOPIC (URL form, with
    # underscores) while the page's own title uses spaces; confirm that
    # downstream graph code treats both spellings as the same node.
    edge_list.extend((SEED_TOPIC, target_title) for target_title in links)

    print(f"Successfully collected {len(edge_list)} links.")
else:
    print(f"Error: The page '{SEED_TOPIC}' does not exist on English Wikipedia.")


Fetching Wikipedia page for 'Artificial_intelligence'...
Page found: 'Artificial intelligence'
Successfully collected 1881 links.


In [5]:
# ==============================================================================
# Cell 5: Save the Data to a CSV File
# ==============================================================================
# Build the edge table unconditionally so that `df_edges` always mirrors the
# current `edge_list`. If it were only assigned when links were found, an empty
# result would leave `df_edges` undefined or still holding the table from an
# earlier run.
df_edges = pd.DataFrame(edge_list, columns=['source', 'target'])

if df_edges.empty:
    # Nothing to write: the CSV is deliberately not created or overwritten.
    print("\nNo links were found, so no data was saved.")
else:
    # Save the DataFrame to a CSV file (without the row index column)
    df_edges.to_csv(OUTPUT_PATH, index=False)

    print(f"\nData saved successfully to {OUTPUT_PATH}")

    # Display the first 5 rows of the created file
    print("\n--- First 5 links ---")
    print(df_edges.head())


Data saved successfully to ../data/Artificial_intelligence_edges.csv

--- First 5 links ---
                    source                         target
0  Artificial_intelligence                          15.ai
1  Artificial_intelligence          2001: A Space Odyssey
2  Artificial_intelligence  2001: A Space Odyssey (novel)
3  Artificial_intelligence   2024 Indian general election
4  Artificial_intelligence        3D optical data storage
