In [None]:
pip install wikipedia-api

In [3]:
import wikipediaapi


# Shared client for the English Wikipedia, used by every cell below.
# Extracts are requested in the library's WIKI format.
# NOTE(review): the user agent is a placeholder — consider a more descriptive
# value (project name + contact) before running large crawls.
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent='MyProjectName',
)

## Get links for each category

In [8]:
from random import shuffle

def url_categorymembers(categorymembers, list_urls=None, list_pages=None, level=0, max_level=1, max_pages=1000):
    """Collect URLs and page objects from a category and its subcategories.

    Parameters
    ----------
    categorymembers : dict
        Mapping whose values are the member pages of a category
        (e.g. ``page.categorymembers``).
    list_urls, list_pages : list, optional
        Accumulators that are appended to in place and returned. A fresh
        list is created for each one that is omitted.
    level : int
        Current recursion depth (0 for the starting category).
    max_level : int
        Subcategories are only explored while ``level < max_level``.
    max_pages : int
        Maximum number of members collected directly from one category.
        The cap applies to each visited category separately; pages added by
        recursive calls are not counted against the parent's cap.

    Returns
    -------
    (list, list)
        ``list_urls`` (the ``fullurl`` of each collected page) and
        ``list_pages`` (the matching page objects), in the same order.
    """
    # Defaults are None rather than []: a list default is created once and
    # shared by every call, which would merge the results of all categories.
    if list_urls is None:
        list_urls = []
    if list_pages is None:
        list_pages = []

    # All members of the category, visited in random order so that the cap
    # below keeps a random subset when the category is larger than max_pages.
    members = list(categorymembers.values())
    shuffle(members)

    collected = 0  # number of pages kept from this category
    for member in members:
        if collected == max_pages:
            break

        # Keep the member unless its title marks it as a portal or a category.
        if not member.title.startswith("Portal") and not member.title.startswith("Category"):
            collected += 1
            list_urls.append(member.fullurl)
            list_pages.append(member)

        # Descend one level into subcategories, sharing the same accumulators.
        if member.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            url_categorymembers(
                member.categorymembers,
                list_urls,
                list_pages,
                level=level + 1,
                max_level=max_level,
                max_pages=max_pages,
            )

    return list_urls, list_pages

In [9]:
# Arts. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_arts = wiki_wiki.page("Category:The arts")
arts, arts_pages = url_categorymembers(cat_arts.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Games. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_games = wiki_wiki.page("Category:Games")
games, games_pages = url_categorymembers(cat_games.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Kids and Teens (not an exact match). Fresh lists are passed explicitly so this
# category's results are never mixed with those collected by another call.
cat_youth = wiki_wiki.page("Category:Youth")
youth, youth_pages = url_categorymembers(cat_youth.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Reference. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_reference = wiki_wiki.page("Category:Reference")
reference, reference_pages = url_categorymembers(cat_reference.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Shopping. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_shopping = wiki_wiki.page("Category:Shopping (activity)")
shopping, shopping_pages = url_categorymembers(cat_shopping.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Business. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_business = wiki_wiki.page("Category:Business")
business, business_pages = url_categorymembers(cat_business.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Health. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_health = wiki_wiki.page("Category:Health")
health, health_pages = url_categorymembers(cat_health.categorymembers, list_urls=[], list_pages=[])

In [None]:
# News. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_news = wiki_wiki.page("Category:News")
news, news_pages = url_categorymembers(cat_news.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Regional (not an exact match). Fresh lists are passed explicitly so this
# category's results are never mixed with those collected by another call.
cat_geography = wiki_wiki.page("Category:Geography")
geography, geography_pages = url_categorymembers(cat_geography.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Society. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_society = wiki_wiki.page("Category:Society")
society, society_pages = url_categorymembers(cat_society.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Computers. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_computers = wiki_wiki.page("Category:Computers")
computers, computers_pages = url_categorymembers(cat_computers.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Home. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_home = wiki_wiki.page("Category:Home")
home, home_pages = url_categorymembers(cat_home.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Recreation. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_recreation = wiki_wiki.page("Category:Recreation")
recreation, recreation_pages = url_categorymembers(cat_recreation.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Science. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_science = wiki_wiki.page("Category:Science")
science, science_pages = url_categorymembers(cat_science.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Sports. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_sports = wiki_wiki.page("Category:Sports")
sports, sports_pages = url_categorymembers(cat_sports.categorymembers, list_urls=[], list_pages=[])

In [None]:
# World. Fresh lists are passed explicitly so this category's results are
# never mixed with those collected by another url_categorymembers call.
cat_world = wiki_wiki.page("Category:World")
world, world_pages = url_categorymembers(cat_world.categorymembers, list_urls=[], list_pages=[])

In [None]:
# Topic label -> (urls, pages) pair collected for that topic, in the order
# the categories were fetched.
topics = dict([
    ("Arts", (arts, arts_pages)),
    ("Games", (games, games_pages)),
    ("Youth", (youth, youth_pages)),
    ("Reference", (reference, reference_pages)),
    ("Shopping", (shopping, shopping_pages)),
    ("Business", (business, business_pages)),
    ("Health", (health, health_pages)),
    ("News", (news, news_pages)),
    ("Geography", (geography, geography_pages)),
    ("Society", (society, society_pages)),
    ("Computers", (computers, computers_pages)),
    ("Home", (home, home_pages)),
    ("Recreation", (recreation, recreation_pages)),
    ("Science", (science, science_pages)),
    ("Sports", (sports, sports_pages)),
    ("World", (world, world_pages)),
])

## Compute PageRank vectors

In [10]:
import numpy as np

# extract links from a Wikipedia page
def page_links(page):
    """Return the URLs of the pages linked from ``page``.

    Parameters
    ----------
    page : object
        Page object exposing ``links``, a mapping whose values have a
        ``fullurl`` attribute.

    Returns
    -------
    list
        The ``fullurl`` of every linked page for which it could be read and
        is not ``None``, in the iteration order of ``page.links``.
    """
    urls = []
    for linked_page in page.links.values():
        try:
            url = linked_page.fullurl
        except Exception:
            # Best effort: skip links whose URL cannot be resolved. Only
            # Exception is caught so Ctrl-C / kernel interrupts still stop
            # a long crawl instead of being silently swallowed.
            continue
        if url is not None:
            urls.append(url)
    return urls

# build the adjacency matrix
def build_adjacency_matrix(urls, pages):
    """Build the link adjacency matrix of a set of pages.

    Parameters
    ----------
    urls : list
        URL of each page; position ``i`` identifies row/column ``i``.
    pages : list
        Page objects aligned with ``urls`` (``pages[i]`` is the page whose
        URL is ``urls[i]``); each is passed to ``page_links``.

    Returns
    -------
    numpy.ndarray
        Square float matrix of shape ``(len(urls), len(urls))`` where entry
        ``[i, j]`` is 1 when page ``i`` links to ``urls[j]`` and 0 otherwise.
        If a URL appears several times in ``urls``, links to it are recorded
        on its first position only.
    """
    num_pages = len(urls)
    adjacency_matrix = np.zeros((num_pages, num_pages))

    # URL -> index of its first occurrence (the index list.index() would
    # return), so each link lookup is O(1) instead of a scan of `urls`.
    first_index = {}
    for position, url in enumerate(urls):
        first_index.setdefault(url, position)

    for i in range(num_pages):
        for link in page_links(pages[i]):
            j = first_index.get(link)
            if j is not None:
                adjacency_matrix[i, j] = 1

    return adjacency_matrix

# compute PageRank scores
def compute_pagerank(adjacency_matrix, damping_factor=0.85):
    """Compute PageRank scores by power iteration.

    Parameters
    ----------
    adjacency_matrix : array-like, shape (n, n)
        Entry ``[i, j]`` is non-zero when page ``i`` links to page ``j``.
    damping_factor : float
        Probability of following a link rather than teleporting to a page
        chosen uniformly at random.

    Returns
    -------
    numpy.ndarray
        Vector of ``n`` scores that sums to 1 (empty when ``n == 0``).

    Notes
    -----
    Each row is normalised by its out-degree so a page splits its rank among
    the pages it links to, and the transposed matrix is used so rank flows
    from the linking page to the linked page. Pages without outgoing links
    spread their rank uniformly over all pages, which keeps the total at 1.
    Iteration stops when the L1 change drops to 1e-5 or after 100 rounds.
    """
    adjacency = np.asarray(adjacency_matrix, dtype=float)
    num_pages = adjacency.shape[0]
    if num_pages == 0:
        return np.zeros(0)

    uniform = np.full(num_pages, 1.0 / num_pages)

    # Row-stochastic transition matrix: row i is the probability of moving
    # from page i to each page. Dangling rows (no links) stay uniform.
    out_degree = adjacency.sum(axis=1)
    has_links = out_degree > 0
    transition = np.full((num_pages, num_pages), 1.0 / num_pages)
    transition[has_links] = adjacency[has_links] / out_degree[has_links, np.newaxis]

    epsilon = 1.0e-5
    max_iterations = 100

    pagerank_scores = uniform.copy()
    for _ in range(max_iterations):
        previous_scores = pagerank_scores
        # Rank received through in-links plus the teleportation share.
        pagerank_scores = (
            damping_factor * transition.T.dot(previous_scores)
            + (1 - damping_factor) * uniform
        )
        if np.sum(np.abs(pagerank_scores - previous_scores)) <= epsilon:
            break

    return pagerank_scores




In [None]:
# Compute PageRank scores for each topic

topic_pagerank_scores = {}
for topic, (urls, pages) in topics.items():
    adjacency_matrix = build_adjacency_matrix(urls, pages)
    pagerank_scores = compute_pagerank(adjacency_matrix)
    topic_pagerank_scores[topic] = pagerank_scores

# Print PageRank scores for each topic
for topic, scores in topic_pagerank_scores.items():
    print(f"Topic: {topic}")
    # topics[topic] is a (urls, pages) tuple: pair the scores with the URL
    # list, not with the tuple itself (which would yield only two rows).
    topic_urls, _ = topics[topic]
    for url, score in zip(topic_urls, scores):
        print(f"{url}: {score}")
    print()

In [12]:
# Display the adjacency matrix built from the Arts URLs and pages.
build_adjacency_matrix(urls=arts, pages=arts_pages)