# Wikipedia API basic test
The purpose of this notebook is to test the capabilities of Wikipedia's API.

## Configuration

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment so that the
# os.getenv("WP_ACCESS_TOKEN") lookups in the cells below can find the token.
load_dotenv()


## TEST1: Calling today's featured article

In [None]:
# Get today's date in YYYY/MM/DD format.
import datetime

today = datetime.datetime.now()
date = today.strftime('%Y/%m/%d')

# Choose your language, and get today's featured content.
import requests

language_code = 'en' # English

# Add the Authorization header only when a token is configured: concatenating
# 'Bearer ' with a missing token (os.getenv returns None) raises a TypeError.
access_token = os.getenv("WP_ACCESS_TOKEN")
headers = {'User-Agent': 'juancreyes201@gmail.com'}
if access_token:
    headers['Authorization'] = 'Bearer ' + access_token

base_url = 'https://api.wikimedia.org/feed/v1/wikipedia/'
url = base_url + language_code + '/featured/' + date
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Fail here with the HTTP status instead of with a KeyError in the parsing cell.
response.raise_for_status()

In [3]:
# Get the featured article's title, URL, extract, and thumbnail.
import json
import pandas as pd

# Parse into a new name instead of rebinding `response`: overwriting it with the
# parsed dict made this cell fail on a second run (a dict has no `.text`).
featured = json.loads(response.text)
tfa = featured['tfa']

display_title = tfa['titles']['display']
desktop_url = tfa['content_urls']['desktop']['page']
extract_html = tfa['extract_html']
# The thumbnail is read defensively so an article without one yields None
# rather than aborting the cell.
thumbnail_url = (tfa.get('thumbnail') or {}).get('source')
data = {
    'display_title': display_title,
    'desktop_url': desktop_url,
    'extract_html': extract_html,
    'thumbnail_url': thumbnail_url
}
# index=[0] is required because every value is a scalar (single-row frame).
df = pd.DataFrame(data, index=[0])
display(df)

Unnamed: 0,display_title,desktop_url,extract_html,thumbnail_url
0,"<span class=""mw-page-title-main"">Tesla Model S...",https://en.wikipedia.org/wiki/Tesla_Model_S,<p>The <b>Tesla Model S</b> is a battery-elect...,https://upload.wikimedia.org/wikipedia/commons...


## TEST2: Obtain a subject using API

### Known subject with article

In [108]:
import requests
language_code = 'en'
# The API expects "Authorization: Bearer <token>". Send the header only when a
# token is configured so an unset variable does not produce a None header value.
access_token = os.getenv("WP_ACCESS_TOKEN")
headers = {'Authorization': 'Bearer ' + access_token} if access_token else {}
search_query = 'Paul Krugman'
number_of_results = 1
base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
endpoint = '/search/page'
url = base_url + language_code + endpoint
parameters = {'q': search_query, 'limit': number_of_results}
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
rs = requests.get(url, headers=headers, params=parameters, timeout=10)
print(rs.text)

{"pages":[{"id":313701,"key":"Paul_Krugman","title":"Paul Krugman","excerpt":"<span class=\"searchmatch\">Paul</span> Robin <span class=\"searchmatch\">Krugman</span> (/ˈkrʊɡmən/ KRUUG-mən; born February 28, 1953) is an American New Keynesian economist who is the Distinguished Professor of Economics","matched_title":null,"anchor":null,"description":"American economist (born 1953)","thumbnail":{"mimetype":"image/jpeg","width":60,"height":80,"duration":null,"url":"//upload.wikimedia.org/wikipedia/commons/thumb/9/9c/P20230814AS-0367_%28cropped%29.jpg/60px-P20230814AS-0367_%28cropped%29.jpg"}}]}


In [109]:

# Get article title, description, and URL from the search results
import json

DEFAULT_DESCRIPTION = 'a Wikipedia article'
DEFAULT_THUMBNAIL_URL = 'https://upload.wikimedia.org/wikipedia/commons/thumb/8/80/Wikipedia-logo-v2.svg/200px-Wikipedia-logo-v2.svg.png'

response = rs.json()

# Reset the outputs first so an empty result set cannot silently reuse values
# left over from an earlier cell.
display_title = None
article_url = None
article_description = DEFAULT_DESCRIPTION
thumbnail_url = DEFAULT_THUMBNAIL_URL

for page in response.get('pages', []):
    display_title = page['title']
    article_url = 'https://' + language_code + '.wikipedia.org/wiki/' + page['key']

    # The search API can return a field with a null value instead of omitting it
    # (e.g. 'thumbnail': None), so test the value rather than relying on a
    # bare `except` to catch whatever the lookup raises.
    article_description = page.get('description') or DEFAULT_DESCRIPTION
    thumbnail = page.get('thumbnail') or {}
    if thumbnail.get('url'):
        # Thumbnail URLs are protocol-relative ("//upload.wikimedia.org/...").
        thumbnail_url = 'https:' + thumbnail['url']
    else:
        thumbnail_url = DEFAULT_THUMBNAIL_URL

# Check that the request succeeded
print(rs.status_code == 200)

# Print the results
print(response)
print(article_url)
print(article_description)
print(display_title)


True
{'pages': [{'id': 313701, 'key': 'Paul_Krugman', 'title': 'Paul Krugman', 'excerpt': '<span class="searchmatch">Paul</span> Robin <span class="searchmatch">Krugman</span> (/ˈkrʊɡmən/ KRUUG-mən; born February 28, 1953) is an American New Keynesian economist who is the Distinguished Professor of Economics', 'matched_title': None, 'anchor': None, 'description': 'American economist (born 1953)', 'thumbnail': {'mimetype': 'image/jpeg', 'width': 60, 'height': 80, 'duration': None, 'url': '//upload.wikimedia.org/wikipedia/commons/thumb/9/9c/P20230814AS-0367_%28cropped%29.jpg/60px-P20230814AS-0367_%28cropped%29.jpg'}}]}
https://en.wikipedia.org/wiki/Paul_Krugman
American economist (born 1953)
Paul Krugman


### Known subject with multiple articles (several people with the same name have articles)

In [81]:
import requests
language_code = 'en'
search_query = 'James Freeman'
# The API expects "Authorization: Bearer <token>". Send the header only when a
# token is configured so an unset variable does not produce a None header value.
access_token = os.getenv("WP_ACCESS_TOKEN")
headers = {'Authorization': 'Bearer ' + access_token} if access_token else {}
number_of_results = 1
base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
endpoint = '/search/page'
url = base_url + language_code + endpoint
parameters = {'q': search_query, 'limit': number_of_results}
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
rs = requests.get(url, headers=headers, params=parameters, timeout=10)

In [82]:
# Get article title, description, and URL from the search results
import json
response = json.loads(rs.text)

# Reset the outputs first so an empty result set cannot silently reuse values
# left over from an earlier cell.
display_title = None
article_url = None
article_description = 'a Wikipedia article'

for page in response.get('pages', []):
    display_title = page['title']
    article_url = 'https://' + language_code + '.wikipedia.org/wiki/' + page['key']
    # A field may be present with a null value, so fall back on the value
    # itself rather than on a caught exception.
    article_description = page.get('description') or 'a Wikipedia article'

print(response)
print(article_url)
# Disambiguation pages carry this fixed description.
res = 'Multiple matches' if article_description=='Topics referred to by the same term' else article_description
print(res)

{'pages': [{'id': 907661, 'key': 'James_Freeman', 'title': 'James Freeman', 'excerpt': '<span class="searchmatch">James</span> <span class="searchmatch">Freeman</span> or Jim <span class="searchmatch">Freeman</span> may refer to: <span class="searchmatch">James</span> <span class="searchmatch">Freeman</span> (clergyman) (1759–1835), American Unitarian clergyman <span class="searchmatch">James</span> <span class="searchmatch">Freeman</span> (conductor), American musical', 'matched_title': None, 'anchor': None, 'description': 'Topics referred to by the same term', 'thumbnail': None}]}
https://en.wikipedia.org/wiki/James_Freeman
Multiple matches


## TEST3: If author does not exist

In [83]:
import requests
language_code = 'en'
search_query = 'Ilana Masad'
# Build the headers in this cell instead of relying on a `headers` variable
# left behind by an earlier cell. The token is sent as "Bearer <token>", and
# the header is omitted entirely when no token is configured.
access_token = os.getenv("WP_ACCESS_TOKEN")
headers = {'Authorization': 'Bearer ' + access_token} if access_token else {}
number_of_results = 1
base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
endpoint = '/search/page'
url = base_url + language_code + endpoint
parameters = {'q': search_query, 'limit': number_of_results}
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
rs = requests.get(url, headers=headers, params=parameters, timeout=10)

In [84]:
# Get article title, description, and URL from the search results
import json

response = json.loads(rs.text)
pages = response.get('pages', [])

# Reset the outputs first so an empty result set cannot silently reuse values
# left over from an earlier cell.
display_title = None
article_url = None
article_description = 'a Wikipedia article'

for page in pages:
    display_title = page['title']
    article_url = 'https://' + language_code + '.wikipedia.org/wiki/' + page['key']
    # A field may be present with a null value, so fall back on the value
    # itself rather than on a caught exception.
    article_description = page.get('description') or 'a Wikipedia article'

print(response)
# Treat the result as a match only when the first word of the query appears in
# the top result's page key. Both sides are lowercased so the check does not
# depend on capitalisation, and an empty result set counts as 'no match'
# instead of raising IndexError.
first_word = search_query.split(' ')[0].lower()
if not pages or first_word not in pages[0]['key'].lower():
    res = 'no match'
else:
    res = article_description
print(article_url)
print(res)


{'pages': [{'id': 47199945, 'key': 'A_Little_Life', 'title': 'A Little Life', 'excerpt': 'the enduring grace of friendship&quot;, he concluded. Similarly, in Bustle, <span class="searchmatch">Ilana</span> <span class="searchmatch">Masad</span> wrote that Yanagihara explored &quot;just what the title implies&quot;, which is', 'matched_title': None, 'anchor': None, 'description': '2015 novel by Hanya Yanagihara', 'thumbnail': None}]}
https://en.wikipedia.org/wiki/A_Little_Life
no match


## Joined model

In [106]:
import os
import requests
from rapidfuzz import process, fuzz

BASE_URL = "https://api.wikimedia.org/core/v1/wikipedia"
# Send "Authorization: Bearer <token>" only when a token is configured. An empty
# or scheme-less header value is not a valid bearer credential.
_WP_ACCESS_TOKEN = os.getenv("WP_ACCESS_TOKEN", "")
HEADERS = {"Authorization": f"Bearer {_WP_ACCESS_TOKEN}"} if _WP_ACCESS_TOKEN else {}
TIMEOUT = 5  # seconds per request

def search_description(query: str, lang: str = "en", min_score: float = 50) -> str:
    """Resolve a name to a Wikipedia article URL using the search endpoint.

    Requests up to 10 search results for `query`, fuzzy-matches the normalized
    query against the normalized page keys with ``fuzz.ratio`` and returns the
    URL of the best-scoring page.

    Args:
        query: Name to look up.
        lang: Language code used in the API path and in the article URL.
        min_score: Lowest ``fuzz.ratio`` score accepted as a match. Defaults
            to 50, the value that was previously hard-coded.

    Returns:
        The article URL of the best match, or one of the sentinel strings
        "NO_RESULTS" (the search returned no pages), "MULTIPLE_MATCHES" (the
        first result is described as "Topics referred to by the same term")
        or "NO_MATCH" (the best score is below `min_score`).

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status.
    """
    def _normalize(text: str) -> str:
        # Same rule for the query and the candidates: lowercase, spaces to
        # underscores, dots removed (initials and suffixes such as "R." / "Jr.").
        return text.lower().replace(" ", "_").replace(".", "")

    url = f"{BASE_URL}/{lang}/search/page"
    params = {"q": query, "limit": 10}
    rs = requests.get(url, headers=HEADERS, params=params, timeout=TIMEOUT)
    # Surface HTTP errors instead of reporting an error payload as "NO_RESULTS".
    rs.raise_for_status()

    pages = rs.json().get("pages", [])
    if not pages:
        return "NO_RESULTS"

    # Disambiguation is detected on the first result only.
    if pages[0].get("description") == "Topics referred to by the same term":
        return "MULTIPLE_MATCHES"

    # Candidate list built from all returned pages, kept index-aligned with
    # `pages` so the winning index can be mapped back.
    choices = [_normalize(p.get("key", "")) for p in pages]

    _best, score, idx = process.extractOne(
        _normalize(query),
        choices,
        scorer=fuzz.ratio
    )

    if score < min_score:
        return "NO_MATCH"

    return f"https://{lang}.wikipedia.org/wiki/{pages[idx]['key']}"


if __name__ == "__main__":
    # Smoke test: one known article, one disambiguation page, one name without
    # an article, and one name written differently from its page key.
    queries = ["Paul Krugman", "James Freeman", "Ilana Masad", "Joseph R. Biden Jr"]
    for q in queries:
        result = search_description(q)
        print("{!r}: {}".format(q, result))


'Paul Krugman': https://en.wikipedia.org/wiki/Paul_Krugman
'James Freeman': MULTIPLE_MATCHES
'Ilana Masad': NO_MATCH
'Joseph R. Biden Jr': https://en.wikipedia.org/wiki/Joe_Biden


Precision and recall of the Wikipedia page-key match at each fuzzy-score threshold <br>

|Threshold | Precision  | Recall|
|----------|------------|-------|
|       10 |     91.00% |100.00%|
|       15 |     91.00% |100.00%|
|       20 |     91.00% |100.00%|
|       25 |     91.00% |100.00%|
|       30 |     91.00% |100.00%|
|       35 |     91.00% |100.00%|
|       40 |     91.00% |100.00%|
|       45 |     91.00% |100.00%|
|       50 |     91.92% | 98.91%|
|       55 |     91.84% | 97.83%|
|       60 |     91.84% | 97.83%|
|       65 |     91.84% | 97.83%|
|       70 |     91.75% | 96.74%|
|       75 |     91.75% | 96.74%|
|       80 |     91.75% | 96.74%|
|       85 |     91.49% | 93.48%|
|       90 |     91.49% | 93.48%|
|       95 |     90.36% | 81.52%|
|      100 |     90.24% | 80.43%|

- Precision is the fraction of the names the script matched that were matched to the correct article (it drops when the script returns a match that is not the right one). <br>
- Recall is the fraction of the names that should have been matched that the script actually matched (it drops when the script fails to match someone who does have an article). <br> <br>
A threshold of 50 seems to be the sweet spot for precision/recall.

In [105]:
#!/usr/bin/env python3
"""
Evaluate fuzzy-match thresholds for Wikipedia name matching.

Fill in `names` and `urls` with your gold-standard test set,
or provide a CSV with columns `name` and `url`.
Handles HTTP timeouts and retries automatically, and caches API responses to avoid rate limits.
"""
import os
import sys
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from rapidfuzz import process, fuzz

# === Configuration ===
WP_TOKEN = os.getenv("WP_ACCESS_TOKEN", "")  # Wikimedia API token if available
BASE_URL = "https://api.wikimedia.org/core/v1/wikipedia"
LANG = "en"
REQUEST_TIMEOUT = 5  # seconds per request
RETRY_TOTAL = 3      # retry attempts per request

# === HTTP Session with Retries ===
session = requests.Session()
retries = Retry(
    total=RETRY_TOTAL,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"]
)
adapter = HTTPAdapter(max_retries=retries)
session.mount("https://", adapter)
session.mount("http://", adapter)


def get_candidates(name: str, lang: str = LANG, limit: int = 10) -> list[str]:
    """Fetch up to `limit` page keys from the Wikipedia search endpoint.

    Args:
        name: Search query (a person's name).
        lang: Language code used in the API path.
        limit: Maximum number of search results requested.

    Returns:
        The non-empty page keys of the results, in the order returned. Returns
        an empty list when the request fails or the body is not valid JSON;
        a warning is written to stderr in that case.
    """
    # Send the token as "Bearer <token>" and omit the header when no token is set.
    headers = {"Authorization": f"Bearer {WP_TOKEN}"} if WP_TOKEN else {}
    params = {"q": name, "limit": limit}
    url = f"{BASE_URL}/{lang}/search/page"
    try:
        r = session.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        # Decode inside the try so a malformed body also yields [] as promised;
        # ValueError covers JSON decode errors.
        data = r.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Warning: failed to fetch '{name}': {e}", file=sys.stderr)
        return []
    return [p["key"] for p in data.get("pages", []) if p.get("key")]


def normalize(s: str) -> str:
    """Return `s` lowercased, with each space turned into "_" and each "." dropped."""
    lowered = s.lower()
    # Single pass: map " " to "_" and delete ".".
    return lowered.translate(str.maketrans({" ": "_", ".": None}))


def evaluate_thresholds(
    names: list[str],
    urls: list[str],
    thresholds: list[int] = None
) -> list[tuple[int, float, float]]:
    """Compute precision & recall at each fuzzy-score threshold.

    For every name the best fuzzy match among its search candidates is found
    once; each threshold then only decides whether that match is accepted.

    Args:
        names: Names to look up.
        urls: Ground-truth article URLs, aligned with `names`; the last path
            segment of each URL is taken as the expected page key.
        thresholds: Score thresholds to evaluate. ``None`` means 10, 15, ..., 100.

    Returns:
        One ``(threshold, precision, recall)`` tuple per threshold, where
        precision = tp / (tp + fp) and recall = tp / (tp + fn), each 0.0 when
        its denominator is zero. A name counts as a false negative when it has
        no candidates or its best score is below the threshold.

    Raises:
        ValueError: If `names` and `urls` differ in length.
    """
    from urllib.parse import unquote

    if thresholds is None:
        thresholds = list(range(10, 101, 5))
    if len(names) != len(urls):
        # zip() would silently drop the unmatched tail and skew the metrics.
        raise ValueError(
            f"names and urls must have the same length ({len(names)} != {len(urls)})"
        )

    # Ground-truth page keys extracted from URLs. Percent-escapes are decoded
    # because the keys returned by the search API are not URL-encoded.
    correct_keys = [unquote(url.rstrip("/\n").split("/")[-1]) for url in urls]

    # 1) Fetch candidates once per distinct name to avoid repeated API calls.
    cache: dict[str, list[str]] = {}
    for name in names:
        if name in cache:
            continue
        cache[name] = get_candidates(name)
        time.sleep(0.2)  # throttle between calls

    # 2) Best match per name. It does not depend on the threshold, so compute
    #    (and log) it once instead of once per threshold.
    best_matches = []  # (predicted key, score), or None when there are no candidates
    for name in names:
        candidates = cache.get(name, [])
        if not candidates:
            best_matches.append(None)
            continue

        norm_cands = [normalize(c) for c in candidates]
        best, score, idx = process.extractOne(
            normalize(name),
            norm_cands,
            scorer=fuzz.ratio
        )
        print(f"Query: {name} | Best match: {best} | Score: {score}")
        best_matches.append((candidates[idx], score))

    # 3) Score every threshold against the precomputed matches.
    results: list[tuple[int, float, float]] = []
    for t in thresholds:
        tp = fp = fn = 0
        for match, correct in zip(best_matches, correct_keys):
            if match is None or match[1] < t:
                fn += 1
            elif match[0] == correct:
                tp += 1
            else:
                fp += 1

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall    = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        results.append((t, precision, recall))

    return results


# pandas is not among this script's imports. Import it here so the cell does not
# depend on an earlier cell having defined `pd`.
import pandas as pd

df = pd.read_csv('../data/test/test_wikipedia_normalized.csv')
names = df['author_name'].astype(str).tolist()
urls  = df['urls'].astype(str).tolist()

res = evaluate_thresholds(names, urls)
# One row per threshold, with precision and recall shown as percentages.
print("Threshold | Precision | Recall")
for t, p, r in res:
    print(f"{t:>9} | {p*100:>9.2f}% | {r*100:>7.2f}%")


Query: Harry Litman | Best match: harry_litman | Score: 100.0
Query: Jamelle Bouie | Best match: jamelle_bouie | Score: 100.0
Query: Samuel Moyn | Best match: samuel_moyn | Score: 100.0
Query: Michelle Goldberg | Best match: michelle_goldberg | Score: 100.0
Query: Peter Singer | Best match: peter_singer | Score: 100.0
Query: Ross Douthat | Best match: ross_douthat | Score: 100.0
Query: Barack Obama | Best match: barack_obama | Score: 100.0
Query: Cathy Young | Best match: cathy_young | Score: 100.0
Query: Paul Krugman | Best match: paul_krugman | Score: 100.0
Query: Oona A. Hathaway | Best match: oona_a_hathaway | Score: 100.0
Query: Earl Ofari Hutchinson | Best match: earl_ofari_hutchinson | Score: 100.0
Query: Schuyler Bailar | Best match: schuyler_bailar | Score: 100.0
Query: M Gessen | Best match: masha_gessen | Score: 80.0
Query: Joseph R. Biden Jr | Best match: joe_biden | Score: 69.23076923076923
Query: Matt Bai | Best match: matt_bai | Score: 100.0
Query: Bina Venkataraman | Be

## TEST5: Getting the page

In [115]:
import requests
language_code = 'en'
# The API expects "Authorization: Bearer <token>". Send the header only when a
# token is configured so an unset variable does not produce a None header value.
access_token = os.getenv("WP_ACCESS_TOKEN")
headers = {'Authorization': 'Bearer ' + access_token} if access_token else {}
search_query = 'Paul Krugman'
number_of_results = 1
base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
endpoint = '/search/page'
url = base_url + language_code + endpoint
parameters = {'q': search_query, 'limit': number_of_results}
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
rs = requests.get(url, headers=headers, params=parameters, timeout=10)
result = rs.json()
print(result)


{'pages': [{'id': 313701, 'key': 'Paul_Krugman', 'title': 'Paul Krugman', 'excerpt': '<span class="searchmatch">Paul</span> Robin <span class="searchmatch">Krugman</span> (/ˈkrʊɡmən/ KRUUG-mən; born February 28, 1953) is an American New Keynesian economist who is the Distinguished Professor of Economics', 'matched_title': None, 'anchor': None, 'description': 'American economist (born 1953)', 'thumbnail': {'mimetype': 'image/jpeg', 'width': 60, 'height': 80, 'duration': None, 'url': '//upload.wikimedia.org/wikipedia/commons/thumb/9/9c/P20230814AS-0367_%28cropped%29.jpg/60px-P20230814AS-0367_%28cropped%29.jpg'}}]}


In [118]:
import requests
from urllib.parse import quote

page = result['pages'][0]['key']
# Percent-encode the key (including "/") before placing it in the path: a key
# containing "/" or "?" would otherwise change the route or be cut off as a
# query string.
url = 'https://api.wikimedia.org/core/v1/wikipedia/en/page/' + quote(page, safe='') + '/html'

# TODO: replace the User-Agent placeholder with a real application name and contact.
headers = {
  'User-Agent': 'YOUR_APP_NAME (YOUR_EMAIL_OR_CONTACT_PAGE)'
}
# Send the token as "Bearer <token>" and omit the header when no token is set.
access_token = os.getenv("WP_ACCESS_TOKEN")
if access_token:
    headers['Authorization'] = 'Bearer ' + access_token

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Stop here on an HTTP error instead of printing an error page as the article.
response.raise_for_status()
data = response.text
# `data` holds the full HTML; print only a preview to keep the output readable.
print(data[:1000])

<!DOCTYPE html>
<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/" about="https://en.wikipedia.org/wiki/Special:Redirect/revision/1289836007"><head prefix="mwr: https://en.wikipedia.org/wiki/Special:Redirect/"><meta charset="utf-8"/><meta property="mw:pageId" content="313701"/><meta property="mw:pageNamespace" content="0"/><link rel="dc:replaces" resource="mwr:revision/1288755398"/><meta property="mw:revisionSHA1" content="60c05902189954ff07fc1409bf02ba16ebd86309"/><meta property="dc:modified" content="2025-05-11T04:30:30.000Z"/><meta property="mw:htmlVersion" content="2.8.0"/><meta property="mw:html:version" content="2.8.0"/><link rel="dc:isVersionOf" href="//en.wikipedia.org/wiki/Paul_Krugman"/><base href="//en.wikipedia.org/wiki/"/><title>Paul Krugman</title><meta property="mw:generalModules" content="ext.phonos.init|ext.cite.ux-enhancements|ext.categoryTree|mediawiki.page.media|ext.tmh.player"/><meta property="mw:moduleStyles" content="ext.phonos.styles|ext.