# Wikipedia API basic test
The purpose of this notebook is to test wikipedia's API capabilities.

## Configuration

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()


## TEST1: Calling today's featured article

In [None]:
# Get today's date in YYYY/MM/DD format.
import datetime

today = datetime.datetime.now()
date = today.strftime('%Y/%m/%d')

# Choose your language, and get today's featured content.
import requests

language_code = 'en' # English
headers = {
    'Authorization': 'Bearer ' + os.getenv("WP_ACCESS_TOKEN"),
    'User-Agent': 'juancreyes201@gmail.com'
}

base_url = 'https://api.wikimedia.org/feed/v1/wikipedia/'
url = base_url + language_code + '/featured/' + date
response = requests.get(url, headers=headers)

In [3]:
# Get the featured article's title, URL, extract, and thumbnail.
import json
import pandas as pd
response = json.loads(response.text)

display_title = response['tfa']['titles']['display']
desktop_url = response['tfa']['content_urls']['desktop']['page']
extract_html = response['tfa']['extract_html']
thumbnail_url = response['tfa']['thumbnail']['source']
data = {
    'display_title': display_title,
    'desktop_url': desktop_url,
    'extract_html': extract_html,
    'thumbnail_url': thumbnail_url
}
df = pd.DataFrame(data, index=[0])
display(df)

Unnamed: 0,display_title,desktop_url,extract_html,thumbnail_url
0,"<span class=""mw-page-title-main"">Tesla Model S...",https://en.wikipedia.org/wiki/Tesla_Model_S,<p>The <b>Tesla Model S</b> is a battery-elect...,https://upload.wikimedia.org/wikipedia/commons...


## TEST2: Obtain a subject using API

### Known subject with article

In [72]:
import requests
language_code = 'en'
headers = {
    'Authorization': os.getenv("WP_ACCESS_TOKEN"),
}
search_query = 'Paul Krugman'
number_of_results = 1
base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
endpoint = '/search/page'
url = base_url + language_code + endpoint
parameters = {'q': search_query, 'limit': number_of_results}
rs = requests.get(url, headers=headers, params=parameters)
print(rs.text)

{"pages":[{"id":313701,"key":"Paul_Krugman","title":"Paul Krugman","excerpt":"<span class=\"searchmatch\">Paul</span> Robin <span class=\"searchmatch\">Krugman</span> (/ˈkrʊɡmən/ KRUUG-mən; born February 28, 1953) is an American New Keynesian economist who is the Distinguished Professor of Economics","matched_title":null,"anchor":null,"description":"American economist (born 1953)","thumbnail":{"mimetype":"image/jpeg","width":60,"height":80,"duration":null,"url":"//upload.wikimedia.org/wikipedia/commons/thumb/9/9c/P20230814AS-0367_%28cropped%29.jpg/60px-P20230814AS-0367_%28cropped%29.jpg"}}]}


In [75]:
# Get article title, description, and URL from the search results
import json

response = rs.json()
for page in response['pages']:
    display_title = page['title']
    article_url = 'https://' + language_code + '.wikipedia.org/wiki/' + page['key']

    try:
        article_description = page['description']
    except:
        article_description = 'a Wikipedia article'
    try:
        thumbnail_url = 'https:' + page['thumbnail']['url']
    except:
        thumbnail_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/8/80/Wikipedia-logo-v2.svg/200px-Wikipedia-logo-v2.svg.png'
#Testing status code id
print(rs.status_code == 200)

# Print the results
print(response)
print(article_url)
print(article_description)


True
{'pages': [{'id': 313701, 'key': 'Paul_Krugman', 'title': 'Paul Krugman', 'excerpt': '<span class="searchmatch">Paul</span> Robin <span class="searchmatch">Krugman</span> (/ˈkrʊɡmən/ KRUUG-mən; born February 28, 1953) is an American New Keynesian economist who is the Distinguished Professor of Economics', 'matched_title': None, 'anchor': None, 'description': 'American economist (born 1953)', 'thumbnail': {'mimetype': 'image/jpeg', 'width': 60, 'height': 80, 'duration': None, 'url': '//upload.wikimedia.org/wikipedia/commons/thumb/9/9c/P20230814AS-0367_%28cropped%29.jpg/60px-P20230814AS-0367_%28cropped%29.jpg'}}]}
https://en.wikipedia.org/wiki/Paul_Krugman
American economist (born 1953)


### Known subject with multiple articles (other people with the same name with articles exist)

In [81]:
import requests
language_code = 'en'
search_query = 'James Freeman'
headers = {
    'Authorization': os.getenv("WP_ACCESS_TOKEN"),
}
number_of_results = 1
base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
endpoint = '/search/page'
url = base_url + language_code + endpoint
parameters = {'q': search_query, 'limit': number_of_results}
rs = requests.get(url, headers=headers, params=parameters)

In [82]:
# Get article title, description, and URL from the search results
import json
response = json.loads(rs.text)



for page in response['pages']:
    display_title = page['title']
    key = page.get("key", "")
    article_url = 'https://' + language_code + '.wikipedia.org/wiki/' + page['key']
    try:
        article_description = page['description']
    except:
        article_description = 'a Wikipedia article'

print(response)
print(article_url)
res = 'Multiple matches' if article_description=='Topics referred to by the same term' else article_description
print(res)

{'pages': [{'id': 907661, 'key': 'James_Freeman', 'title': 'James Freeman', 'excerpt': '<span class="searchmatch">James</span> <span class="searchmatch">Freeman</span> or Jim <span class="searchmatch">Freeman</span> may refer to: <span class="searchmatch">James</span> <span class="searchmatch">Freeman</span> (clergyman) (1759–1835), American Unitarian clergyman <span class="searchmatch">James</span> <span class="searchmatch">Freeman</span> (conductor), American musical', 'matched_title': None, 'anchor': None, 'description': 'Topics referred to by the same term', 'thumbnail': None}]}
https://en.wikipedia.org/wiki/James_Freeman
Multiple matches


### TEST3: If author does not exist

In [83]:
import requests
language_code = 'en'
search_query = 'Ilana Masad'
number_of_results = 1
base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
endpoint = '/search/page'
url = base_url + language_code + endpoint
parameters = {'q': search_query, 'limit': number_of_results}
rs = requests.get(url, headers=headers, params=parameters)

In [84]:
# Get article title, description, and URL from the search results
import json

response = json.loads(rs.text)

for page in response['pages']:
    display_title = page['title']
    article_url = 'https://' + language_code + '.wikipedia.org/wiki/' + page['key']
    try:
        article_description = page['description']
    except:
        article_description = 'a Wikipedia article'

print(response)
res = 'no match' if search_query.split(' ')[0] not in response['pages'][0]['key'] else article_description #must apply normalization
print(article_url)
print(res)


{'pages': [{'id': 47199945, 'key': 'A_Little_Life', 'title': 'A Little Life', 'excerpt': 'the enduring grace of friendship&quot;, he concluded. Similarly, in Bustle, <span class="searchmatch">Ilana</span> <span class="searchmatch">Masad</span> wrote that Yanagihara explored &quot;just what the title implies&quot;, which is', 'matched_title': None, 'anchor': None, 'description': '2015 novel by Hanya Yanagihara', 'thumbnail': None}]}
https://en.wikipedia.org/wiki/A_Little_Life
no match


### Joined model

In [None]:
import os
import requests

BASE_URL = "https://api.wikimedia.org/core/v1/wikipedia"
HEADERS = {
    "Authorization": os.getenv("WP_ACCESS_TOKEN", "")
}
TIMEOUT = 5

def search_description(query: str, lang: str = "en") -> str:
    url = f"{BASE_URL}/{lang}/search/page"
    params = {"q": query, "limit": 1}

    try:
        rs = requests.get(url, headers=HEADERS, params=params, timeout=TIMEOUT)
        rs.raise_for_status()
        data = rs.json()
    except requests.HTTPError:
        return "HTTP error"
    except requests.RequestException:
        return "Network error"
    except ValueError:
        return "Invalid JSON"

    pages = data.get("pages", [])
    if not pages:
        return "no results"

    page = pages[0]
    key = page.get("key", "")
    desc = page.get("description", "")

    # Disambiguation detection
    if desc == "Topics referred to by the same term":
        return "Multiple matches"

    # Exact-match detection
    normalized_key   = key.lower().replace(" ", "_")
    normalized_query = query.lower().replace(" ", "_")
    if normalized_key != normalized_query:
        return "no exact match"

    # Fallback to whatever description we got (or a default)
    return desc or "No description available"


if __name__ == "__main__":
    queries = ["Paul Krugman", "James Freeman", "Ilana Masad"]
    for q in queries:
        result = search_description(q)
        print(f"{q!r}: {result}")


'Paul Krugman ': no exact match
'James Freeman': Multiple matches
'Ilana Masad': no exact match
