In [2]:
import requests

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on Wikipedia for a given date,
    excluding special pages like 'Special:Search'.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")
    
    data = response.json()
    # Filter out 'Main_Page' and 'Special:' entries
    articles = [
        (article['article'], article['views'])
        for article in data['items'][0]['articles']
        if not article['article'].startswith("Special:") and article['article'] != "Main_Page"
    ]
    return articles[:n]


def get_article_text(title):
    """
    Fetches the full text of a Wikipedia article.
    """
    title = title.replace(" ", "_")  # Replace spaces with underscores for API compatibility
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code == 404:
        return f"No content available for {title}. (404 Error)"
    elif response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")
    
    data = response.json()
    return data.get('extract', 'No content available')

def main():
    n = 1000  # Number of top articles to fetch
    year, month, day = "2024", "05", "01"  # Year, Month, and Day for the date
    
    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    popular_articles = get_most_popular_articles(n, year, month, day)
    
    for idx, (title, views) in enumerate(popular_articles, start=1):
        print(f"\n#{idx}: {title} ({views} views)")
        print("Fetching article text...")
        text = get_article_text(title)
        print(f"Summary: {text[:500]}..." if text else "Summary: No content available")

if __name__ == "__main__":
    main()


Fetching the top 1000 articles on Wikipedia for 2024-05-01...

#1: International_Workers'_Day (553048 views)
Fetching article text...
Summary: International Workers' Day, also known as Labour Day in some countries and often referred to as May Day, is a celebration of labourers and the working classes that is promoted by the international labour movement and occurs every year on 1 May, or the first Monday in May....

#2: Labour_Day (463629 views)
Fetching article text...
Summary: Labour Day is an annual day of celebration of the achievements of workers. It has its origins in the labour union movement, specifically the eight-hour day movement, which advocated eight hours for work, eight hours for recreation, and eight hours for rest....

#3: Indian_Premier_League (350581 views)
Fetching article text...
Summary: The Indian Premier League, also known as IPL and Tata IPL for sponsorship reasons, is a men's T20 cricket league held annually in India. Founded by the BCCI in 2007, the league fe

KeyboardInterrupt: 

In [None]:
import requests

def get_full_article(title):
    """
    Fetches the full content of a Wikipedia article in wikitext format.
    """
    title = title.replace(" ", "_")  # Replace spaces with underscores for API compatibility
    url = f"https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": title
    }
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project info
    }
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")
    
    data = response.json()
    pages = data["query"]["pages"]
    page = next(iter(pages.values()))  # Get the first page
    if "revisions" not in page:
        return f"No content available for {title}."
    
    return page["revisions"][0]["slots"]["main"]["*"]

def main():
    article_title = "Artificial intelligence"
    print(f"Fetching full article for: {article_title}")
    content = get_full_article(article_title)
    print(content[:1000])  # Print the first 1000 characters of the article
    #print(content)  # Print the first 1000 characters of the article


if __name__ == "__main__":
    main()


Most popular 20k articles

In [6]:
import requests
import time

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on Wikipedia for a given date.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")
    
    data = response.json()
    articles = data['items'][0]['articles'][:n]
    return [(article['article'], article['views']) for article in articles if not article['article'].startswith("Special:") and article['article'] != "Main_Page"]


In [30]:
import requests
import time
import json
import os

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on Wikipedia for a given date.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")
    
    data = response.json()
    articles = data['items'][0]['articles'][:n]
    return [(article['article'], article['views']) for article in articles if not article['article'].startswith("Special:") and article['article'] != "Main_Page"]

def fetch_articles_in_batches(titles_batch):
    """
    Fetches up to 50 articles in a single API call.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles_batch)  # Batch up to 50 titles
    }
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        raise Exception(f"Error fetching articles: {response.status_code}")
    return response.json()

def save_to_hdfs(data, file_path):
    """
    Saves the data to a file formatted for HDFS ingestion.
    """
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        for article in data:
            json.dump(article, f, ensure_ascii=False)
            f.write("\n")  # Write each article as a separate JSON line

def main():
    n = 20000  # Number of top articles to fetch
    year, month, day = "2024", "10", "01"
    output_file = "hdfs_data/wikipedia_articles"+year+month+day+".jsonl"
    total_saved = 0  # Track the total number of articles saved
    
    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    popular_articles = get_most_popular_articles(n, year, month, day)
    print(len(popular_articles))
    all_titles = [title for title, views in popular_articles]
    all_titles = correctTitles
    batch_size = 50  # Max 50 titles per batch
    batches = [all_titles[i:i + batch_size] for i in range(0, len(all_titles), batch_size)]
    
    for idx, batch in enumerate(batches, start=1):
        print(f"Fetching batch {idx}/{len(batches)}...")
        try:
            data = fetch_articles_in_batches(batch)
            processed_data = []
            for page_id, page_info in data["query"]["pages"].items():
                title = page_info["title"]
                if "revisions" in page_info:
                    text = page_info["revisions"][0]["slots"]["main"]["*"]
                else:
                    text = "No content available"
                processed_data.append({"title": title, "text": text})
            save_to_hdfs(processed_data, output_file)
            total_saved += len(processed_data)
        except Exception as e:
            print(f"Failed to fetch batch {idx}: {e}")
        time.sleep(1)  # Wait 1 second between batches

    print(f"Total articles saved: {total_saved}")

if __name__ == "__main__":
    main()


Fetching the top 20000 articles on Wikipedia for 2024-10-01...
995
Fetching batch 1/10...
Fetching batch 2/10...
Fetching batch 3/10...


KeyboardInterrupt: 

# Getting revisions

In [6]:
import requests
import time
import json
import os

def get_article_revisions(title, older_than=None):
    """
    Fetches revisions for a given article.
    """
    url = f'https://api.wikimedia.org/core/v1/wikipedia/en/page/{title}/history'
    parameters = {}
    if older_than:
        parameters['older_than'] = older_than
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers, params=parameters)
    if response.status_code != 200:
        raise Exception(f"Error fetching revisions for {title}: {response.status_code}")
    data = response.json()
    return data.get("revisions", [])

def loop_through_revisions(title, from_date=None, olderThanId=None):
    """
    Loops through revisions for an article until a specific timestamp is reached.
    """
    revisions = []
    while True:
        new_revisions = get_article_revisions(title, older_than=olderThanId)
        if not new_revisions:
            break
        revisions.extend(new_revisions)
        if new_revisions[-1]["timestamp"] <= from_date:
            break
        olderThanId = new_revisions[-1]["id"]
    print(f"Gathered {len(revisions)} revisions for article: {title}")
    return revisions

def get_revision_count(title):
    """
    Fetches the total revision count for an article.
    """
    url = f'https://api.wikimedia.org/core/v1/wikipedia/en/page/{title}/history/counts/edits'
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Error fetching revision count for {title}: {response.status_code}")
    print(response.json())
    return response.json()

def save_revisions_to_hdfs(revisions, filename):
    """
    Saves the revisions in an HDFS-compatible format (JSON Lines).
    """
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "a", encoding="utf-8") as f:
        for revision in revisions:
            json.dump(revision, f, ensure_ascii=False)
            f.write("\n")  # Each revision is a separate JSON line

def compare_revisions(rev1, rev2):
    url = 'https://api.wikimedia.org/core/v1/wikipedia/en/revision/' + rev1 + '/compare/' + rev2
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    print(response.json())
    if response.status_code != 200:
        raise Exception(f"Error fetching revision comparison: {response.status_code}")
    return response.json()
def main():
    n = 1  # Number of top articles to fetch
    year, month, day = "2024", "05", "01"
    revisions_from = "2024-04-30T13:16:57Z"
    output_dir = "hdfs_data"
    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    
    # Use your existing `get_most_popular_articles` function
    popular_articles = get_most_popular_articles(n, year, month, day)
    
    for idx, (title, views) in enumerate(popular_articles, start=1):
        print(f"\n#{idx}: {title} ({views} views)")
        
        # Fetch revision count
        revs = get_revision_count(title)
        print(f"Total revisions: {revs['count']}")

        # Fetch revisions
        print(f"Fetching article revisions... after {revisions_from}")
        revisions = loop_through_revisions(title, from_date=revisions_from)
        print(revisions[0])
        print(revisions[1])
        # Save revisions to HDFS-compatible format
        compare_revisions("1265626085","1261527387")
        output_file = os.path.join(output_dir, f"{title}_revisions.jsonl")
        save_revisions_to_hdfs(revisions, output_file)
        print(f"Revisions saved to: {output_file}")

if __name__ == "__main__":
    main()


Fetching the top 1 articles on Wikipedia for 2024-05-01...

#1: International_Workers'_Day (553048 views)
{'count': 2674, 'limit': False}
Total revisions: 2674
Fetching article revisions... after 2024-04-30T13:16:57Z
Gathered 100 revisions for article: International_Workers'_Day
{'id': 1265626085, 'timestamp': '2024-12-27T21:28:51Z', 'minor': False, 'size': 178623, 'comment': '/* Singapore */', 'user': {'id': 46117955, 'name': 'Karttapelimies'}, 'delta': 10}
{'id': 1261527387, 'timestamp': '2024-12-06T15:49:39Z', 'minor': True, 'size': 178613, 'comment': 'Reverted edit by [[Special:Contribs/204.112.138.190|204.112.138.190]] ([[User talk:204.112.138.190|talk]]) to last version by Aadirulez8', 'user': {'id': 27199084, 'name': 'Entranced98'}, 'delta': -36}
{'from': {'id': 1265626085, 'slot_role': 'main', 'sections': [{'level': 2, 'heading': "==Origin==<!-- The map is quite incorrect, and has therefore been commented out until corrected.\n[[File:International Observance of Labour Day.png|t

In [27]:
import requests

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

cats = [
    "Category:Russo-Ukrainian War",
    "Category:2010s conflicts",
    "Category:2020s conflicts",
    "Category:2010s in Russia",
    "Category:2010s in Ukraine",
    "Category:2020s in Russia",
    "Category:2020s in Ukraine",
    "Category:Conflicts in Ukraine",
    "Category:Invasions by Russia",
    "Category:Invasions of Ukraine",
    "Category:Wars involving Russia",
    "Category:Wars involving Ukraine",
    "Category:Conflicts in territory of the former Soviet Union",
    "Category:Russia–Ukraine military relations",
    "Category:Russian irredentism",
    "Category:Vladimir Putin",
    "Category:Petro Poroshenko",
    "Category:Volodymyr Zelenskyy",
    "Category:21st-century military history of Russia",
    "Category:21st-century military history of Ukraine"
]
articleTitles = []
for category in cats:
    PARAMS = {
        "action": "query",
        "cmtitle": category,
        "cmlimit": "100",
        "list": "categorymembers",
        "format": "json"
    }

    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()

    PAGES = DATA['query']['categorymembers']
    print(len(PAGES))
    
    for page in PAGES:
        articleTitles.append(page['title'])
        print(page['title'])
print(len(set(articleTitles)))
#if the title contains "Category: drop it"

correctTitles = [tit for tit in set(articleTitles) if not tit.startswith("Category:")]

# Print or use the filtered list
print(correctTitles)




91
Russo-Ukrainian War
Annexation of Crimea by the Russian Federation
Russian invasion of Ukraine
War in Donbas
Outline of the Russo-Ukrainian War
2014 pro-Russian unrest in Ukraine
2015 Ukraine power grid hack
2024 visits by Viktor Orbán to Russia and China
Anti-Maidan
Best in Hell
Book of Remembrance for Those Who Fell for Ukraine
Bridges in the Russo-Ukrainian War
Confiscation of Russian central bank funds
Crimean consensus
Croatia and the Russo-Ukrainian War
2017 Ukraine ransomware attacks
Environmental impact of the Russian occupation of Crimea
European Deterrence Initiative
Georgia and the Russian invasion of Ukraine
Humanitarian impacts of the Russian invasion of Ukraine
Humanitarian situation during the war in Donbas
Hydraulic warfare
Intermarium (region)
International recognition of the Donetsk People's Republic and the Luhansk People's Republic
International sanctions during the Russian invasion of Ukraine
International sanctions during the Russo-Ukrainian War
Klintsy oil dep

In [28]:
print(len(correctTitles))

save_to_hdfs()

462


Fetching batch 1/10...
Failed to fetch batch 1: [Errno 2] No such file or directory: ''
Fetching batch 2/10...
Failed to fetch batch 2: [Errno 2] No such file or directory: ''
Fetching batch 3/10...


KeyboardInterrupt: 