In [None]:
import requests

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on English Wikipedia for a given
    date, excluding 'Main_Page' and any title starting with 'Special:'.

    Filtering happens before truncation, so up to ``n`` real articles are
    returned.

    Args:
        n: Maximum number of (title, views) pairs to return.
        year: Year of the date, e.g. "2024".
        month: Month as a string or int; zero-padded to two digits.
        day: Day as a string or int; zero-padded to two digits.

    Returns:
        A list of (article_title, view_count) tuples, in the order the API
        listed them.

    Raises:
        Exception: If the API answers with a non-200 status code.
    """
    # The endpoint expects MM and DD; pad so that 5 or "5" do not build a bad URL.
    month = str(month).zfill(2)
    day = str(day).zfill(2)
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")

    data = response.json()
    # Filter out 'Main_Page' and 'Special:' entries
    articles = [
        (article['article'], article['views'])
        for article in data['items'][0]['articles']
        if not article['article'].startswith("Special:") and article['article'] != "Main_Page"
    ]
    return articles[:n]


def get_article_text(title):
    """
    Fetches the summary extract of a Wikipedia article.

    This calls the REST ``page/summary`` endpoint and returns its 'extract'
    field, i.e. a short summary rather than the full article body.

    Args:
        title: Article title; spaces are converted to underscores.

    Returns:
        The 'extract' string from the response, 'No content available' when
        the response has no such field, or a "No content available for
        <title>. (404 Error)" message when the API answers 404.

    Raises:
        Exception: If the API answers with a status other than 200 or 404.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    title = title.replace(" ", "_")  # Replace spaces with underscores for API compatibility
    # Encode the title as a single path segment; otherwise "/" or "?" inside a
    # title (e.g. "AC/DC") would be interpreted as URL structure.
    encoded_title = quote(title, safe="")
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{encoded_title}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code == 404:
        return f"No content available for {title}. (404 Error)"
    elif response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")

    data = response.json()
    return data.get('extract', 'No content available')

def main():
    """
    Demo: list the most-viewed articles for a fixed date and print a preview
    of each article's text.

    For every ranked article the title and view count are printed, followed
    by the first 500 characters of the text returned by get_article_text.
    """
    n = 1000  # how many ranked articles to request
    year, month, day = "2024", "05", "01"  # date whose ranking is fetched

    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    ranking = get_most_popular_articles(n, year, month, day)

    idx = 0
    for title, views in ranking:
        idx += 1  # 1-based rank shown to the reader
        print(f"\n#{idx}: {title} ({views} views)")
        print("Fetching article text...")
        text = get_article_text(title)
        if text:
            print(f"Summary: {text[:500]}...")
        else:
            print("Summary: No content available")

# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()


In [None]:
import requests

def get_full_article(title):
    """
    Fetches the content of a Wikipedia article in wikitext format through the
    MediaWiki Action API (action=query, prop=revisions).

    Args:
        title: Article title; spaces are converted to underscores.

    Returns:
        The content of the 'main' slot of the first revision of the first
        page in the response, or "No content available for <title>." when
        the response contains no page or the page has no revisions.

    Raises:
        Exception: If the API answers with a non-200 status code.
    """
    title = title.replace(" ", "_")  # Replace spaces with underscores for API compatibility
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": title
    }
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project info
    }

    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(url, headers=headers, params=params, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")

    data = response.json()
    # A 200 response is not guaranteed to carry "query"/"pages" (e.g. for an
    # empty title), so look them up defensively instead of indexing.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), None)  # Get the first page, if any
    if page is None or "revisions" not in page:
        return f"No content available for {title}."

    return page["revisions"][0]["slots"]["main"]["*"]

def main():
    """
    Demo: download one article's wikitext and print its first 1000 characters.
    """
    preview_length = 1000
    article_title = "Artificial intelligence"

    print(f"Fetching full article for: {article_title}")
    wikitext = get_full_article(article_title)
    preview = wikitext[:preview_length]  # keep the cell output short
    print(preview)


# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()


Most popular 20k articles

In [6]:
import requests
import time

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on Wikipedia for a given date.

    'Main_Page' and titles starting with 'Special:' are removed BEFORE the
    list is cut to ``n`` entries, so excluded pages no longer reduce the
    number of articles returned.

    NOTE(review): the endpoint appears to list only about the top 1000
    articles per day, so a larger ``n`` yields fewer results - confirm.

    Args:
        n: Maximum number of (title, views) pairs to return.
        year: Year of the date, e.g. "2024".
        month: Month as a string or int; zero-padded to two digits.
        day: Day as a string or int; zero-padded to two digits.

    Returns:
        A list of (article_title, view_count) tuples, in the order the API
        listed them.

    Raises:
        Exception: If the API answers with a non-200 status code.
    """
    # The endpoint expects MM and DD; pad so that 5 or "5" do not build a bad URL.
    month = str(month).zfill(2)
    day = str(day).zfill(2)
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")

    data = response.json()
    # Filter first, truncate second: slicing before filtering would return
    # fewer than n articles whenever an excluded page is in the first n rows.
    articles = [
        (article['article'], article['views'])
        for article in data['items'][0]['articles']
        if not article['article'].startswith("Special:") and article['article'] != "Main_Page"
    ]
    return articles[:n]


In [None]:
import requests
import time
import json
import os

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on Wikipedia for a given date.

    'Main_Page' and titles starting with 'Special:' are removed BEFORE the
    list is cut to ``n`` entries, so excluded pages no longer reduce the
    number of articles returned.

    NOTE(review): the endpoint appears to list only about the top 1000
    articles per day, so a larger ``n`` yields fewer results - confirm.

    Args:
        n: Maximum number of (title, views) pairs to return.
        year: Year of the date, e.g. "2024".
        month: Month as a string or int; zero-padded to two digits.
        day: Day as a string or int; zero-padded to two digits.

    Returns:
        A list of (article_title, view_count) tuples, in the order the API
        listed them.

    Raises:
        Exception: If the API answers with a non-200 status code.
    """
    # The endpoint expects MM and DD; pad so that 5 or "5" do not build a bad URL.
    month = str(month).zfill(2)
    day = str(day).zfill(2)
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")

    data = response.json()
    # Filter first, truncate second: slicing before filtering would return
    # fewer than n articles whenever an excluded page is in the first n rows.
    articles = [
        (article['article'], article['views'])
        for article in data['items'][0]['articles']
        if not article['article'].startswith("Special:") and article['article'] != "Main_Page"
    ]
    return articles[:n]

def fetch_articles_in_batches(titles_batch):
    """
    Fetches the wikitext of several articles in a single MediaWiki Action API
    call (action=query, prop=revisions).

    NOTE(review): the API may truncate large responses and signal this with a
    'continue' key; this function does not follow continuation - confirm
    whether that matters for the batch sizes used.

    Args:
        titles_batch: Iterable of article titles; they are joined with "|"
            into one 'titles' parameter. The caller in this notebook sends at
            most 50 titles per call.

    Returns:
        The decoded JSON response as a dict. For an empty batch no request is
        made and ``{"query": {"pages": {}}}`` is returned, so callers can
        iterate over the pages uniformly.

    Raises:
        Exception: If the HTTP status is not 200, or if the response body
            contains a top-level 'error' entry.
    """
    titles = list(titles_batch)
    if not titles:
        # Nothing to ask for; skip the round trip and hand back an empty result.
        return {"query": {"pages": {}}}

    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles)  # Batch up to 50 titles
    }
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}
    # Without a timeout a stalled connection would block the kernel indefinitely.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Error fetching articles: {response.status_code}")

    data = response.json()
    # Surface an API-level error here rather than letting the caller fail
    # later with an uninformative KeyError on the missing "query" key.
    error = data.get("error")
    if error:
        detail = error.get("info", error) if isinstance(error, dict) else error
        raise Exception(f"Error fetching articles: {detail}")
    return data

def save_to_hdfs(data, file_path):
    """
    Appends records to a JSON Lines file (one JSON object per line).

    Despite the name, this writes through the built-in ``open`` to the local
    filesystem; the resulting file is meant for later HDFS ingestion.

    Args:
        data: Iterable of JSON-serializable records.
        file_path: Destination file. Missing parent directories are created;
            a bare filename (no directory part) is written to the current
            working directory.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        for article in data:
            # Serialize fully before writing so a record that cannot be
            # encoded never leaves a partial line in the file.
            f.write(json.dumps(article, ensure_ascii=False) + "\n")

def main():
    """
    Downloads the wikitext of the day's most-viewed articles into a JSON
    Lines file.

    Looks up the ranking for the hard-coded date, requests the articles in
    groups of 50 titles, and appends one {"title", "text"} record per
    returned page to the output file. A failing batch is reported and
    skipped; the loop pauses one second after every batch.
    """
    n = 20000  # Number of top articles to fetch
    year, month, day = "2024", "10", "01"
    destination = "".join(["hdfs_data/wikipedia_articles", year, month, day, ".jsonl"])
    titles_per_call = 50  # Max 50 titles per batch

    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    ranking = get_most_popular_articles(n, year, month, day)
    print(len(ranking))

    # Only the titles are needed from here on; view counts are dropped.
    titles = [name for name, _views in ranking]
    batches = []
    for start in range(0, len(titles), titles_per_call):
        batches.append(titles[start:start + titles_per_call])

    total_saved = 0  # Running count of records written to the file
    idx = 0
    for batch in batches:
        idx += 1
        print(f"Fetching batch {idx}/{len(batches)}...")
        try:
            response_json = fetch_articles_in_batches(batch)
            records = []
            for page in response_json["query"]["pages"].values():
                title = page["title"]
                # Pages without a "revisions" entry get a placeholder text.
                text = (
                    page["revisions"][0]["slots"]["main"]["*"]
                    if "revisions" in page
                    else "No content available"
                )
                records.append({"title": title, "text": text})
            save_to_hdfs(records, destination)
            total_saved += len(records)
        except Exception as e:
            print(f"Failed to fetch batch {idx}: {e}")
        time.sleep(1)  # Wait 1 second between batches

    print(f"Total articles saved: {total_saved}")

# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()


Fetching the top 20000 articles on Wikipedia for 2024-10-01...
[('Pete_Rose', 746388), ('Wikipedia:Featured_pictures', 705925), ('Lyle_and_Erik_Menéndez', 457101), ('Jimmy_Carter', 423985), ('John_Amos', 393012), ('Kris_Kristofferson', 372086), ('Dikembe_Mutombo', 306240), ('Gavin_Creel', 236011), ('Lyle_and_Erik_Menendez', 218549), ('Peter_Dante', 210702), ('Sean_Combs', 196775), ('Deaths_in_2024', 189300), ('Devara:_Part_1', 187502), ('Joker:_Folie_à_Deux', 161936), ('Cleopatra', 143666), ('Hezbollah', 142250), ('Adam_Brody', 125334), ('Ballistic_missile', 121167), ('Lebanon', 111117), ('Maggie_Smith', 103737), ('Iron_Dome', 103216), ('Hassan_Nasrallah', 102275), ('Megalopolis_(film)', 101295), ('Israel', 99829), ('wiki.phtml', 97799), ('The_Substance', 97513), ('Monsters:_The_Lyle_and_Erik_Menendez_Story', 92066), ('Deadpool_&_Wolverine', 90294), ('Ken_Page', 88986), ('Hurricane_Helene', 87423), ('Israel–Hezbollah_conflict_(2023–present)', 85559), ('Claudia_Sheinbaum', 85462), ('Ira