In [27]:
import requests

def get_most_popular_articles(n, year, month, day):
    """
    Fetch the top N most-viewed English Wikipedia articles for a given date.

    'Main_Page' and every title starting with 'Special:' (for example
    'Special:Search') are removed first; the remaining list is then cut to N
    entries, so up to N real articles are returned.

    Args:
        n: Maximum number of (title, views) pairs to return.
        year, month, day: Date components inserted verbatim into the URL path
            (the notebook passes zero-padded strings such as "2024", "05", "01").

    Returns:
        A list of (title, views) tuples in the order the API lists them.

    Raises:
        Exception: If the pageviews endpoint answers with a non-200 status.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    # Without a timeout, requests waits forever on a stalled connection and the
    # cell can only be stopped with a KeyboardInterrupt.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")

    data = response.json()
    # Filter out 'Main_Page' and 'Special:' entries before truncating to n.
    articles = [
        (article['article'], article['views'])
        for article in data['items'][0]['articles']
        if not article['article'].startswith("Special:") and article['article'] != "Main_Page"
    ]
    return articles[:n]


def get_article_text(title):
    """
    Fetch the summary extract of a Wikipedia article.

    Despite the function name, this calls the REST ``page/summary`` endpoint,
    so the returned text is the article's short extract, not its full body.

    Args:
        title: Article title; spaces are converted to underscores.

    Returns:
        The 'extract' field of the response, 'No content available' when that
        field is absent, or a "No content available for ... (404 Error)"
        message when the page does not exist.

    Raises:
        Exception: For any status other than 200 or 404.
    """
    from urllib.parse import quote

    title = title.replace(" ", "_")  # Replace spaces with underscores for API compatibility
    # Percent-encode the title so characters such as '/', '?' or '#' stay
    # inside a single path segment instead of altering the URL structure.
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{quote(title, safe='')}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    # A timeout keeps one stalled request from blocking a long crawl forever.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code == 404:
        return f"No content available for {title}. (404 Error)"
    elif response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")

    data = response.json()
    return data.get('extract', 'No content available')

def main():
    """
    Print the most popular articles for one hard-coded date, each followed by
    the first 500 characters of the text returned by get_article_text().
    """
    top_n = 1000  # How many ranked articles to request
    year, month, day = ("2024", "05", "01")  # Date whose ranking is fetched

    print(f"Fetching the top {top_n} articles on Wikipedia for {year}-{month}-{day}...")
    ranked = get_most_popular_articles(top_n, year, month, day)

    rank = 0
    for title, views in ranked:
        rank += 1
        print(f"\n#{rank}: {title} ({views} views)")
        print("Fetching article text...")
        summary = get_article_text(title)
        # An empty/falsy result gets the fallback line instead of a preview.
        if summary:
            line = f"Summary: {summary[:500]}..."
        else:
            line = "Summary: No content available"
        print(line)

# Entry point: run main() only when this code executes as the top-level
# module, not when it is imported.
if __name__ == "__main__":
    main()


Fetching the top 1000 articles on Wikipedia for 2024-05-01...

#1: International_Workers'_Day (553048 views)
Fetching article text...
Summary: International Workers' Day, also known as Labour Day in some countries and often referred to as May Day, is a celebration of labourers and the working classes that is promoted by the international labour movement and occurs every year on 1 May, or the first Monday in May....

#2: Labour_Day (463629 views)
Fetching article text...
Summary: Labour Day is an annual day of celebration of the achievements of workers. It has its origins in the labour union movement, specifically the eight-hour day movement, which advocated eight hours for work, eight hours for recreation, and eight hours for rest....

#3: Indian_Premier_League (350581 views)
Fetching article text...
Summary: The Indian Premier League, also known as IPL and Tata IPL for sponsorship reasons, is a men's T20 cricket league held annually in India. Founded by the BCCI in 2007, the league fe

KeyboardInterrupt: 

In [None]:
import requests

def get_full_article(title):
    """
    Fetch the full content of a Wikipedia article in wikitext format.

    Args:
        title: Article title; spaces are converted to underscores.

    Returns:
        The wikitext of the main slot of the revision returned by the API, or
        a "No content available for ..." message when the response contains
        no page with revisions (for example, a missing page).

    Raises:
        Exception: On a non-200 HTTP status, or when the response body carries
            an API-level "error" object.
    """
    title = title.replace(" ", "_")  # Replace spaces with underscores for API compatibility
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": title
    }
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project info
    }

    # A timeout prevents a stalled connection from hanging the cell forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")

    data = response.json()
    # The Action API can report a failed request inside a 200 response; turn
    # that into a clear error instead of a bare KeyError on "query".
    if "error" in data:
        error = data["error"]
        detail = error.get("info", error) if isinstance(error, dict) else error
        raise Exception(f"Error fetching article {title}: {detail}")

    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), None)  # Get the first page, if any
    if page is None or "revisions" not in page:
        return f"No content available for {title}."

    return page["revisions"][0]["slots"]["main"]["*"]

def main():
    """
    Download the wikitext of one hard-coded article and print a preview of it.
    """
    preview_length = 1000  # Number of leading characters shown
    article_title = "Artificial intelligence"

    print(f"Fetching full article for: {article_title}")
    wikitext = get_full_article(article_title)
    preview = wikitext[:preview_length]
    print(preview)


# Entry point: run main() only when this code executes as the top-level
# module, not when it is imported.
if __name__ == "__main__":
    main()


Most popular 20k articles

In [6]:
import requests
import time

def get_most_popular_articles(n, year, month, day):
    """
    Fetch the top N most-viewed English Wikipedia articles for a given date.

    'Main_Page' and every title starting with 'Special:' are removed BEFORE
    the list is cut to N entries, so the caller receives up to N real
    articles rather than N minus however many non-article pages ranked in
    the top N.

    Args:
        n: Maximum number of (title, views) pairs to return.
        year, month, day: Date components inserted verbatim into the URL path.

    Returns:
        A list of (title, views) tuples in the order the API lists them.

    Raises:
        Exception: If the pageviews endpoint answers with a non-200 status.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    # A timeout prevents a stalled connection from hanging the cell forever.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")

    data = response.json()
    articles = [
        (article['article'], article['views'])
        for article in data['items'][0]['articles']
        if not article['article'].startswith("Special:") and article['article'] != "Main_Page"
    ]
    # Truncate only after filtering so non-article pages do not eat into n.
    return articles[:n]


In [26]:
import requests
import time
import json
import os

def get_most_popular_articles(n, year, month, day):
    """
    Fetch the top N most-viewed English Wikipedia articles for a given date.

    'Main_Page' and every title starting with 'Special:' are removed BEFORE
    the list is cut to N entries, so the caller receives up to N real
    articles rather than N minus however many non-article pages ranked in
    the top N.

    Args:
        n: Maximum number of (title, views) pairs to return.
        year, month, day: Date components inserted verbatim into the URL path.

    Returns:
        A list of (title, views) tuples in the order the API lists them.

    Raises:
        Exception: If the pageviews endpoint answers with a non-200 status.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    # A timeout prevents a stalled connection from hanging the cell forever.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")

    data = response.json()
    articles = [
        (article['article'], article['views'])
        for article in data['items'][0]['articles']
        if not article['article'].startswith("Special:") and article['article'] != "Main_Page"
    ]
    # Truncate only after filtering so non-article pages do not eat into n.
    return articles[:n]

def fetch_articles_in_batches(titles_batch):
    """
    Fetch the wikitext of several articles (up to 50 titles) from the
    MediaWiki Action API and return the decoded JSON response.

    When a response carries a "continue" element, the request is repeated
    with those continuation parameters and any revisions delivered by the
    follow-up responses are merged into the first response's pages. Without
    this, pages whose content was deferred would come back without a
    "revisions" key.

    Args:
        titles_batch: List of article titles; joined with "|" for the API.

    Returns:
        The first response's JSON dict, with data["query"]["pages"] completed
        from any continuation responses and the "continue" key removed.

    Raises:
        Exception: If any request answers with a non-200 status.
    """
    url = "https://en.wikipedia.org/w/api.php"
    base_params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles_batch)  # Batch up to 50 titles
    }
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}

    merged = None
    continuation = {}
    while True:
        # A timeout keeps one stalled request from blocking the whole crawl.
        response = requests.get(
            url,
            headers=headers,
            params={**base_params, **continuation},
            timeout=30,
        )
        if response.status_code != 200:
            raise Exception(f"Error fetching articles: {response.status_code}")
        data = response.json()

        if merged is None:
            merged = data
        else:
            merged_pages = merged.setdefault("query", {}).setdefault("pages", {})
            for page_id, page_info in data.get("query", {}).get("pages", {}).items():
                target = merged_pages.setdefault(page_id, page_info)
                # Only fill in content that earlier responses did not deliver.
                if "revisions" in page_info and "revisions" not in target:
                    target["revisions"] = page_info["revisions"]

        if "continue" not in data:
            break
        continuation = data["continue"]

    # The merged result is complete, so the continuation marker is stale.
    merged.pop("continue", None)
    return merged

def save_to_hdfs(data, file_path):
    """
    Append records to a local JSON Lines file (one JSON object per line).

    Note that this writes to the local filesystem with open(); it does not
    talk to HDFS itself. The file is opened in append mode, so repeated calls
    accumulate lines.

    Args:
        data: Iterable of JSON-serialisable objects (the notebook passes dicts
            with "title" and "text").
        file_path: Destination file. Its parent directory is created if
            needed; a bare file name is written to the current directory.
    """
    directory = os.path.dirname(file_path)
    # dirname() is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        for article in data:
            json.dump(article, f, ensure_ascii=False)
            f.write("\n")  # Write each article as a separate JSON line

def main():
    """
    Download the wikitext of the most popular articles for one hard-coded
    date in batches of 50 titles and append them to a JSON Lines file.

    A batch that raises is reported and skipped; the loop then continues with
    the next batch after the usual one-second pause.
    """
    # NOTE(review): the run recorded under this cell got 995 titles back for
    # n=20000, so the ranking endpoint appears to return far fewer entries
    # than requested here. Confirm the endpoint's limit.
    n = 20000  # Number of top articles to fetch
    year, month, day = "2024", "10", "01"
    output_file = "hdfs_data/wikipedia_articles" + year + month + day + ".jsonl"
    batch_size = 50  # Max 50 titles per batch
    total_saved = 0  # Running count of articles written

    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    popular_articles = get_most_popular_articles(n, year, month, day)
    print(len(popular_articles))

    all_titles = [title for title, _ in popular_articles]
    batches = [
        all_titles[start:start + batch_size]
        for start in range(0, len(all_titles), batch_size)
    ]
    batch_count = len(batches)

    batch_number = 0
    for batch in batches:
        batch_number += 1
        print(f"Fetching batch {batch_number}/{batch_count}...")
        try:
            data = fetch_articles_in_batches(batch)
            # Pages without a "revisions" entry get a placeholder text.
            processed_data = [
                {
                    "title": page_info["title"],
                    "text": (
                        page_info["revisions"][0]["slots"]["main"]["*"]
                        if "revisions" in page_info
                        else "No content available"
                    ),
                }
                for page_info in data["query"]["pages"].values()
            ]
            save_to_hdfs(processed_data, output_file)
            total_saved += len(processed_data)
        except Exception as e:
            print(f"Failed to fetch batch {batch_number}: {e}")
        time.sleep(1)  # Wait 1 second between batches

    print(f"Total articles saved: {total_saved}")

# Entry point: run main() only when this code executes as the top-level
# module, not when it is imported.
if __name__ == "__main__":
    main()


Fetching the top 20000 articles on Wikipedia for 2024-10-01...
995
Fetching batch 1/20...


KeyboardInterrupt: 

# Getting revisions

In [69]:
import requests
import time



def get_article_revisions(title, older_than=None):
    """
    Fetch one page of an article's revision history from the Wikimedia
    core REST API.

    Args:
        title: Article title as used in the URL (the notebook passes titles
            with underscores, e.g. "International_Workers'_Day").
        older_than: Optional revision id; when truthy it is sent as the
            'older_than' query parameter to request the next, older page.

    Returns:
        The list stored under the "revisions" key of the response.

    Raises:
        Exception: If the endpoint answers with a non-200 status.
    """
    from urllib.parse import quote

    # Percent-encode the title so characters such as '/', '?' or '#' stay
    # inside the title path segment instead of altering the URL structure.
    url = 'https://api.wikimedia.org/core/v1/wikipedia/en/page/' + quote(title, safe='') + '/history'
    parameters = {}
    if older_than:
        parameters['older_than'] = older_than
    headers = {
            'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
        }
    # A timeout prevents a stalled connection from hanging the pagination loop.
    response = requests.get(url, headers=headers, params=parameters, timeout=30)
    # Fail with the status code instead of a KeyError on an error body that
    # has no "revisions" key.
    if response.status_code != 200:
        raise Exception(f"Error fetching revisions for {title}: {response.status_code}")
    data = response.json()
    return data["revisions"]

def loop_through_revisions(title, from_date=None, olderThanId=None):
    """
    Collect an article's revisions page by page via get_article_revisions().

    Paging stops when a page comes back empty or, if from_date is given, when
    the last revision of a page has a timestamp at or before from_date. That
    final page is kept whole, so the result can include revisions older than
    from_date.

    Args:
        title: Article title passed through to get_article_revisions().
        from_date: Optional timestamp string compared lexicographically with
            each revision's "timestamp" (the notebook uses the form
            "2024-04-30T13:16:57Z"). None means "no lower bound": page until
            the history is exhausted.
        olderThanId: Optional revision id to start paging from.

    Returns:
        List of revision dicts in the order the pages were received.
    """
    revisions = []
    while True:
        new_revisions = get_article_revisions(title, older_than=olderThanId)
        if not new_revisions:
            break
        revisions.extend(new_revisions)
        # The last entry of a page is treated as its oldest revision.
        oldest = new_revisions[-1]
        # Comparing a str with None raises TypeError, so skip the date check
        # when no lower bound was requested.
        if from_date is not None and oldest["timestamp"] <= from_date:
            break
        olderThanId = oldest["id"]
    print(f"Gathered {len(revisions)} for article: {title} ")
    return revisions
        
    
def get_revision_count(title):
    """
    Fetch the edit-count record of an article from the Wikimedia core REST API.

    Args:
        title: Article title as used in the URL.

    Returns:
        The decoded JSON response; callers in this notebook read its 'count'
        key.

    Raises:
        Exception: If the endpoint answers with a non-200 status.
    """
    from urllib.parse import quote

    # Percent-encode the title so characters such as '/', '?' or '#' stay
    # inside the title path segment instead of altering the URL structure.
    url = 'https://api.wikimedia.org/core/v1/wikipedia/en/page/' + quote(title, safe='') + '/history/counts/edits'
    headers = {
            'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
        }
    # A timeout prevents a stalled connection from hanging the cell forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP failures here rather than as a KeyError on 'count' later.
    if response.status_code != 200:
        raise Exception(f"Error fetching revision count for {title}: {response.status_code}")
    data = response.json()

    return data

def main():
    """
    For the top article(s) of one hard-coded date, print the edit count and
    then gather the revision history back to a fixed cut-off timestamp.

    Relies on get_most_popular_articles() having been defined by an earlier
    cell of the notebook.
    """
    top_n = 1  # Number of top articles to fetch
    year, month, day = ("2024", "05", "01")  # Date whose ranking is fetched
    revisions_from = "2024-04-30T13:16:57Z"  # Paging stops at this timestamp

    print(f"Fetching the top {top_n} articles on Wikipedia for {year}-{month}-{day}...")
    ranked = get_most_popular_articles(top_n, year, month, day)

    position = 0
    for title, _views in ranked:
        position += 1
        counts = get_revision_count(title)
        print(f"\n#{position}: {title} ({counts['count']} revisions)")
        print(f"Fetching article revisions... after {revisions_from}")

        # The helper prints its own "Gathered ..." line; its return value is
        # not used here.
        loop_through_revisions(title, from_date=revisions_from)
        
# Entry point: run main() only when this code executes as the top-level
# module, not when it is imported.
if __name__ == "__main__":
    main()


Fetching the top 1 articles on Wikipedia for 2024-05-01...

#1: International_Workers'_Day (2674 revisions)
Fetching article revisions... after 2024-04-30T13:16:57Z
Gathered 100 for article: International_Workers'_Day 
