In [1]:
import requests

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on Wikipedia for a given date,
    excluding special pages like 'Special:Search'.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")
    
    data = response.json()
    # Filter out 'Main_Page' and 'Special:' entries
    articles = [
        (article['article'], article['views'])
        for article in data['items'][0]['articles']
        if not article['article'].startswith("Special:") and article['article'] != "Main_Page"
    ]
    return articles[:n]


def get_article_text(title):
    """
    Fetches the full text of a Wikipedia article.
    """
    title = title.replace(" ", "_")  # Replace spaces with underscores for API compatibility
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code == 404:
        return f"No content available for {title}. (404 Error)"
    elif response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")
    
    data = response.json()
    return data.get('extract', 'No content available')

def main():
    n = 1000  # Number of top articles to fetch
    year, month, day = "2024", "05", "01"  # Year, Month, and Day for the date
    
    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    popular_articles = get_most_popular_articles(n, year, month, day)
    
    for idx, (title, views) in enumerate(popular_articles, start=1):
        print(f"\n#{idx}: {title} ({views} views)")
        print("Fetching article text...")
        text = get_article_text(title)
        print(f"Summary: {text[:500]}..." if text else "Summary: No content available")

if __name__ == "__main__":
    main()


Fetching the top 1000 articles on Wikipedia for 2024-05-01...

#1: International_Workers'_Day (553048 views)
Fetching article text...
Summary: International Workers' Day, also known as Labour Day in some countries and often referred to as May Day, is a celebration of labourers and the working classes that is promoted by the international labour movement and occurs every year on 1 May, or the first Monday in May....

#2: Labour_Day (463629 views)
Fetching article text...
Summary: Labour Day is an annual day of celebration of the achievements of workers. It has its origins in the labour union movement, specifically the eight-hour day movement, which advocated eight hours for work, eight hours for recreation, and eight hours for rest....

#3: Indian_Premier_League (350581 views)
Fetching article text...
Summary: The Indian Premier League, also known as IPL and Tata IPL for sponsorship reasons, is a men's T20 cricket league held annually in India. Founded by the BCCI in 2007, the league fe

KeyboardInterrupt: 

In [31]:
%pip install tqdm

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


In [11]:
import requests

def get_full_article(title):
    """
    Fetches the full content of a Wikipedia article in wikitext format.
    """
    title = title.replace(" ", "_")  # Replace spaces with underscores for API compatibility
    url = f"https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": title
    }
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project info
    }
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")
    
    data = response.json()
    pages = data["query"]["pages"]
    page = next(iter(pages.values()))  # Get the first page
    if "revisions" not in page:
        return f"No content available for {title}."
    
    return page["revisions"][0]["slots"]["main"]["*"]

def main():
    article_title = "Artificial intelligence"
    print(f"Fetching full article for: {article_title}")
    content = get_full_article(article_title)
    print(content[:1000])  # Print the first 1000 characters of the article
    #print(content)  # Print the first 1000 characters of the article


if __name__ == "__main__":
    main()


Fetching full article for: Artificial intelligence
{{Short description|Intelligence of machines}}
{{Redirect|AI|other uses|AI (disambiguation)|and|Artificial intelligence (disambiguation)}}<!-- related -->
{{Use dmy dates|date=July 2023}}{{Pp|small=yes}}<!-- details only a Wikipedian could love -->
{{Artificial intelligence}}<!-- portal -->

<!-- DEFINITIONS -->
'''Artificial intelligence''' ('''AI'''), in its broadest sense, is [[intelligence]] exhibited by [[machine]]s, particularly [[computer|computer systems]]. It is a [[field of research]] in [[computer science]] that develops and studies methods and [[software]] that enable machines to [[Machine perception|perceive their environment]] and use [[machine learning|learning]] and intelligence to take actions that maximize their chances of achieving defined goals.{{Sfnp|Russell|Norvig|2021|pp=1–4}} Such machines may be called AIs.

<!-- APPLICATIONS -->
High-profile [[applications of AI]] include advanced [[web search engine]]s (e.g.,

Most popular 20k articles

In [12]:
import requests
import time

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on Wikipedia for a given date.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")
    
    data = response.json()
    articles = data['items'][0]['articles'][:n]
    return [(article['article'], article['views']) for article in articles if not article['article'].startswith("Special:") and article['article'] != "Main_Page"]


In [20]:
import requests
import time
import json
import os

def get_most_popular_articles(n, year, month, day):
    """
    Fetches the top N most popular articles on Wikipedia for a given date.
    """
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{year}/{month}/{day}"
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        raise Exception(f"Error fetching top articles: {response.status_code}")
    
    data = response.json()
    articles = data['items'][0]['articles'][:n]
    return [(article['article'], article['views']) for article in articles if not article['article'].startswith("Special:") and article['article'] != "Main_Page"]

def fetch_articles_in_batches(titles_batch):
    """
    Fetches up to 50 articles in a single API call.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles_batch)  # Batch up to 50 titles
    }
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        raise Exception(f"Error fetching articles: {response.status_code}")
    return response.json()

def save_to_hdfs(data, file_path):
    """
    Saves the data to a file formatted for HDFS ingestion.
    """
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        for article in data:
            json.dump(article, f, ensure_ascii=False)
            f.write("\n")  # Write each article as a separate JSON line

def main():
    n = 20  # Number of top articles to fetch
    year, month, day = "2024", "10", "01"
    output_file = "hdfs_data/wikipedia_articles"+year+month+day+"small.jsonl"
    total_saved = 0  # Track the total number of articles saved
    
    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    popular_articles = get_most_popular_articles(n, year, month, day)
    print(len(popular_articles))
    all_titles = [title for title, views in popular_articles]
    batch_size = 50  # Max 50 titles per batch
    batches = [all_titles[i:i + batch_size] for i in range(0, len(all_titles), batch_size)]

    for idx, batch in enumerate(batches, start=1):
        print(f"Fetching batch {idx}/{len(batches)}...")
        try:
            data = fetch_articles_in_batches(batch)
            processed_data = []
            for page_id, page_info in data["query"]["pages"].items():
                title = page_info["title"]
                if "revisions" in page_info:
                    text = page_info["revisions"][0]["slots"]["main"]["*"]
                else:
                    text = "No content available"
                processed_data.append({"title": title, "text": text})
            save_to_hdfs(processed_data, output_file)
            total_saved += len(processed_data)
        except Exception as e:
            print(f"Failed to fetch batch {idx}: {e}")
        time.sleep(1)  # Wait 1 second between batches

    print(f"Total articles saved: {total_saved}")

if __name__ == "__main__":
    main()


Fetching the top 20 articles on Wikipedia for 2024-10-01...
18
Fetching batch 1/1...
Total articles saved: 18


# Getting revisions

In [None]:
import requests
import time
import json
import os

def get_article_revisions(title, older_than=None):
    """
    Fetches revisions for a given article.
    """
    url = f'https://api.wikimedia.org/core/v1/wikipedia/en/page/{title}/history'
    parameters = {}
    if older_than:
        parameters['older_than'] = older_than
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers, params=parameters)
    if response.status_code != 200:
        raise Exception(f"Error fetching revisions for {title}: {response.status_code}")
    data = response.json()
    return data.get("revisions", [])

def loop_through_revisions(title, from_date=None, olderThanId=None):
    """
    Loops through revisions for an article until a specific timestamp is reached.
    """
    revisions = []
    while True:
        new_revisions = get_article_revisions(title, older_than=olderThanId)
        if not new_revisions:
            break
        revisions.extend(new_revisions)
        if new_revisions[-1]["timestamp"] <= from_date:
            break
        olderThanId = new_revisions[-1]["id"]
    print(f"Gathered {len(revisions)} revisions for article: {title}")
    return revisions

def get_revision_count(title):
    """
    Fetches the total revision count for an article.
    """
    url = f'https://api.wikimedia.org/core/v1/wikipedia/en/page/{title}/history/counts/edits'
    headers = {
        'User-Agent': 'WikiSandbox/ManagingBigData'  # Replace with your project and email
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Error fetching revision count for {title}: {response.status_code}")
    return response.json()

def save_revisions_to_hdfs(revisions, filename):
    """
    Saves the revisions in an HDFS-compatible format (JSON Lines).
    """
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "a", encoding="utf-8") as f:
        for revision in revisions:
            json.dump(revision, f, ensure_ascii=False)
            f.write("\n")  # Each revision is a separate JSON line

def main():
    n = 100  # Number of top articles to fetch
    year, month, day = "2024", "05", "01"
    revisions_from = "2024-04-30T13:16:57Z"
    output_dir = "hdfs_data/revisions_per_top_article"
    print(f"Fetching the top {n} articles on Wikipedia for {year}-{month}-{day}...")
    
    # Use your existing `get_most_popular_articles` function
    popular_articles = get_most_popular_articles(n, year, month, day)
    
    for idx, (title, views) in enumerate(popular_articles, start=1):
        print(f"\n#{idx}: {title} ({views} views)")
        
        # Fetch revision count
        revs = get_revision_count(title)
        print(f"Total revisions: {revs['count']}")

        # Fetch revisions
        print(f"Fetching article revisions... after {revisions_from}")
        revisions = loop_through_revisions(title, from_date=revisions_from)
        
        # Save revisions to HDFS-compatible format
        output_file = os.path.join(output_dir, f"{title}_revisions.jsonl")
        save_revisions_to_hdfs(revisions, output_file)
        print(f"Revisions saved to: {output_file}")

if __name__ == "__main__":
    main()


In [35]:
import requests
from tqdm import tqdm

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

cats = [
    "Category:Russo-Ukrainian War",
    "Category:2010s conflicts",
    "Category:2020s conflicts",
    "Category:2010s in Russia",
    "Category:2010s in Ukraine",
    "Category:2020s in Russia",
    "Category:2020s in Ukraine",
    "Category:Conflicts in Ukraine",
    "Category:Invasions by Russia",
    "Category:Invasions of Ukraine",
    "Category:Wars involving Russia",
    "Category:Wars involving Ukraine",
    "Category:Conflicts in territory of the former Soviet Union",
    "Category:Russia–Ukraine military relations",
    "Category:Russian irredentism",
    "Category:Vladimir Putin",
    "Category:Petro Poroshenko",
    "Category:Volodymyr Zelenskyy",
    "Category:21st-century military history of Russia",
    "Category:21st-century military history of Ukraine"
]
articleTitles = []
def fetch_articles_from_categories(categories):
    """
    Fetches article titles from specified Wikipedia categories.
    
    Args:
        categories (list): List of Wikipedia category names
        
    Returns:
        list: List of unique article titles (excluding category pages)
    """
    article_titles = []
    
    for category in tqdm(categories, desc="Processing categories"):
        if category.startswith("Category:"):
            params = {
                "action": "query",
                "cmtitle": category,
                "cmlimit": "max",
                "list": "categorymembers",
                "format": "json"
            }
        else:    
            params = {
                "action": "query",
                "cmtitle":"Category:" + category,
                "cmlimit": len(article_titles),
                "list": "categorymembers",
                "format": "json"
            }

        response = S.get(url=URL, params=params)
        data = response.json()
        if "query" not in data.keys():
                continue
        else:
            pages = data['query']['categorymembers']
            # print(f"Found {len(pages)} articles in {category}")
            
            for page in pages:
                article_titles.append(page['title'])
                # print("current article titles nr: ", len(article_titles))
            

    # Remove duplicates and filter out category pages
    correct_titles = [title for title in set(article_titles) if not title.startswith("Category:")]
    print(f"\nTotal unique articles found: {len(correct_titles) }")
    
    return correct_titles

# Call the function and store results
correct_titles = fetch_articles_from_categories(cats)

# Print or use the filtered list





Processing categories: 100%|██████████| 20/20 [00:02<00:00,  7.51it/s]


Total unique articles found: 462





In [None]:
import json


# Path to your .jsonl file
file_path = "hdfs_data/New_wikipedia_articles.jsonl"


# Initialize a set to store unique words starting with "Category:"
categories = set()


# Read the .jsonl file
with open(file_path, 'r') as file:
   print("it opened a file")
   for line in file:
       # Parse each line as JSON
       data = json.loads(line)
       # Find words in all values (assuming string content)
       for key, value in data.items():
           if isinstance(value, str):  # Check if the value is a string
               wordies = value.split("{")
               for words in wordies:
                   words = words.split("[")
               categories.update(word.split(":")[1].replace(']','') for word in words if "Category:" in word)
               
               


# Convert set to list and sort (optional)
categories = sorted(categories)


# Print or use the result
print(categories[50])

print(len(categories))
goodOne = set(categories)
print(len(goodOne))




In [40]:
import requests
import time
import json
import os

def fetch_articles_in_batches(titles_batch):
    """
    Fetches up to 50 articles in a single API call.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": "|".join(titles_batch)  # Batch up to 50 titles
    }
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        raise Exception(f"Error fetching articles: {response.status_code}")
    return response.json()

def fetch_article(title):
    """
    Fetches a single article by title.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json",
        "titles": title
    }
    headers = {"User-Agent": "WikiSandbox/ManagingBigData"}
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        raise Exception(f"Error fetching article {title}: {response.status_code}")
    return response.json()

def save_to_hdfs(data, file_path):
    """
    Saves the data to a file formatted for HDFS ingestion.
    """
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "a", encoding="utf-8") as f:
        for article in data:
            json.dump(article, f, ensure_ascii=False)
            f.write("\n")  # Write each article as a separate JSON line

def fetch_and_save_articles(titles, output_file, batch_mode=True):
    """
    Fetches full text for a list of articles and saves them in an HDFS-compatible format.

    Args:
        titles (list): List of article titles to fetch.
        output_file (str): Path to the output file.
        batch_mode (bool): Whether to use batch fetching or individual fetching.
    """
    total_saved = 0  # Track the total number of articles saved
    batch_size = 50  # Max 50 titles per batch

    if batch_mode:
        # Process in batches
        batches = [titles[i:i + batch_size] for i in range(0, len(titles), batch_size)]
        for idx, batch in enumerate(batches, start=1):
            print(f"Fetching batch {idx}/{len(batches)}...")
            try:
                data = fetch_articles_in_batches(batch)
                processed_data = []
                for page_id, page_info in data["query"]["pages"].items():
                    title = page_info["title"]
                    if "revisions" in page_info:
                        text = page_info["revisions"][0]["slots"]["main"]["*"]
                    else:
                        text = "No content available"
                    processed_data.append({"title": title, "text": text})
                save_to_hdfs(processed_data, output_file)
                total_saved += len(processed_data)
            except Exception as e:
                print(f"Failed to fetch batch {idx}: {e}")
            time.sleep(1)  # Wait 1 second between batches
    else:
        # Process individually
        for idx, title in enumerate(titles, start=1):
            print(f"Fetching article {idx}/{len(titles)}: {title}...")
            try:
                data = fetch_article(title)
                processed_data = []
                for page_id, page_info in data["query"]["pages"].items():
                    if "revisions" in page_info:
                        text = page_info["revisions"][0]["slots"]["main"]["*"]
                    else:
                        text = "No content available"
                    processed_data.append({"title": page_info["title"], "text": text})
                save_to_hdfs(processed_data, output_file)
                total_saved += 1
            except Exception as e:
                print(f"Failed to fetch article {title}: {e}")
            time.sleep(1)  # Wait 1 second between requests

    print(f"Total articles saved: {total_saved}")

def main():
    titles = ["Python_(programming_language)", "Artificial_intelligence", "Machine_learning"]  # Example list of titles
    output_file = "hdfs_data/New_wikipedia_articles.jsonl"
    batch_mode = False  # Set to False to disable batch fetching
    titles = goodOne
    print(f"Fetching articles in {'batch' if batch_mode else 'individual'} mode...")
    fetch_and_save_articles(titles, output_file, batch_mode=batch_mode)

if __name__ == "__main__":
    main()


NameError: name 'goodOne' is not defined

In [4]:
import json

def extract_categories_from_jsonl(file_path):
    """
    Extracts unique categories from a JSONL file containing Wikipedia articles.
    
    Args:
        file_path (str): Path to the JSONL file
        
    Returns:
        set: Set of unique category names
    """
    categories = set()
    
    with open(file_path, 'r') as file:
        print("Reading file...")
        for line in file:
            data = json.loads(line)
            for key, value in data.items():
                if isinstance(value, str):
                    wordies = value.split("{")
                    for words in wordies:
                        words = words.split("[")
                    categories.update(word.split(":")[1].replace(']', '') 
                                   for word in words if "Category:" in word)
    
    return categories

# Example usage:
file_path = "hdfs_data/New_wikipedia_articles.jsonl"
categories = extract_categories_from_jsonl(file_path)
#write json to file
with open('hdfs_data/categories.json', 'w') as f:
    json.dump(list(categories), f, indent=2)  # Convert set to list and add indentation for readability

# Print results
print(f"Total unique categories found: {len(categories)}")
print(f"Sample category: {next(iter(categories))}")


Reading file...
Total unique categories found: 3175
Sample category: Russian irredentism



In [36]:
#read json file to retrieve categories
with open('hdfs_data/categories.json', 'r') as f:
    categories = json.load(f)

# Clean up category names by removing newlines and any trailing characters
goodCats = [cat.strip() for cat in categories]
# print("First 10 cleaned categories:")




# Fetch articles using the cleaned categories
titles = fetch_articles_from_categories(goodCats)
print(len(titles))


Processing categories: 100%|██████████| 3175/3175 [07:38<00:00,  6.92it/s]


Total unique articles found: 67297
67297





In [38]:
with open('hdfs_data/titlesUkr.json', 'w') as f:
    json.dump(list(set(titles)), f, indent=2)  # Convert set to list and add indentation for readability

In [41]:
fetch_and_save_articles(titles, "hdfs_data/ukr_articles.jsonl", batch_mode=True)

Fetching batch 1/1346...
Fetching batch 2/1346...
Fetching batch 3/1346...
Fetching batch 4/1346...
Fetching batch 5/1346...
Fetching batch 6/1346...
Fetching batch 7/1346...
Fetching batch 8/1346...
Fetching batch 9/1346...
Fetching batch 10/1346...
Fetching batch 11/1346...
Fetching batch 12/1346...
Fetching batch 13/1346...
Fetching batch 14/1346...
Fetching batch 15/1346...
Fetching batch 16/1346...
Fetching batch 17/1346...
Fetching batch 18/1346...
Fetching batch 19/1346...
Fetching batch 20/1346...
Fetching batch 21/1346...
Fetching batch 22/1346...
Fetching batch 23/1346...
Fetching batch 24/1346...
Fetching batch 25/1346...
Fetching batch 26/1346...
Fetching batch 27/1346...
Fetching batch 28/1346...
Fetching batch 29/1346...
Fetching batch 30/1346...
Fetching batch 31/1346...
Fetching batch 32/1346...
Fetching batch 33/1346...
Fetching batch 34/1346...
Fetching batch 35/1346...
Fetching batch 36/1346...
Fetching batch 37/1346...
Fetching batch 38/1346...
Fetching batch 39/134