In [1]:
import pandas as pd
import numpy as np

import requests
import re
import os
import time

In [2]:
# Wikipedia quality-category prefix -> short assessment class label.
# Keys are interpolated into "Category:{key}_articles" when listing members;
# values are compared against the `class` field of the page assessments.
ratings = dict([
    ("Featured", "FA"),
    ("Good", "GA"),
    ("B-Class", "B"),
    ("C-Class", "C"),
    ("Start-Class", "Start"),
    ("Stub-Class", "Stub"),
])

In [3]:
def fetch_articles(category, pagination, timeout=30):
    """
    Fetch one page of members of a Wikipedia quality category.

    category: category prefix; the API is queried for "Category:{category}_articles"
    pagination: the `cmcontinue` token returned by the previous call,
                or a falsy value ('' / None) to start from the first page
    timeout: seconds to wait for the HTTP response before giving up

    Returns a tuple (articles, next_token): `articles` is the `categorymembers`
    list from the API response and `next_token` is the `cmcontinue` value for
    the following page, or '' when the response carries no continuation.

    Raises requests.HTTPError on a non-2xx response and requests.Timeout when
    the server does not answer within `timeout` seconds.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}_articles",
        "cmlimit": "max"
    }

    if pagination:
        params['cmcontinue'] = pagination

    # Without a timeout a stalled connection would block the whole scrape forever.
    http_response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    http_response.raise_for_status()
    response = http_response.json()

    articles = response['query']['categorymembers']
    # The 'continue' object is only present while more pages remain.
    next_token = response.get('continue', {}).get('cmcontinue', '')
    return articles, next_token

In [4]:
def special_handle_articles(article_list, rating):
    """
    Build records for pages that never received an assessment from the API.

    Every such page inherits `rating` as its rate and gets importance 'Unknown'.

    article_list: page dicts (API JSON) that came back without a rating
    rating: rating of the category these pages were listed under

    Returns a list of dicts with keys 'title', 'text', 'rate', 'importance'.
    Pages with an empty title or empty text are left out, and nothing is kept
    when `rating` itself is empty.
    """
    fallback_importance = "Unknown"  # no assessment available, so importance is unknown
    records = []

    for page in article_list:
        page_title = page.get('title', "")
        page_text = page['revisions'][0]['*'] if 'revisions' in page else ""

        # keep only pages for which every piece of information is present
        if not (page_title and page_text and rating and fallback_importance):
            continue

        records.append({'title': page_title,
                        'text': page_text,
                        'rate': rating,
                        'importance': fallback_importance})

    return records

In [5]:
def process_article_details(page, rating):
    """
    Turn the API JSON of a single page into an article record.

    page: json for one page returned from the API
    rating: rating of the category the article was listed under

    The importance is chosen, in order of preference, from:
      1. the first assessment whose class equals `rating` and that has an
         importance (the search stops there);
      2. otherwise the last assessment that has both a class and an importance;
      3. otherwise 'Unknown', provided at least one assessment has a class.
    Whenever one of these applies the record's rate is `rating`.

    Returns a dict with keys 'title', 'text', 'rate', 'importance', or ""
    when the title, the text or the assessment information is missing.
    """
    title = page.get('title', "")
    text = page['revisions'][0]['*'] if 'revisions' in page else ""

    # list of assessments/ratings
    assessment_list = list(page['pageassessments'].values()) if 'pageassessments' in page else []

    rate = ""
    importance = ""
    for assessment in assessment_list:
        assessed_class = assessment['class']
        assessed_importance = assessment['importance']

        if assessed_class == rating and assessed_importance:
            # exact match on the rating: best possible source, stop looking
            rate = assessed_class
            importance = assessed_importance
            break
        elif assessed_class and assessed_importance:
            # a different class, but it carries a usable importance
            rate = rating
            importance = assessed_importance
        elif assessed_class:
            rate = rating
            # Fall back to 'Unknown' only when no importance has been found yet;
            # otherwise an assessment lacking an importance would erase a real
            # value taken from an earlier assessment.
            if not importance:
                importance = "Unknown"

    if title and text and rate and importance:  # all information is complete
        article_data = {'title': title,
                        'text': text,
                        'rate': rate,
                        'importance': importance}
    else:
        article_data = ""

    return article_data

In [6]:
def handle_batch_articles(batch_article_list, rating):
    """
    Sort the pages of one API batch into finished records and unrated pages.

    batch_article_list: the `pages` mapping (page id -> page json) from the API
    rating: rating of these articles

    Returns a tuple (records, unrated_pages): `records` holds the non-empty
    results of process_article_details for pages that carry a
    'pageassessments' entry; `unrated_pages` holds the raw page dicts for
    which the API returned no 'pageassessments' at all.
    """
    records = []
    unrated_pages = []

    for page in batch_article_list.values():  # each value describes one article
        if "pageassessments" in page:
            record = process_article_details(page, rating)
            if record:  # incomplete articles come back as "" and are dropped
                records.append(record)
            continue

        # API didn't return a rating for this page
        unrated_pages.append(page)

    return records, unrated_pages

In [7]:
def fetch_batch_article_details(titles, rating, batch_count = 4, second = False, timeout = 30):
    """
    Fetch wikitext and assessments for a list of titles, a few titles per request.

    titles: list of article titles
    rating: rating of these articles
    batch_count: number of titles sent in one API request (per the notebook's
                 original notes, 4 is the largest batch for which the API
                 returns complete information)
    second: True when this is the retry pass. Pages that still come back
            without assessments are then handed to special_handle_articles
            instead of being retried once more.
    timeout: seconds to wait for each HTTP response

    Returns a list of article records (dicts with 'title', 'text', 'rate',
    'importance').

    Raises requests.HTTPError on a non-2xx response and requests.Timeout when
    the server does not answer within `timeout` seconds.
    """

    def batches(titles, n = 4):
        """Yield consecutive slices of `titles`, each holding at most `n` items."""
        for i in range(0, len(titles), n):
            yield titles[i:i+n]

    url = "https://en.wikipedia.org/w/api.php"
    batch_article_list = [] # list of articles with complete information
    further_handle_article_list = [] # titles that need a second, smaller-batch request

    for batch_titles in batches(titles, batch_count):
        params = {
            "action": "query",
            "format": "json",
            "prop": "revisions|pageassessments",
            "rvprop": "content",
            "titles": "|".join(map(str, batch_titles)),
        }

        # A timeout keeps one stalled request from blocking the whole scrape.
        http_response = requests.get(url, params=params, timeout=timeout)
        http_response.raise_for_status()
        response = http_response.json()

        # Remove error entries from the API. Titles the API cannot resolve are
        # keyed by negative ids ("-1", "-2", ...), one per title, so all of them
        # are dropped here; they carry no content and retrying them is pointless.
        pages = {page_id: page_info
                 for page_id, page_info in response['query']['pages'].items()
                 if not str(page_id).startswith("-")}

        complete_article_list, no_assessment_article_list = handle_batch_articles(pages, rating)

        if second:
            # the retry still returned no rating: special handle these articles
            batch_article_list.extend(special_handle_articles(no_assessment_article_list, rating))
            no_assessment_article_list = []
        else:
            # first pass: remember only the titles so they can be requested again
            no_assessment_article_list = [page_info['title'] for page_info in no_assessment_article_list]

        batch_article_list.extend(complete_article_list)
        further_handle_article_list.extend(no_assessment_article_list)

    # Retry the unrated titles once, two per request. The retry pass never
    # schedules another retry, so the recursion is exactly one level deep.
    if len(further_handle_article_list) > 0:
        batch_article_list.extend(
            fetch_batch_article_details(further_handle_article_list, rating, 2, True, timeout)
        )

    return batch_article_list

In [8]:
def create_data(two_class):
    """
    Create the initial dataset by scraping every rating category and save it as CSV.

    two_class: when True, collect 4000 articles for each of FA/GA and 2000 for
               every other rating, and write
               '../Data/initial_dataset_(Balance).csv'; when False, collect
               2000 per rating and write
               '../Data/initial_dataset_6_class_(Balance).csv'.

    The per-rating numbers are minimum targets: collection stops after the
    first page of results that reaches the target, or earlier if the category
    has no further pages.

    Returns the collected articles as a DataFrame (one row per article record).
    """
    complete_article_list = []
    article_count_per_category = 2000

    # define the number of articles to collect based on rating
    for category, rating in ratings.items():
        if two_class:
            if rating in ["FA", "GA"]:
                article_count_per_category = 4000
            else:
                article_count_per_category = 2000

        # Sub-category members look like "Category:<category> <topic> articles";
        # the captured <topic> is used as the article title to fetch.
        category_pattern = re.compile(fr"Category:{re.escape(category)} (.*) articles")

        current_article_count = 0
        pagination = '' # to fetch articles of next page

        print(f"Loading {category} articles .......")
        while current_article_count < article_count_per_category:
            title_list = []
            articles_per_fetch, pagination = fetch_articles(category, pagination)

            # Handle article title:
            # [B, C, Start, Stub] articles may start with some prefix
            for article in articles_per_fetch:
                member_title = article['title']
                if member_title.startswith(f"Category:{category}"):
                    topic = category_pattern.search(member_title)
                    # Category titles that do not follow the expected shape are
                    # skipped instead of raising IndexError.
                    if topic:
                        title_list.append(topic.group(1))
                elif member_title.startswith("Talk:"):
                    # NOTE(review): this branch used `other_pattern`, which is not
                    # defined in the visible cells (NameError). Stripping the
                    # "Talk:" prefix is assumed to be the intent - confirm.
                    title_list.append(member_title[len("Talk:"):])
                else:
                    title_list.append(member_title)

            batch_article_details = fetch_batch_article_details(title_list, rating)
            current_article_count += len(batch_article_details)
            complete_article_list.extend(batch_article_details)
            print(f"Current Articles: {current_article_count}")

            # An empty token means the category is exhausted. Looping again would
            # restart from the first page and collect duplicates (or never end).
            if not pagination:
                print(f"No more {category} articles available")
                break

    df = pd.DataFrame(complete_article_list)
    if two_class:
        df.to_csv('../Data/initial_dataset_(Balance).csv', index=False)
    else:
        df.to_csv('../Data/initial_dataset_6_class_(Balance).csv', index=False)
    return df

In [9]:
# separate articles by 2 classes [Low, High]
# use to control the number of articles in each rating
two_class = True 

if two_class:
    classes = 2
else:
    classes = 6   

# The cache file must be the one create_data() writes for the selected mode.
# Checking only the two-class name would, in six-class mode, either load the
# wrong dataset or re-scrape on every run.
if two_class:
    dataset_path = '../Data/initial_dataset_(Balance).csv'
else:
    dataset_path = '../Data/initial_dataset_6_class_(Balance).csv'

if not os.path.exists(dataset_path):
    # No cached dataset yet: scrape it (slow) and report how long it took.
    print("Start Scrapping Data from Wikimedia API ....")
    start_time = time.time()
    df = create_data(two_class)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Scrape Data time: {elapsed_time/60:.2f} minutes")
    print("CSV file created")
else:
    # keep_default_na=False keeps strings such as the importance value 'NA'
    # as text instead of converting them to NaN.
    df = pd.read_csv(dataset_path, keep_default_na=False)
    print("CSV file alreday exists")

CSV file alreday exists


# Check column characteristics

## Check Rate Column

In [10]:
# Distinct quality classes present in the dataset, in order of first appearance.
df['rate'].unique().tolist()

['FA', 'GA', 'B', 'C', 'Start', 'Stub']

In [11]:
# Number of articles per quality class, most frequent class first.
(
    df.loc[:, 'rate']
    .value_counts()
    .reset_index()
)

Unnamed: 0,rate,count
0,GA,4499
1,FA,4494
2,C,2411
3,B,2370
4,Start,2304
5,Stub,2000


## Check Importance Column

In [12]:
# Distinct importance labels present in the dataset, in order of first appearance.
df['importance'].unique().tolist()

['Unknown', 'Low', 'NA', 'Mid', 'High', 'Top', 'Bottom', 'Related']

In [13]:
# Number of articles per importance label, most frequent label first.
(
    df.loc[:, 'importance']
    .value_counts()
    .reset_index()
)

Unnamed: 0,importance,count
0,Low,5779
1,,5439
2,Mid,2650
3,Top,2354
4,High,1604
5,Unknown,246
6,Bottom,4
7,Related,2
