In [1]:
import bz2
import os
import random
import re
import shutil
import sys
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Show Wikimedia dumps on different dates

In [2]:
# Fetch the index page of the English Wikipedia dumps; its links are the available dump dates.
base_url = "https://dumps.wikimedia.org/enwiki/"
index_response = requests.get(base_url, timeout=30)
index_response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
index = index_response.text
soup_index = BeautifulSoup(index, "html.parser")

In [3]:
# Collect the target of every hyperlink found on the index page.
dates = []
for anchor in soup_index.find_all('a'):
    if anchor.has_attr('href'):
        dates.append(anchor['href'])
dates

['../',
 '20240320/',
 '20240401/',
 '20240420/',
 '20240501/',
 '20240601/',
 '20240620/',
 '20240701/',
 'latest/']

# Show the dump files available for a given date

In [4]:
# Fetch the file listing of one specific dump date.
dump_url = base_url + "20240620/"
dump_response = requests.get(dump_url, timeout=30)
dump_response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
dump_html = dump_response.text
soup_dump = BeautifulSoup(dump_html, 'html.parser')

In [5]:
# Each <li class="file"> entry reads "<file name> <size> <unit>"; keep the
# 'pages-articles' entries as (file name, [size, unit]) tuples.
listing_entries = (item.text for item in soup_dump.find_all('li', {'class': 'file'}))
files = [
    (tokens[0], tokens[1:])
    for tokens in (entry.split() for entry in listing_entries if 'pages-articles' in entry)
]

files[:5]

[('enwiki-20240620-pages-articles-multistream.xml.bz2', ['22.1', 'GB']),
 ('enwiki-20240620-pages-articles-multistream-index.txt.bz2', ['249.0', 'MB']),
 ('enwiki-20240620-pages-articles-multistream1.xml-p1p41242.bz2',
  ['269.0', 'MB']),
 ('enwiki-20240620-pages-articles-multistream-index1.txt-p1p41242.bz2',
  ['221', 'KB']),
 ('enwiki-20240620-pages-articles-multistream2.xml-p41243p151573.bz2',
  ['358.6', 'MB'])]

In [6]:
# Keep only the partitioned XML dumps (names containing '.xml-p') and drop
# anything whose name mentions 'rss'.
dump_files = []
for name, _size in files:
    if '.xml-p' in name and 'rss' not in name:
        dump_files.append(name)
dump_files[:5]

['enwiki-20240620-pages-articles-multistream1.xml-p1p41242.bz2',
 'enwiki-20240620-pages-articles-multistream2.xml-p41243p151573.bz2',
 'enwiki-20240620-pages-articles-multistream3.xml-p151574p311329.bz2',
 'enwiki-20240620-pages-articles-multistream4.xml-p311330p558391.bz2',
 'enwiki-20240620-pages-articles-multistream5.xml-p558392p958045.bz2']

# Download the dump if it does not already exist

In [7]:
# NOTE: machine-specific location; adjust for your environment. The trailing
# slash matters because paths are built by plain string concatenation.
folder_path = 'D:/Leeds/Dissertation/Data/Wiki Dumps/'

file = dump_files[0] # Only download the first file
file_path = folder_path + file
file_url = dump_url + file

os.makedirs(folder_path, exist_ok=True)

# Ask the server for the expected size so that a partial download can be detected.
response = requests.head(file_url, allow_redirects=True, timeout=30)
response.raise_for_status()
file_size = int(response.headers.get('content-length', 0))
size_known = file_size > 0  # without a Content-Length header completeness cannot be verified

if os.path.exists(file_path):
    if not size_known:
        print(f"{file} already exists (size could not be verified).")
    elif os.path.getsize(file_path) == file_size:
        print(f"{file} is already downloaded and complete.")
    else:
        print(f"{file} is incomplete. Re-downloading...")
        os.remove(file_path)

if not os.path.exists(file_path):
    print('Downloading...')
    # Download to a temporary name and rename only once the size checks out, so
    # an interrupted run never leaves a truncated file under the final name.
    partial_path = file_path + '.part'
    with requests.get(file_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    if not size_known or os.path.getsize(partial_path) == file_size:
        os.replace(partial_path, file_path)
        print(f"{file} downloaded successfully.")
    else:
        os.remove(partial_path)
        print(f"Error: File '{file}' downloaded but is incomplete.")

enwiki-20240620-pages-articles-multistream1.xml-p1p41242.bz2 is already downloaded and complete.


# Extract the downloaded file

In [8]:
output_path = folder_path + file[:-4] # remove the file extension

if not os.path.exists(output_path):
    print('Extracting')
    # Decompress to a temporary name first so that an interrupted run never
    # leaves a truncated file that a later run would treat as already extracted.
    partial_output_path = output_path + '.part'
    with bz2.open(file_path, 'rb') as f_in:
        with open(partial_output_path, 'wb') as f_out:
            # Stream in fixed-size chunks instead of loading the whole
            # decompressed dump into memory at once.
            shutil.copyfileobj(f_in, f_out, length=1024 * 1024)
    os.replace(partial_output_path, output_path)
else:
    print('Files already exists')

Files already exists


# Extract article titles and text from the XML dump

In [9]:
# XML namespace prefix (in "{uri}" form) shared by every element name below.
namespace = '{http://www.mediawiki.org/xml/export-0.11/}'

# Fully-qualified element names / paths, relative to a <page> element.
page_tag = namespace + 'page'
title_tag = namespace + 'title'
# NOTE(review): this path selects the <id> nested inside <revision>, yet
# parse_wikipedia_dump stores it under the key 'pageid' - presumably this is
# the revision id rather than the page id; confirm which one is intended.
id_tag = namespace + 'revision/' + namespace + 'id'
text_tag = namespace + 'revision/' + namespace + 'text'

def parse_wikipedia_dump(dump_file):
    """
    Extract articles in dump file and convert to list of dictionary

    The file is parsed incrementally: each <page> element is handled as soon
    as its closing tag is read and is then released again, so the whole XML
    tree is never kept in memory.

    dump_file: file to be extracted

    Returns a list of dictionaries with the keys 'title', 'pageid' and 'text'.
    Pages lacking any of the three elements are skipped. Values are the raw
    element texts and may be None when an element is empty.
    """
    context = etree.iterparse(dump_file, events=('end',), tag=page_tag)
    articles = []

    for _event, elem in context:
        # Look each child up once and reuse the result.
        title_elem = elem.find(title_tag)
        id_elem = elem.find(id_tag)
        text_elem = elem.find(text_tag)

        if title_elem is not None and id_elem is not None and text_elem is not None:
            articles.append({'title': title_elem.text,
                             'pageid': id_elem.text,
                             'text': text_elem.text})

        # Release the memory held by this page. clear() empties the element
        # itself, but the (now empty) element and its earlier siblings are
        # still attached to the root, so they are removed from the parent too.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    del context
    return articles

In [10]:
# Parse the extracted XML dump into a list of article dictionaries and report how many were found.
articles = parse_wikipedia_dump(output_path)
print(f"{len(articles)}  articles found")

27374  articles found


# Get WikiProject assessments based on article title

In [11]:
def handle_batch_articles(batch_article_list, articles):
    """
    Handle a single batch of articles

    For every page returned by the API, look up the article text by title and
    pick one rating: the first assessment that has both a class and an
    importance wins; otherwise the last assessment that has only a class is
    used and the importance is recorded as 'Unknown'.

    batch_article_list: json return from the API (mapping of page key -> page info)
    articles: list of articles extracted above (dictionaries with 'title' and 'text')

    Returns a tuple (complete_article_list, article_no_rating_list):
    complete_article_list holds dictionaries with 'title', 'text', 'rate' and
    'importance' for pages where all four are non-empty;
    article_no_rating_list holds the titles of pages without any assessment.
    """
    # Index the article text by title once. setdefault keeps the first
    # occurrence of a title, matching a first-match linear search.
    text_by_title = {}
    for article in articles:
        text_by_title.setdefault(article['title'], article['text'])

    article_no_rating_list = []
    complete_article_list = []

    for page_info in batch_article_list.values(): # page_info represent an article
        title = page_info.get('title', "")

        # Look the text up afresh for every page: a title missing from the dump
        # yields None here, so it can never inherit the text of the previous page.
        text = text_by_title.get(title)

        # list of assessments/ratings
        assessment_list = list(page_info.get('pageassessments', {}).values())

        rate = ""
        importance = ""
        if assessment_list:
            for assessment in assessment_list:
                assessment_class = assessment.get('class')
                assessment_importance = assessment.get('importance')
                if assessment_class and assessment_importance: # if rate and importance is not empty
                    rate = assessment_class
                    importance = assessment_importance
                    break
                elif assessment_class: # if only rate is not empty
                    rate = assessment_class
                    importance = "Unknown" # classify articles importance as 'Unknown'
        else:
            article_no_rating_list.append(title)

        if title and text and rate and importance: # if all information is complete
            complete_article_list.append({'title': title,
                                          'text': text,
                                          'rate': rate,
                                          'importance': importance})

    return complete_article_list, article_no_rating_list

In [12]:
def fetch_batch_article_details(articles):
    """
    Query the Wikipedia API for the assessments of all articles, in batches of 4 titles

    The API limits how many assessments it returns per request ('palimit').
    The request below asks for the maximum so that the assessments of later
    titles in a batch are not cut off; the small batch size is kept as is.

    articles: list of articles extracted above

    Returns the list of articles with complete information (title, text, rate
    and importance). The number of articles without any rating is only printed.

    Raises requests.HTTPError on an HTTP error status and RuntimeError when
    the API answers with an error payload.
    """

    def batches(titles, n = 4):
        """
        Separate list of titles to batch of 4

        titles: article titles is split into batches
        """
        for i in range(0, len(titles), n):
            yield titles[i:i+n]

    # variables to show processing progress
    threshold_step = 1000
    next_threshold = threshold_step

    batch_article_list = [] # list of articles with complete information
    batch_article_no_rating_list = [] # list of articles without rating

    titles = [article['title'] for article in articles]

    url = "https://en.wikipedia.org/w/api.php"

    # One session for all requests so the HTTP connection is reused.
    with requests.Session() as session:
        for batch_titles in batches(titles):
            params = {"action": "query",
                      "format": "json",
                      "prop": "pageassessments",
                      # Lift the default cap on assessments returned per request.
                      # NOTE(review): results beyond the maximum would need
                      # 'continue' handling; presumably never reached with 4 titles.
                      "palimit": "max",
                      "titles": "|".join(map(str, batch_titles))}

            http_response = session.get(url, params=params, timeout=30)
            http_response.raise_for_status()
            response = http_response.json()

            # Surface API-level failures explicitly instead of an opaque KeyError.
            if 'error' in response:
                raise RuntimeError(f"MediaWiki API error: {response['error']}")
            pages = response.get('query', {}).get('pages', {})

            complete_article_list, article_no_rating_list = handle_batch_articles(pages, articles)
            batch_article_list.extend(complete_article_list)
            batch_article_no_rating_list.extend(article_no_rating_list)

            # Show progress
            if len(batch_article_list) >= next_threshold:
                next_threshold += threshold_step
                print(f"We are have processed {len(batch_article_list)} articles")
    print(f"Abandoned {len(batch_article_no_rating_list)} articles")

    return batch_article_list

In [13]:
def create_data():
    """
    Build the initial dataset and save it as a CSV file

    Fetches the ratings for the module-level `articles` list, puts the rated
    articles into a DataFrame, writes it to disk and returns it.
    """
    rated_articles = list(fetch_batch_article_details(articles))

    df = pd.DataFrame(rated_articles)
    df.to_csv('../Data/initial_dataset_(Imbalance).csv', index=False)

    return df

In [14]:
# Reuse the cached CSV when it is present; otherwise build the dataset and time how long that takes.
dataset_csv = '../Data/initial_dataset_(Imbalance).csv'

if os.path.exists(dataset_csv):
    # keep_default_na=False reads values such as the literal string 'NA' as
    # text instead of converting them to missing values.
    df = pd.read_csv(dataset_csv, keep_default_na=False)
    print("CSV file alreday exists")
else:
    print("Start Scrapping Data from Wikimedia dumps ....")
    start_time = time.time()
    df = create_data()
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Scrape Data time: {elapsed_time/60:.2f} minutes")
    print("CSV file created")

CSV file alreday exists


# Check column characteristics

## Check Rate Column

In [15]:
# Distinct values of the 'rate' column, in order of first appearance.
df['rate'].unique().tolist()

['Redirect',
 'GA',
 'B',
 'C',
 'List',
 'Start',
 'FA',
 'Disambig',
 'Stub',
 'NA',
 'FL',
 'Portal',
 'list',
 'Project',
 'A']

In [16]:
# Number of articles per 'rate' value, most frequent first, shown as a table.
rate_counts = df['rate'].value_counts()
rate_counts.reset_index()

Unnamed: 0,rate,count
0,C,5373
1,Start,4440
2,B,3819
3,List,2414
4,Stub,737
5,Disambig,683
6,GA,676
7,Redirect,584
8,FA,381
9,,82


## Check Importance Column

In [17]:
# Distinct values of the 'importance' column, in order of first appearance.
df['importance'].unique().tolist()

['NA', 'High', 'Low', 'Top', 'Mid', 'Unknown', 'Bottom']

In [18]:
# Number of articles per 'importance' value, most frequent first, shown as a table.
importance_counts = df['importance'].value_counts()
importance_counts.reset_index()

Unnamed: 0,importance,count
0,Low,6255
1,Mid,4169
2,High,3610
3,Unknown,2319
4,Top,2177
5,,693
6,Bottom,7
