In [1]:
import pandas as pd
import numpy as np
import re
import xml.etree.ElementTree as ET
import mwparserfromhell

import os
import time

In [2]:
def filter_date(df):
    """
    Remove rows with NaN value in rate, importance or text

    Prints the number of missing values found in each of the three columns,
    then the number of rows left once those rows are dropped.

    df: dataframe of articles (must contain 'rate', 'importance' and 'text')

    return: new dataframe holding only the rows where all three columns are
            present; the dataframe passed in is not modified
    """
    required_columns = ['rate', 'importance', 'text']

    # report the missing values per column before anything is dropped
    for column in required_columns:
        print(f"NaN {column}: ", int(df[column].isna().sum()))

    df = df.dropna(subset=required_columns)
    print(f"Number of rows after drop NaN: {df.shape[0]}")

    # the caller rebinds its dataframe to this result, so it must be returned
    return df

In [3]:
def clean_wikipedia_text(text):
    """
    Clean article text

    Cuts the article at its "See also" heading, removes the top-level
    templates, strips the remaining wiki markup and collapses whitespace.

    text: article text (raw wiki markup)

    return: cleaned plain text as a single line
    """

    # remove anything after the "See also" heading; the heading is matched
    # with or without inner spaces and in any letter case
    # ("==See also==", "== See also ==", "== See Also ==")
    see_also = re.search(r"={2,}[ \t]*See also[ \t]*={2,}", text, flags=re.IGNORECASE)
    if see_also:
        text = text[:see_also.start()]

    wikicode = mwparserfromhell.parse(text) # convert to wiki object

    # remove templates (only the top-level ones, nested ones go with their parent)
    for template in wikicode.filter_templates(recursive=False):
        wikicode.remove(template)

    # convert to plain text
    plain_text = wikicode.strip_code()

    # drop "thumb|..." fragments up to the next full stop
    # (presumably image-caption residue left behind by strip_code - TODO confirm)
    plain_text = re.sub(r'thumb\|.*?\.', '', plain_text)
    # turn an escaped possessive (\'s) back into a plain 's
    plain_text = re.sub(r"\\('s)", r"\1", str(plain_text))

    # normalize whitespace
    plain_text = re.sub(r'\s+', ' ', plain_text).strip()

    return plain_text

# Feature Extraction Functions

## Get number of references

In [4]:
def number_of_reference(text):
    """
    Get number of reference in each articles

    Two citation styles are supported:
    1. Short footnotes ({{Sfn|...}} / {{Sfnm|...}}): when at least one is
       present, the number of distinct footnote strings is returned and
       <ref> tags are not inspected.
    2. <ref> tags: every unnamed tag counts once, and tags sharing the same
       name="..." attribute count once together.

    text: article text (raw wiki markup)

    return: number of unique references
    """

    # find reference in this format {{Sfn|...}} or {{Sfnm|...}}; on Wikipedia
    # the first letter of a template name is case-insensitive, so the
    # lower-case spelling {{sfn|...}} has to be accepted as well
    short_footnotes = set(re.findall(r"\{\{[Ss]fn(?:\||m\|)[^}]*\}\}", text))
    if short_footnotes:
        return len(short_footnotes)

    # find reference by using mwparserfromhell
    wiki = mwparserfromhell.parse(text) # convert to wiki object
    references = wiki.filter_tags(matches=lambda node: node.tag == "ref")

    seen_names = set()  # textual form of every name="..." attribute met so far
    unique_count = 0
    for ref in references:
        name_attributes = [att for att in ref.attributes if 'name="' in att]

        if not name_attributes:
            # unnamed references cannot be reused, each one is unique
            unique_count += 1
            continue

        name_key = str(name_attributes[0])
        if name_key not in seen_names: # if reference name is new
            seen_names.add(name_key)
            unique_count += 1

    return unique_count

## Get number of external and internal links

In [5]:
def number_of_links(wiki):
    """
    Count the external and internal links of an article

    wiki: parsed article (object providing filter_external_links() and
          filter_wikilinks())

    return: tuple (number of external links, number of wikilinks)
    """
    external_count = len(wiki.filter_external_links())
    internal_count = len(wiki.filter_wikilinks())

    return external_count, internal_count

## Get number of tables

In [6]:
def number_of_tables(wiki):
    """
    Count the table tags of an article

    wiki: parsed article (object providing filter_tags(matches=...))

    return: number of tags whose tag name equals "table"
    """
    def is_table(node):
        return node.tag == "table"

    return len(wiki.filter_tags(matches=is_table))

## Get number of formulas

In [7]:
def number_of_formula(wiki):
    """
    Count the math tags (formulas) of an article

    wiki: parsed article (object providing filter_tags(matches=...))

    return: number of tags whose tag name equals "math"
    """
    def is_math(node):
        return node.tag == "math"

    return len(wiki.filter_tags(matches=is_math))

## Get number of images

In [8]:
def number_of_image(wiki):
    """
    Count the images embedded in an article through wikilinks

    A wikilink is an image when its title starts with the "File:" namespace
    prefix or with its "Image:" alias. The prefix is compared without regard
    to letter case or leading whitespace, so [[file:...]] and [[Image:...]]
    are counted like [[File:...]].

    wiki: parsed article (object providing filter_wikilinks(matches=...))

    return: number of image wikilinks
    """
    image_prefixes = ("file:", "image:")

    def is_image(link):
        # str() so the check works on a plain string as well as on a parsed title
        return str(link.title).strip().lower().startswith(image_prefixes)

    return len(wiki.filter_wikilinks(matches=is_image))

## Get number of paragraphs

In [9]:
def number_of_paragraph(text):
    """
    Get number of paragraphs in an article

    Paragraphs are the non-blank blocks of text separated by one or more
    blank lines. A run of several blank lines is a single separator, and
    leading or trailing blank lines do not add paragraphs.

    text: article text

    return: number of paragraphs (0 for empty or whitespace-only text)
    """
    blocks = re.split(r'\n\s*\n', text)
    return sum(1 for block in blocks if block.strip())

## Get number of sentences

In [10]:
def number_of_sentence(text):
    """
    Get number of sentences in an article

    The text is split on runs of sentence terminators (. ! ?) and only the
    fragments that contain text are counted. The empty fragment after the
    final terminator, and repeated terminators such as "..." or "?!", do not
    add to the count.

    text: article text

    return: number of sentences (0 for empty or whitespace-only text)
    """
    fragments = re.split(r'[.!?]+', text)
    return sum(1 for fragment in fragments if fragment.strip())

In [11]:
# Structure feature columns, in the order they are added to the dataframe
_STRUCTURE_COLUMNS = (
    'reference', 'external_link', 'internal_link', 'table', 'formula',
    'images', 'section', 'subsection', 'subsubsection', 'paragraph', 'sentence',
)


def _structure_features(text):
    """Return the structure counts of one raw article text, in _STRUCTURE_COLUMNS order."""
    wiki = mwparserfromhell.parse(text) # convert to wiki object

    ex_links, wiki_links = number_of_links(wiki)

    # heading level 2 = section, 3 = subsection, 4 = sub-subsection
    heading_levels = [heading.level for heading in wiki.filter_headings()]

    return (
        number_of_reference(text),
        ex_links,
        wiki_links,
        number_of_tables(wiki),
        number_of_formula(wiki),
        number_of_image(wiki),
        heading_levels.count(2),
        heading_levels.count(3),
        heading_levels.count(4),
        number_of_paragraph(text),
        number_of_sentence(text),
    )


def add_features(df):
    """
    Clean Text and add Structure Features

    Adds a 'clean_text' column (see clean_wikipedia_text), drops the rows
    whose clean text is missing, then adds one count column per structure
    feature, all computed from the raw 'text' column: reference,
    external_link, internal_link, table, formula, images, section,
    subsection, subsubsection, paragraph, sentence.

    The work is done on a copy, so the dataframe passed in is left unchanged.
    Each column is built in one pass and assigned as a whole, which stores
    the counts as integers and avoids slow row-by-row writes into a filtered
    dataframe.

    df: dataframe of articles (must contain a 'text' column)

    return: new dataframe with the clean text and structure feature columns
    """
    df = df.copy()

    print("Start Cleaning Text ....")
    start_time = time.time()
    df['clean_text'] = [clean_wikipedia_text(text) for text in df['text']]
    elapsed_time = time.time() - start_time
    print(f"Clean Text time: {elapsed_time/60:.2f} minutes")

    # Remove NaN value from clean text
    print("NaN clean text: ", int(df['clean_text'].isna().sum()))
    df = df[df['clean_text'].notna()].copy()  # copy: columns are added below
    print("Number of rows after drop NaN : ", df.shape[0])

    print("Start Getting Structure Features ....")
    start_time = time.time()
    rows = [_structure_features(text) for text in df['text']]
    # assign column by column from plain lists: positional, so it does not
    # depend on the index, and every column exists even when there are no rows
    for position, column in enumerate(_STRUCTURE_COLUMNS):
        df[column] = [row[position] for row in rows]
    elapsed_time = time.time() - start_time
    print(f"Get Structure Features time: {elapsed_time/60:.2f} minutes")

    return df

In [12]:
# Build the text + structure feature dataset once and cache it as a CSV file;
# later runs reload the cached file instead of recomputing it.
input_csv = '../Data/initial_dataset_(Balance).csv'
output_csv = '../Data/dataset_text_structure_(Balance).csv'

if not os.path.exists(output_csv):
    # keep_default_na=False: pandas keeps strings such as "NA" or "" as text
    # instead of converting them to NaN
    df = pd.read_csv(input_csv, keep_default_na=False)

    start_time = time.time()
    df = filter_date(df)
    df = add_features(df)
    df.to_csv(output_csv, index=False)
    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time/60:.2f} minutes")
    print("CSV file created")
else:
    df = pd.read_csv(output_csv, keep_default_na=False)
    print("CSV file alreday exists")

CSV file alreday exists
