In [1]:
import pandas as pd
import numpy as np
import re
import xml.etree.ElementTree as ET
import mwparserfromhell
from nltk import sent_tokenize

import os
import time

# WikiExtractor

## Fix WikiExtractor file format

In [2]:
def add_root_if_missing(file_path):
    """
    Wrap a WikiExtractor output file in a single <documents> root element.

    The file extracted by WikiExtractor has no root element, so it cannot be
    parsed as XML until one is added. The file is rewritten in place only
    when the root element is missing.

    file_path: the path of WikiExtractor file
    """
    root_open = '<documents>'

    # latin1 maps every byte to exactly one character, so non-ASCII bytes
    # survive the read/write round trip whatever the real encoding is
    with open(file_path, 'r', encoding='latin1') as file:
        content = file.read()

    # Only the start of the file decides whether the root is present: a
    # '<documents>' string that merely appears inside an article body must
    # not be mistaken for the root element.
    if content.lstrip().startswith(root_open):
        print("Root tag already exists")
        return

    # wrap the content with a root element and write it back to the file
    with open(file_path, 'w', encoding='latin1') as file:
        file.write(f"{root_open}\n{content}\n</documents>")

## Get clean text from WikiExtractor file

In [3]:
def read_file_from_wikiextractor(file_path):
    """
    Load the text of every article from a rooted WikiExtractor file.

    file_path: path of a WikiExtractor file that already has a root element

    Returns a dict mapping each <doc> element's title attribute to its
    whitespace-stripped text. A <doc> without text maps to '' and, when a
    title occurs more than once, the last occurrence wins.
    """
    root = ET.parse(file_path).getroot()

    # doc.text is None for an element without text; fall back to '' so a
    # single empty article cannot abort the whole load with AttributeError
    return {
        doc.get('title'): (doc.text or '').strip()
        for doc in root.findall('doc')  # find each article
    }

# Extract Feature Functions

## Get number of references

In [4]:
def number_of_reference(wiki):
    """
    Get number of distinct references in an article.

    Every <ref> tag without a name counts once. <ref> tags that share the
    same name count once in total, however often the name is reused. The
    name is recognised whether its value is double-quoted, single-quoted or
    bare, and regardless of spacing around the '='.

    wiki: wiki object

    Returns the number of distinct references as an int.
    """
    # a whole `name=...` attribute; group 2 is the value without its quotes
    name_attribute = re.compile(r'''\s*name\s*=\s*(["']?)(.*?)\1\s*''', re.IGNORECASE | re.DOTALL)

    # find ref tag
    references = wiki.filter_tags(matches=lambda node: node.tag == "ref")

    seen_names = set()
    unnamed = 0

    for ref in references:
        name = None
        for att in ref.attributes:
            found = name_attribute.fullmatch(str(att))
            if found:
                name = found.group(2).strip()
                break

        if name is None:
            unnamed += 1  # a reference without a name is always unique
        else:
            seen_names.add(name)  # repeated names collapse into one entry

    return unnamed + len(seen_names)

## Get number of external and internal links

In [5]:
def number_of_links(wiki):
    """
    Count the external links and the wikilinks of an article.

    wiki: wiki object

    Returns a tuple (number of external links, number of wikilinks).
    """
    external_count = len(wiki.filter_external_links())
    internal_count = len(wiki.filter_wikilinks())
    return external_count, internal_count

## Get number of tables

In [6]:
def number_of_tables(text):
    """
    Count the tables written in the {| ... |} format.

    text: article text to scan

    Returns the number of non-overlapping spans that open with '{|' and run
    to the nearest following '|}'.
    """
    # DOTALL lets . cross line breaks, so a table may span several lines
    table_span = r'\{\|.*?\|\}'
    return sum(1 for _ in re.finditer(table_span, text, flags=re.DOTALL))

## Get number of formulas

In [7]:
def number_of_formula(wiki):
    """
    Count the <math> tags of an article.

    wiki: wiki object

    Returns the number of tags whose tag name equals "math".
    """
    def is_math_tag(node):
        return node.tag == "math"

    return len(wiki.filter_tags(matches=is_math_tag))

## Get number of images

In [8]:
def number_of_image(text):
    """
    Count the embedded images of an article.

    An image is a link of the form [[File:...]]. The legacy [[Image:...]]
    prefix is counted as well, the prefix is matched case-insensitively, and
    whitespace around the prefix is tolerated.

    text: article text to scan

    Returns the number of image links as an int.
    """
    # non-greedy body: stops at the first ']]', which still yields exactly
    # one match per image even when the caption contains nested links
    pattern = re.compile(r'\[\[\s*(?:File|Image)\s*:(.*?)\]\]', re.IGNORECASE)
    return len(pattern.findall(text))

## Get number of sections

In [9]:
def number_of_section(text, prefix):
    """
    Find the headings delimited by a given marker, e.g. '==' or '==='.

    text: article text to scan
    prefix: the heading marker; it must appear on both sides of the title

    Returns a tuple (number of headings, list of heading titles). Each title
    is reduced to ASCII letters and spaces and then stripped.
    """
    marker = re.escape(prefix)

    # The title may not contain '=' or a line break. Excluding the line break
    # is what keeps the closing marker of one heading from pairing with the
    # opening marker of a heading on a later line.
    # (?<!--) rejects a marker directly preceded by '--'; (?!=) rejects a
    # closing marker that is followed by another '=', i.e. a deeper heading.
    pattern = re.compile(f'(?<!--){marker}([^=\n]+){marker}(?!=)')

    sections = [
        re.sub(r'[^A-Za-z ]', '', title).strip()
        for title in pattern.findall(text)
    ]
    return len(sections), sections

## Get number of paragraphs

In [10]:
def number_of_paragraph(text, total_section):
    """
    Count the paragraphs of a cleaned article, one per non-blank line.

    Lines that are skipped:
      * empty or whitespace-only lines
      * lines that start with a formula placeholder (formula_<number>)
      * lines that equal a section title plus one trailing character
        (presumably the period the extractor appends to headings; confirm
        against the data)

    text: cleaned article text, one paragraph per line
    total_section: iterable of section titles to exclude

    Returns the number of remaining lines as an int.
    """
    formula_line = re.compile(r'formula_\d+')
    headings = set(total_section)  # constant-time membership test

    count = 0
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue  # blank or whitespace-only lines are not paragraphs
        if formula_line.match(line):
            continue
        if stripped[:-1] in headings:
            continue
        count += 1
    return count

## Get number of sentences

In [11]:
def number_of_sentence(text, total_section):
    """
    Count the sentences of a cleaned article.

    Lines are filtered the same way as for the paragraph count: empty lines,
    lines that start with a formula placeholder (formula_<number>) and lines
    that equal a section title plus one trailing character are dropped. The
    remaining lines are joined with single spaces and split into sentences.

    text: cleaned article text, one paragraph per line
    total_section: collection of section titles to exclude

    Returns the number of sentences found by sent_tokenize.
    """
    formula_line = re.compile(r'formula_\d+')

    kept_lines = []
    for line in text.split('\n'):
        if line == "" or formula_line.match(line):
            continue
        if line.strip()[:-1] in total_section:
            continue
        kept_lines.append(line)

    # separate the joined paragraphs by sentence
    return len(sent_tokenize(' '.join(kept_lines)))

In [12]:
def add_features(df, wiki_file='D:/Leeds/Dissertation/Data/Wiki Dumps/articles/AA/wiki_00'):
    """
    Clean Text and add Structure Features

    df: dataframe of articles; the 'title' and 'text' columns are read
    wiki_file: path of the WikiExtractor output file that holds the clean
        text of the articles; defaults to the location used so far

    Returns a new dataframe holding only the rows whose title was found in
    wiki_file, with a 'clean_text' column and one float64 column per
    structure feature. The dataframe passed in is not modified.
    """
    feature_names = [
        'reference', 'external_link', 'internal_link', 'table', 'formula', 'images',
        'section', 'subsection', 'subsubsection', 'paragraph', 'sentence',
    ]

    add_root_if_missing(wiki_file)

    print("Start Getting Clean Text from extracted Wikiextractor file ....")
    start_time = time.time()
    articles = read_file_from_wikiextractor(wiki_file)
    # assign() builds a new frame, so the caller's dataframe stays untouched
    df = df.assign(clean_text=df['title'].map(articles))
    elapsed_time = time.time() - start_time
    print(f"Get Clean Text time: {elapsed_time/60:.2f} minutes")

    # Remove NaN value from clean text
    missing = df['clean_text'].isna()
    print("NaN clean text: ", int(missing.sum()))
    # explicit copy: the columns added below then never write into a slice
    df = df[~missing].copy()
    print("Number of rows: ", df.shape[0])

    print("Start Getting Structure Features ....")
    start_time = time.time()
    rows = []
    for text, clean_text in zip(df['text'], df['clean_text']):
        wiki = mwparserfromhell.parse(text) # convert to wiki object

        ex_links, wiki_links = number_of_links(wiki)

        # Count number of sections, subsections, sub-subsections based on number of =
        section_numb, section = number_of_section(text, '==')
        subsection_numb, subsection = number_of_section(text, '===')
        subsubsection_numb, subsubsection = number_of_section(text, '====')
        total_section = section + subsection + subsubsection

        # one tuple per article, in the order of feature_names
        rows.append((
            number_of_reference(wiki),
            ex_links,
            wiki_links,
            number_of_tables(text),
            number_of_formula(wiki),
            number_of_image(text),
            section_numb,
            subsection_numb,
            subsubsection_numb,
            number_of_paragraph(clean_text, total_section),
            number_of_sentence(clean_text, total_section),
        ))

    # Assign whole columns at once instead of one cell at a time. float64 is
    # the dtype the former cell-by-cell .loc writes produced; the reshape
    # keeps the array two-dimensional when there are no rows at all.
    values = np.array(rows, dtype='float64').reshape(len(rows), len(feature_names))
    for position, name in enumerate(feature_names):
        df[name] = values[:, position]

    elapsed_time = time.time() - start_time
    print(f"Get Structure Features time: {elapsed_time/60:.2f} minutes")

    return df

In [13]:
# Build the feature dataset once and cache it as CSV; later runs reload the
# cached file instead of recomputing every feature.
FEATURES_CSV = '../Data/dataset_text_structure_(Imbalance).csv'

if os.path.exists(FEATURES_CSV):
    df = pd.read_csv(FEATURES_CSV, keep_default_na=False)
    print("CSV file alreday exists")
else:
    df = pd.read_csv('../Data/initial_dataset_(Imbalance).csv', keep_default_na=False)

    # the reported time covers feature extraction and writing the CSV
    start_time = time.time()
    df = add_features(df)
    df.to_csv(FEATURES_CSV, index=False)
    end_time = time.time()

    print("CSV file created")
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time/60:.2f} minutes")

CSV file alreday exists


In [14]:
# Dimensions of the loaded dataset as (rows, columns)
df.shape

(19181, 16)