# TODO: Recompile the specifically preprocessed datasets in another subfolder


# Supervised Machine Learning - specific Preprocessing
This notebook contains code to preprocess the articles in a way specifically tailored to supervised machine learning.
Since the SML models in this project use term frequency–inverse document frequency (TF-IDF) features, we reduce the words to their lexical root form (via lemmatization) and remove unwanted noise.

In [1]:
from src import data
import polars as pl
import nltk

# Location of the input datasets (the '03_pubmed' stage of the data folder).
data_directory = '../../../data/datasets/03_pubmed'
# Load everything found in that folder into a dict. Judging by how `datasets`
# is used further down, keys are subject names and values are polars
# DataFrames -- TODO confirm against src.data.dict_from_directory.
datasets = data.dict_from_directory(data_directory, type='polars')

In [2]:
from bs4 import BeautifulSoup

def remove_html(text: str) -> str:
    """Strip HTML markup from a string, keeping only its text content.

    Args:
    text: str: a string that may contain html tags

    Returns:
    str: the text of the parsed document with all tags dropped
    """
    parsed_document = BeautifulSoup(text, 'html.parser')
    return parsed_document.get_text()

In [3]:
import re

def remove_special_characters(text: str, digits_also: bool = True,) -> str:
    """Replace everything that is not an ASCII letter (or space) by a space.

    Runs of whitespace in the result are collapsed to single spaces and
    leading/trailing whitespace is dropped.

    Args:
    text: str: a string containing special characters
    digits_also: bool: if True (default) digits are removed as well;
        if False digits 0-9 are kept

    Returns:
    str: a string without special characters
    """
    if digits_also:
        allowed_only = '[^ A-Za-z]+'
    else:
        allowed_only = '[^ A-Za-z0-9]+'

    cleaned = re.sub(allowed_only, ' ', text)

    # split() without arguments already discards all surrounding whitespace.
    return ' '.join(cleaned.split())

In [4]:
# Quick sanity check of remove_special_characters: punctuation becomes a
# single space. The sample is deliberately NOT called `input`, which would
# shadow the builtin input() for every later cell in the session.
sample_text = 'This! is. a, test?Next word'

remove_special_characters(sample_text)

'This is a test Next word'

In [5]:
from autocorrect import Speller

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_speller():
    """Build the English autocorrect speller once and reuse it on later calls."""
    return Speller(lang='en')


def correct_spelling(text:str) -> str:
    """Correct the spelling of a string

    The speller is created on first use and shared by all later calls, so
    applying this function to every row of a column does not re-create it
    for each string.

    Args:
    text: str: a string containing misspelled words

    Returns:
    str: a string with corrected spelling
    """
    return _english_speller()(text)

In [6]:
from nltk.stem import PorterStemmer

def stemm_text(text:str) -> str:
    """Reduce every whitespace-separated word of a text to its Porter stem.

    Args:
    text: str: a string to stem

    Returns:
    str: the stemmed words joined by single spaces
    """
    porter = PorterStemmer()
    stems = map(porter.stem, text.split())
    return ' '.join(stems)

In [7]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Args:
        treebank_tag: a POS tag string such as the ones returned by nltk.pos_tag

    Returns:
        wordnet.ADJ / VERB / NOUN / ADV for tags starting with J / V / N / R,
        otherwise None
    """
    # First letter of the tag -> name of the constant on `wordnet`. The
    # attribute is looked up only for the matching prefix.
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatize_text(input: str) -> str:
    """Lemmatize a text using part-of-speech information.

    The text is tokenized and POS-tagged; each token is lemmatized with the
    WordNet POS derived from its tag, or with the lemmatizer's default when
    the tag has no WordNet counterpart.

    Args:
        input: str: a string to lemmatize

    Returns:
        str: the lemmatized tokens joined by single spaces
    """
    lemmatizer = WordNetLemmatizer()

    def lemma_of(token, tag):
        # Only pass `pos` when the tag maps to a WordNet category.
        pos = get_wordnet_pos(tag)
        if not pos:
            return lemmatizer.lemmatize(token)
        return lemmatizer.lemmatize(token, pos=pos)

    tagged_tokens = nltk.pos_tag(word_tokenize(input))
    return ' '.join(lemma_of(token, tag) for token, tag in tagged_tokens)


In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatize_wo_pos(input: str) -> str:
    """Lemmatize a text without part-of-speech tagging.

    Every token is lemmatized with the lemmatizer's default part of speech.

    Args:
        input: str: a string to lemmatize

    Returns:
        str: the lemmatized tokens joined by single spaces
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(input)]
    return ' '.join(lemmas)

In [10]:
from nltk.stem import PorterStemmer

def stemm_text(text:str) -> str:
    """Stem each whitespace-separated word of a text with the Porter stemmer.

    NOTE: this cell re-defines stemm_text with the same logic as the
    definition in an earlier cell of this notebook.

    Args:
    text: str: a string to stem

    Returns:
    str: a stemmed string
    """
    stem = PorterStemmer().stem
    return ' '.join(stem(token) for token in text.split())

In [11]:
from num2words import num2words

def digits_to_words(match):
    """
    Spell out the number captured by a regex match as English words.

    A trailing ordinal suffix ("st", "nd", "rd", "th") selects the ordinal
    form, anything else the cardinal form.
    E.g. "2" becomes "two", while "2nd" becomes "second"

    Input: match whose element 0 is the matched text, e.g. "2" or "2nd"
    Output: str
    """
    # Lower-case here so the suffix test does not depend on earlier steps.
    number_text = match[0].lower()
    if number_text.endswith(('st', 'nd', 'rd', 'th')):
        # Drop the two-letter suffix before converting.
        return num2words(number_text[:-2], to='ordinal')
    return num2words(number_text, to='cardinal')

In [12]:
from nltk.corpus import stopwords

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords() -> frozenset:
    """Load the NLTK English stop-word list once and reuse it on later calls."""
    return frozenset(stopwords.words('english'))


def remove_stop_words(text: str) -> str:
    """
    Remove stopwords.

    The text is split on whitespace and every word found in NLTK's English
    stop-word list is dropped; the comparison is case-sensitive. The list is
    loaded on first use only, so calling this once per row does not rebuild
    the set each time.

    Input: str
    Output: str (remaining words joined by single spaces)
    """
    stopwords_set = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stopwords_set)

In [13]:
import nltk
# Fetch the POS-tagger model; the captured cell output shows this is a no-op
# when the package is already up to date. Presumably this is the model behind
# nltk.pos_tag in lemmatize_text -- confirm.
# NOTE(review): no other NLTK resource is downloaded here; the tokenizer,
# WordNet and stop-word data used by the other cells are assumed to be
# installed already -- verify on a fresh machine.
nltk.download('averaged_perceptron_tagger_eng')

import warnings
# Silences every warning from here on, not only those raised by preprocessing.
# NOTE(review): consider narrowing the filter to the specific warnings meant.
warnings.filterwarnings("ignore")

# A run of digits, optionally followed by an English ordinal suffix that ends
# the token ("2", "2nd", "13th"). The suffix is one alternative only, so
# inputs like "1stnd" are no longer swallowed as a single malformed match.
_NUMBER_PATTERN = re.compile(r'\d+(?:(?:st|nd|rd|th)\b)?')


def _spell_out_numbers(text: str) -> str:
    """Replace every number in the text by its English words via digits_to_words."""
    return _NUMBER_PATTERN.sub(digits_to_words, text)


def _preprocess_text_column(column_name: str) -> pl.Expr:
    """Build the shared preprocessing expression for one text column."""
    return (
        pl.col(column_name)
        .str.to_lowercase()  # Vectorized operation
        .map_elements(remove_html, return_dtype=pl.String)  # Custom function for complex logic
        .map_elements(lemmatize_text, return_dtype=pl.String)
        # Numbers must be spelled out BEFORE special characters are removed:
        # remove_special_characters strips digits by default, so running it
        # first would leave nothing for this step to convert. Running it
        # afterwards also cleans up any punctuation in the spelled-out words.
        .map_elements(_spell_out_numbers, return_dtype=pl.String)
        .map_elements(remove_special_characters, return_dtype=pl.String)
        .map_elements(remove_stop_words, return_dtype=pl.String)
    )


def preprocess_dataframe(dataframe: pl.DataFrame) -> pl.DataFrame:
    """
    Preprocess the 'title' and 'abstract' columns of a dataframe.

    Both columns go through the same steps: lowercasing, removing html tags,
    lemmatizing (using part-of-speech-tags), spelling out numbers as English
    words, removing special characters, and removing stopwords.
    Spelling correction is not part of this pipeline.

    Args:
    dataframe: pl.DataFrame: a dataframe with columns 'title' and 'abstract' containing text to preprocess

    Returns:
    pl.DataFrame: a dataframe with  preprocessed text
    """
    return dataframe.with_columns(
        _preprocess_text_column("title"),
        _preprocess_text_column('abstract'),
    )

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\root\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [16]:
from pathlib import Path

from tqdm import tqdm

data_directory_preprocessing = '../../../data/datasets/04_preprocessed/supervised'

# Create the output folder up front so that a missing directory cannot abort
# the run only after a dataset has been through the slow preprocessing step.
Path(data_directory_preprocessing).mkdir(parents=True, exist_ok=True)

print('Preprocessing datasets...')
for subject, dataset in tqdm(datasets.items(), total=len(datasets)):
    # Replace the raw dataset in place (existing key, so the dict size is
    # unchanged during iteration) and export the result as CSV.
    preprocessed = preprocess_dataframe(dataset)
    datasets[subject] = preprocessed

    preprocessed.write_csv(f'{data_directory_preprocessing}/{subject}_preprocessed_sml.csv')
print('Done!')

Preprocessing datasets...


100%|██████████| 6/6 [05:49<00:00, 58.33s/it] 

Done!



