# General Preprocessing
While some preprocessing steps are specific to supervised machine learning models, others apply to all models.

Such general preprocessing happens in two steps.

The first step is to remove whole articles that do not contribute to the classification task by:

- Removing articles which appear more than once in one dataset (duplicates)
- Removing non-English abstracts
- Removing long abstracts which contain more than 489 words
- Removing articles without abstracts

The second step is to remove noise from the texts of the remaining articles, namely by:

- Removing HTML
- Removing non-letter symbols, such as digits and special characters

# Preprocessing
## Preparation
### Imports

In [1]:
from src import data # helper functions
import polars as pl # dataframes
import re # regular expressions

### Datasets

In [2]:
# Directory the datasets are read from
directory = '../../../../data/datasets/03_pubmed'

# Directory the preprocessed datasets are written to
output_directory = '../../../../data/datasets/04_preprocessed'

# Read all datasets into a dict keyed by subject
# (presumably the helper adds the 'index' column used below - verify in src.data)
datasets = data.dict_from_directory(directory, type='polars', with_index='index')

# Handles for the individual datasets
(
    adhd,
    animal_depression,
    atypical_antipsychotics,
    calcium_channel_blockers,
    oral_hypoglycemics,
    pancreatic_surgery,
) = (
    datasets[name]
    for name in (
        'adhd',
        'animal_depression',
        'atypical_antipsychotics',
        'calcium_channel_blockers',
        'oral_hypoglycemics',
        'pancreatic_surgery',
    )
)

# Stack all datasets into one frame; the 'dataset' column records the origin of each row
all_datasets = pl.DataFrame()

for subject, dataset in datasets.items():
    all_datasets = all_datasets.vstack(
        dataset.with_columns(dataset=pl.lit(subject))
    )

## Removal of Abstracts and Articles
Remove duplicate articles, non-English abstracts, and abstracts with more than 489 words.

Afterwards, remove articles without an abstract, as those do not contain sufficient information to decide on inclusion or exclusion in title/abstract-screening.

### Duplicates

In [5]:
# Identifier columns that contain duplicates, known from manual inspection of the data
known_duplicates = {
    'animal_depression': ['doi', 'openalex_id', 'pubmed_id'],
    'pancreatic_surgery': ['doi', 'pubmed_id', 'webofscience_id']
}

for subject, columns in known_duplicates.items():
    # nothing to do for a subject that was not loaded
    if subject not in datasets:
        continue

    for column in columns:
        has_value = pl.col(column).is_not_null()

        # rows without an identifier cannot be compared - keep all of them
        null = datasets[subject].filter(~has_value)

        # Keep rows whose identifier occurs exactly once. Nulls are excluded
        # explicitly: a single null counts as unique and would otherwise be kept
        # here AND be re-added from `null` below, duplicating that row.
        # NOTE(review): is_unique() drops EVERY copy of a repeated identifier.
        # If one copy should survive, is_first_distinct() is the tool - confirm intent.
        unique = datasets[subject].filter(
            has_value & pl.col(column).is_unique()
        )

        # override the dataset with the de-duplicated rows followed by the null rows
        datasets[subject] = unique.vstack(null)


### Non-English Abstracts
Hard-code the manually determined indices of the English and non-English abstracts:

In [6]:
# Values of the 'index' column, per dataset, of abstracts that are in fact English;
# the next step sets their language code to 'en'
actually_english = {
    'adhd': [521],
    'animal_depression': [16, 743, 1528],
    'pancreatic_surgery': [15500, 24419, 24904, 24951, 29967]
}

# Values of the 'index' column, per dataset, of abstracts that are not English;
# the next step sets these abstracts to null
not_english = {
    'adhd': [667,759,785,801],
    'animal_depression': [266, 421, 675, 848, 968, 1162, 1250, 1521, 1644, 1919, 1947],
    'pancreatic_surgery': [200, 17096, 17944]
}

Replace the language code of the abstracts that are actually English with 'en'. Then, remove the abstracts at the manually identified non-English indices by setting them to null.

In [7]:
# Apply the manual language corrections. The two lookups are handled
# independently, so a subject listed in only one of them is still processed
# (previously `not_english` was only consulted for keys of `actually_english`).
for subject in list(datasets):
    english_indices = actually_english.get(subject)
    foreign_indices = not_english.get(subject)
    dataset = datasets[subject]

    # set the language code of the abstracts that are actually English to 'en'
    if english_indices:
        dataset = dataset.with_columns(
            pl.when(pl.col('index').is_in(english_indices))
            .then(pl.lit('en'))
            .otherwise(pl.col('language_abstract'))
            .alias('language_abstract')
        )

    # null the non-English abstracts; the rows themselves stay in the dataset
    if foreign_indices:
        dataset = dataset.with_columns(
            pl.when(pl.col('index').is_in(foreign_indices))
            .then(pl.lit(None))
            .otherwise(pl.col('abstract'))
            .alias('abstract')
        )

    datasets[subject] = dataset

Verify that non-English abstracts were indeed removed:

In [None]:
# Show index, abstract and language code of every row in the three corrected
# datasets whose language code is not 'en'
pl.concat(
    [
        datasets[name]
        for name in ('adhd', 'animal_depression', 'pancreatic_surgery')
    ]
).filter(
    pl.col('language_abstract') != 'en'
).select('index', 'abstract', 'language_abstract')

### Long Abstracts
Remove abstracts that have more words than the limit of 489 words:

In [9]:
# Maximum number of words an abstract may have
WORD_LIMIT = 489

for subject in list(datasets):
    # Null every abstract above the word limit. The condition is evaluated on
    # the word-count column directly, so no intermediate list of indices is
    # needed and the result does not depend on 'index' being unique.
    datasets[subject] = datasets[subject].with_columns(
        pl.when(pl.col('abstract_word_count') > WORD_LIMIT)
        .then(pl.lit(None))
        .otherwise(pl.col('abstract'))
        .alias('abstract')
    )

## Missing Abstracts
Remove articles without an abstract as they do not contain sufficient information to decide on inclusion or exclusion:

In [10]:
# number of rows per dataset before articles with empty abstracts are removed
lengths_before = [dataset.height for dataset in datasets.values()]

# bookkeeping table: one row per dataset together with its initial length
documentation = pl.DataFrame(
    {
        'dataset': datasets.keys(),
        'length_before': lengths_before,
    }
)

In [11]:
# keep only the articles that still have an abstract
for subject in list(datasets):
    datasets[subject] = datasets[subject].drop_nulls(subset='abstract')

Check how many articles were removed due to missing abstracts:

In [None]:
# number of rows per dataset after articles with empty abstracts were removed
lengths_after = [dataset.height for dataset in datasets.values()]

# Removed articles per dataset, as a count and as a share of the initial length
# (the share is a fraction between 0 and 1), sorted ascending by that share
documentation.with_columns(
    pl.Series("length_after", lengths_after)
).with_columns(
    abstracts_removed=pl.col('length_before') - pl.col('length_after')
).with_columns(
    percentage_removed=pl.col('abstracts_removed') / pl.col('length_before')
).sort('percentage_removed')

## Noise Removal
### HTML
HTML tags cause noise within the texts:

In [13]:
# two sample abstracts (row positions 456 and 565), taken before HTML is removed
before = datasets['adhd'].to_pandas()['abstract'].iloc[[456, 565]].values

Define a function to automatically remove HTML from the text:

In [None]:
from bs4 import BeautifulSoup

def remove_html(text: str) -> str:
    """Strip HTML markup from a string

    Args:
    text: str: a string that may contain html tags

    Returns:
    str: the text content of the string, without the html tags
    """
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

Remove HTML from all datasets:

In [16]:
import polars as pl
import warnings

# BeautifulSoup warns about titles that look like filenames;
# ignore warnings of category UserWarning from here on
warnings.filterwarnings(action='ignore', category=UserWarning)

# strip HTML from the titles and abstracts of every dataset
for subject in list(datasets):
    datasets[subject] = datasets[subject].with_columns(
        pl.col('title', 'abstract').map_elements(
            remove_html,
            return_dtype=pl.String
        )
    )

Verify that HTML was indeed removed:

In [None]:
# the same two sample abstracts, taken after HTML was removed
after = datasets['adhd'].to_pandas()['abstract'].iloc[[456, 565]].values

# print the first 100 characters of each sample, before and after
print('With HTML:', end='\n\n')
for abstract in before:
    print(abstract[:100])

print('\n\nWithout HTML:', end='\n\n')
for abstract in after:
    print(abstract[:100])

### Characters Only
Remove all digits and special characters using a regular expression:

In [18]:
import re

# matches every run of characters that are neither ASCII letters nor whitespace
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]+')


def remove_special_characters(text: str) -> str:
    """Remove digits and special characters from a string

    Everything except the ASCII letters a-z / A-Z and whitespace is deleted
    (accented letters are deleted as well). Afterwards every run of
    whitespace - spaces, tabs, newlines, carriage returns - is collapsed
    into a single space and leading/trailing whitespace is stripped.

    Args:
    text: str: a string containing special characters

    Returns:
    str: a string of letters separated by single spaces
    """
    # Delete the unwanted characters. Whitespace is kept, so line breaks
    # (including a lone carriage return) still separate words afterwards.
    letters_only = _NON_LETTER_PATTERN.sub('', text)

    # split() treats any run of whitespace as one separator
    return ' '.join(letters_only.split())

Validate that the function works as expected:

In [None]:
# sample with punctuation, digits, repeated spaces, a newline and a carriage return;
# the expected result is 'This is A tst nd t wrks'
test = 'This,  i2s A  t3st!\n\r4nd  1t  w0rks.'
remove_special_characters(test)

Apply the function to all titles and abstracts to keep characters only:

In [20]:
# reduce the titles and abstracts of every dataset to letters and single spaces
for subject in list(datasets):
    datasets[subject] = datasets[subject].with_columns(
        pl.col('title', 'abstract').map_elements(
            remove_special_characters,
            return_dtype=pl.String
        )
    )

# Export
Export the preprocessed data for classification:

In [21]:
from pathlib import Path

# make sure the output directory exists, otherwise writing the files fails
target_directory = Path(output_directory)
target_directory.mkdir(parents=True, exist_ok=True)

# write one CSV file per dataset
for subject, dataset in datasets.items():
    dataset.write_csv(target_directory / f'{subject}_preprocessed.csv')