# General Preprocessing
This notebook encompasses general preprocessing steps to prepare the texts for classification in all scenarios. 

These are:
- Removing HTML
- Removing non-English abstracts
- Removing abstracts above a common word limit
- Removing duplicate entries
- Removing articles without an abstract
- Removing digits and special characters

# Preprocessing
## Preparation
### Imports

In [1]:
from src import data
import polars as pl
import re

### Datasets

In [2]:
# Directory the input datasets are read from
directory = '../../../../data/datasets/03_pubmed'

# Directory the preprocessed datasets are exported to
output_directory = '../../../../data/datasets/04_preprocessed'

# Read all datasets; the result is used below as a mapping from subject
# name to polars DataFrame
datasets = data.dict_from_directory(directory, type='polars')

# Handles for the individual subjects
adhd = datasets['adhd']
animal_depression = datasets['animal_depression']
atypical_antipsychotics = datasets['atypical_antipsychotics']
calcium_channel_blockers = datasets['calcium_channel_blockers']
oral_hypoglycemics = datasets['oral_hypoglycemics']
pancreatic_surgery = datasets['pancreatic_surgery']

# Stack every subject into one frame, tagging each row with its subject in
# a 'dataset' column. This is built from the data as loaded, before any of
# the preprocessing steps below are applied.
all_datasets = pl.DataFrame()

for subject, dataset in datasets.items():
    tagged = dataset.with_columns(pl.lit(subject).alias('dataset'))
    all_datasets = all_datasets.vstack(tagged)

## Removal of Abstracts and Articles
Remove non-English abstracts and abstracts with more than 489 words.

Afterwards, remove articles without an abstract, as those do not contain sufficient information to decide on inclusion or exclusion in title/abstract-screening.

### Duplicates

In [3]:
# Identifier columns that are known to contain duplicated values, per subject
known_duplicates = {
    'animal_depression': ['doi', 'openalex_id', 'pubmed_id'],
    'pancreatic_surgery': ['doi', 'pubmed_id', 'webofscience_id']
}

for subject, id_columns in known_duplicates.items():
    # only subjects that were actually loaded can be de-duplicated
    if subject not in datasets:
        continue

    for column in id_columns:
        has_identifier = pl.col(column).is_not_null()

        # rows without an identifier cannot be compared and are always kept
        null = datasets[subject].filter(~has_identifier)

        # rows whose identifier occurs exactly once.
        # Nulls are excluded explicitly so that a row can never be selected
        # by both filters: a column holding exactly one null would otherwise
        # pass is_unique() and be appended a second time below.
        # NOTE(review): is_unique() removes *every* row sharing a duplicated
        # identifier, not only the surplus copies - confirm this is intended.
        unique = datasets[subject].filter(
            has_identifier & pl.col(column).is_unique()
        )

        # override the dataset with the unique rows followed by the
        # identifier-less rows
        datasets[subject] = unique.vstack(null)


### Non-English Abstracts
Hard-code the manually determined indices of the English and non-English abstracts:

In [4]:
# Per subject: values of the 'index' column whose abstract is English;
# their 'language_abstract' code is overwritten with 'en' in the next cell
actually_english = {
    'adhd': [521],
    'animal_depression': [16, 743, 1528],
    'pancreatic_surgery': [15500, 24419, 24904, 24951, 29967]
}

# Per subject: values of the 'index' column whose abstract is not English;
# these abstracts are set to null in the next cell
not_english = {
    'adhd': [667,759,785,801],
    'animal_depression': [266, 421, 675, 848, 968, 1162, 1250, 1521, 1644, 1919, 1947],
    'pancreatic_surgery': [200, 17096, 17944]
}

Replace the language code of the abstracts that are actually English with 'en'. Then, remove the abstracts that were manually identified as non-English.

In [5]:
for subject in datasets:
    # Look both lists up independently: a subject listed in only one of the
    # two dictionaries is still processed instead of raising a KeyError or
    # being skipped silently.
    english_indices = actually_english.get(subject, [])
    non_english_indices = not_english.get(subject, [])

    if not english_indices and not non_english_indices:
        continue

    dataset = datasets[subject]

    if english_indices:
        # correct the language code of abstracts that are actually English
        dataset = dataset.with_columns(
            pl.when(pl.col('index').is_in(english_indices))
            .then(pl.lit('en'))
            .otherwise(pl.col('language_abstract'))
            .alias('language_abstract')
        )

    if non_english_indices:
        # blank out the abstracts that are not English; the rows themselves
        # are kept at this point
        dataset = dataset.with_columns(
            pl.when(pl.col('index').is_in(non_english_indices))
            .then(pl.lit(None))
            .otherwise(pl.col('abstract'))
            .alias('abstract')
        )

    datasets[subject] = dataset

Verify that non-English abstracts were indeed removed:

In [6]:
# Subjects for which non-English abstracts were hard-coded
checked_subjects = ['adhd', 'animal_depression', 'pancreatic_surgery']

# Stack the checked subjects on top of each other, in the order given
combined = datasets[checked_subjects[0]]
for name in checked_subjects[1:]:
    combined = combined.vstack(datasets[name])

# Show the rows whose language code is not 'en' together with their abstract
combined.filter(
    pl.col('language_abstract') != 'en'
).select(['index', 'abstract', 'language_abstract'])

index,abstract,language_abstract
u32,str,str
667,,"""ceb"""
759,,"""de"""
785,,"""pt"""
801,,"""de"""
266,,"""de"""
…,…,…
1919,,"""de"""
1947,,"""war"""
200,,"""de"""
17096,,"""es"""


### Long Abstracts
Remove abstracts that have more words than the limit of 489 words:

In [7]:
# Maximum number of words an abstract may have
WORD_LIMIT = 489

for subject in datasets:
    frame = datasets[subject]

    # 'index' values of the rows whose abstract exceeds the word limit
    exceeds_limit = pl.col('abstract_word_count') > WORD_LIMIT
    idx_long_abstracts = (
        frame.filter(exceeds_limit)
        .select('index')
        .to_series()
        .to_list()
    )

    # blank out those abstracts; all other abstracts stay untouched
    is_long = pl.col('index').is_in(idx_long_abstracts)
    shortened = (
        pl.when(is_long)
        .then(pl.lit(None))
        .otherwise(pl.col('abstract'))
        .alias('abstract')
    )

    datasets[subject] = frame.with_columns(shortened)

## Missing Abstracts
Remove articles without an abstract as they do not contain sufficient information to decide on inclusion or exclusion:

In [8]:
# number of articles per dataset before articles with empty abstracts are removed
lengths_before = [len(frame) for frame in datasets.values()]

# bookkeeping table: one row per dataset with its initial number of articles
documentation = pl.DataFrame(
    {'dataset': list(datasets.keys())}
).with_columns(
    pl.Series("length_before", lengths_before)
)

In [9]:
# keep only the articles that still have an abstract
has_abstract = ~pl.col('abstract').is_null()

for subject in datasets:
    datasets[subject] = datasets[subject].filter(has_abstract)

Check how many articles were removed due to missing abstracts:

In [10]:
# number of articles per dataset after articles with empty abstracts were removed
lengths_after = [len(dataset) for dataset in datasets.values()]

# absolute number of removed articles
removed = pl.col('length_before') - pl.col('length_after')

# removed articles relative to the initial size; despite the column name
# this is a fraction (no multiplication by 100 takes place)
share_removed = pl.col('abstracts_removed') / pl.col('length_before')

summary = documentation.with_columns(pl.Series("length_after", lengths_after))
summary = summary.with_columns(removed.alias('abstracts_removed'))
summary = summary.with_columns(share_removed.alias('percentage_removed'))
summary.sort(by='percentage_removed')

dataset,length_before,length_after,abstracts_removed,percentage_removed
str,i64,i64,i64,f64
"""adhd""",851,798,53,0.06228
"""atypical_antipsychotics""",1120,1049,71,0.063393
"""calcium_channel_blockers""",1218,1129,89,0.073071
"""oral_hypoglycemics""",503,458,45,0.089463
"""pancreatic_surgery""",34180,30252,3928,0.114921
"""animal_depression""",1989,1691,298,0.149824


## Noise Removal
### HTML
HTML tags cause noise within the texts:

In [11]:
# Keep the abstracts at row positions 456 and 565 of the ADHD dataset as
# they are before HTML removal, for the comparison further below
before = datasets['adhd'].to_pandas()['abstract'].iloc[[456, 565]].values

Define a function that removes HTML from a text:

In [12]:
from bs4 import BeautifulSoup

def remove_html(text: str) -> str:
    """Strip HTML markup from a string.

    Args:
        text: A string that may contain HTML tags.

    Returns:
        The text of ``text`` as extracted by BeautifulSoup's
        ``html.parser``, i.e. without the tags.
    """
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

Remove HTML from all datasets:

In [13]:
import polars as pl
import warnings

# BeautifulSoup issues a UserWarning for texts that resemble filenames.
# The warning is silenced only while the HTML is removed, so that
# UserWarnings raised by later cells remain visible.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=UserWarning)

    for subject, dataset in datasets.items():
        # replace title and abstract by their HTML-free versions
        datasets[subject] = dataset.with_columns([
            pl.col('title').map_elements(remove_html, return_dtype=pl.String),
            pl.col('abstract').map_elements(remove_html, return_dtype=pl.String)
        ])

Verify that HTML was indeed removed:

In [14]:
# The same two abstracts as in `before`, now after HTML removal
after = datasets['adhd'].to_pandas().iloc[[456, 565]].abstract.values

# Print the first 100 characters of each abstract before and after the
# removal; plain loops are used because print() is called for its side
# effect only.
print('With HTML:', end='\n\n')
for abstract in before:
    print(abstract[:100])

print('\n\nWithout HTML:', end='\n\n')
for abstract in after:
    print(abstract[:100])

With HTML:

Patients with myotonic dystrophy frequently suffer from excess daytime sleepiness, which can be a si
In a randomized, double-blind study in children undergoing elective orthopaedic surgery, we have ass


Without HTML:

Patients with myotonic dystrophy frequently suffer from excess daytime sleepiness, which can be a si
In a randomized, double-blind study in children undergoing elective orthopaedic surgery, we have ass


### Characters Only
Remove all digits and special characters with a regular expression:

In [15]:
import re

# Matches every run of characters that are neither ASCII letters nor whitespace
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]+')


def remove_special_characters(text: str) -> str:
    """Reduce a string to ASCII letters separated by single spaces.

    Digits, punctuation and every other character that is not an ASCII
    letter are deleted without a replacement, so the parts of a token such
    as ``t3st`` are joined to ``tst``. Letters outside ``a-z``/``A-Z``
    (e.g. accented letters) are deleted as well.

    Any run of whitespace, including newlines, carriage returns and tabs,
    is collapsed into one space; leading and trailing whitespace is
    dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if ``text`` contains no ASCII letters.
    """
    # Line breaks are deliberately not handled separately: they are
    # whitespace, survive the pattern and are normalised by split() below.
    # Deleting '\r' up front would fuse words separated by a lone carriage
    # return.
    letters_only = _NON_LETTER_PATTERN.sub('', text)

    # split() without arguments splits on any whitespace run and discards
    # empty parts, which leaves exactly one space between words
    return ' '.join(letters_only.split())

Validate that the function works as expected:

In [16]:
# Sample containing punctuation, digits, repeated spaces and a line break
test = 'This,  i2s A  t3st!\n\r4nd  1t  w0rks.'
remove_special_characters(test)

'This is A tst nd t wrks'

Apply the function to all titles and abstracts to keep characters only:

In [17]:
# The same cleaning expression is needed for both text columns of every
# dataset, so it is built once up front
text_columns = ('title', 'abstract')
cleaning_expressions = [
    pl.col(column).map_elements(
        remove_special_characters,
        return_dtype=pl.String
    )
    for column in text_columns
]

for subject in datasets:
    datasets[subject] = datasets[subject].with_columns(cleaning_expressions)

# Export
Export the preprocessed data for classification:

In [18]:
from pathlib import Path

# Create the output directory (and missing parents) up front, so the export
# does not fail on a fresh checkout where the directory does not exist yet
export_directory = Path(output_directory)
export_directory.mkdir(parents=True, exist_ok=True)

# Write one CSV file per subject, named '<subject>_preprocessed.csv'
for subject, dataset in datasets.items():
    dataset.write_csv(export_directory / f'{subject}_preprocessed.csv')