# General Preprocessing
This notebook encompasses general preprocessing steps to prepare the texts for classification in all scenarios. 

These are:
- Removing HTML
- Translating non-English texts
- Filtering out texts whose word count lies outside the common range
- Removing duplicate entries

In [1]:
from src import data

# Directory the preprocessed CSV files are written to at the end of the notebook
output_directory = '../../../../data/04_preprocessed'

# Directory the input datasets are read from
directory = '../../../../data/datasets/03_pubmed'

# Read every dataset in the input directory; the result is used below as a
# dict keyed by subject name (e.g. 'adhd') with one polars DataFrame per subject
datasets = data.dict_from_directory(directory, type='polars')

# Remove HTML
HTML tags cause noise within the texts:

In [2]:
# Keep two ADHD abstracts (rows 456 and 565) as they are before HTML removal,
# so they can be compared with the cleaned versions further down
adhd_before = datasets['adhd'].to_pandas()
before = adhd_before.iloc[[456, 565]]['abstract'].values

Define a function to automatically remove HTML from the text

In [3]:
from bs4 import BeautifulSoup

def remove_html(text: str)-> str:
    """Strip HTML markup from a string and keep only its text content

    Args:
    text: str: a string that may contain html tags

    Returns:
    str: the text of the parsed document, without the tags
    """
    # Parse with Python's built-in HTML parser, then extract the text nodes
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

Remove HTML from all datasets:

In [4]:
import polars as pl
import warnings

# BeautifulSoup warns when a text (e.g. a title) looks more like a file name
# than like markup. Silence UserWarnings only while the HTML is being removed,
# so that warnings raised by later cells are not hidden for the whole session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=UserWarning)

    for subject, dataset in datasets.items():
        # Replace title and abstract with their HTML-free versions
        datasets[subject] = dataset.with_columns([
            pl.col('title').map_elements(remove_html, return_dtype=pl.String),
            pl.col('abstract').map_elements(remove_html, return_dtype=pl.String)
        ])

Verify that HTML was indeed removed:

In [5]:
# The same two ADHD abstracts as in `before`, now after HTML removal
adhd_after = datasets['adhd'].to_pandas()
after = adhd_after.iloc[[456, 565]]['abstract'].values

# Show the first 50 characters of each abstract before and after cleaning
print('HTML removal:', end='\n\n')
for original, cleaned in zip(before, after):
    print('Before: ', original[:50])
    print('After: ', cleaned[:50], end='\n\n')

HTML removal:

Before:  <b><i>Objective:</i></b> To evaluate the efficacy 
After:  Objective: To evaluate the efficacy and safety of 

Before:  <h3>Objective:</h3> To compare the intraocular pre
After:  Objective: To compare the intraocular pressure (IO



# Translation

Some texts are in a language other than English:

In [6]:
# Row 99 of the atypical antipsychotics dataset serves as the example of a
# non-English abstract
atypical_antipsychotics = datasets['atypical_antipsychotics']
japanese_abstract = atypical_antipsychotics.to_pandas()['abstract'].iloc[99]

print(japanese_abstract)

シグマ受容体拮抗物質に関する最近の研究成果を精神分裂病治療薬との関連でまとめた．選択的シグマ1受容体拮抗物質であるNE-100はphencyclidine(PCP)誘発異常行動改善作用および認知障害改善作用を有するが，ドパミンアゴニスト誘発行動には影響を及ぼさず，カタレプシー惹起性も認められない．NE-100の作用機序はN-methyl-D-aspartate(NMDA)/PCP受容体イオンチャネル複合体の間接的な修飾作用およびドパミン作動性神経終末でドパミンの遊離調節作用が関与している可能性が示唆されている．また，最近報告された選択的シグマ1受容体拮抗物質MS-355/MS-377はNE-100と類似の薬理学的プロファイルを有するが，さらに，ドパミンD2受容体拮抗薬と同様にPCP誘発立ち上がり行動を抑制し，アポモルヒネクライミング行動およびメタンフェタミン逆耐性形成も抑制する．精神分裂病をターゲットとしたシグマ1受容体拮抗物質の臨床試験ではrimcazole，remoxipride，BMY 14802，panamesine(EMD 57445)およびSL82.0715について報告されている．Rimcazoleはオープン試験では有効であったが，ダブルブラインド試験で痙攣が誘発され，臨床試験を断念している．RemoxiprideはドパミンD2受容体拮抗薬と異なった治療効果を示したが，再生不良性貧血の副作用のため開発を断念している．BMY 14802は精神分裂病に無効であり，panamesineおよびSh82.0714は精神分裂病を対象としたオープン試験では好成績をあげている．本総説では最近のシグマ受容体拮抗物質の薬理学的プロファイルおよび精神分裂病を対象とした臨床試験成績を中心にまとめた．


Define a function translating texts to English using the DeepL API:

In [7]:
import os, dotenv, deepl

def deepl_translate(text: str, source_lang=None, target_lang: str = 'EN-US')-> str:
    """Translate a text to a target language using the DeepL API

    The API key is read from the environment variable ``DEEPL`` after
    ``dotenv.load_dotenv()`` has been called.

    Args:
    text: str: a string to be translated
    source_lang: language code of the text, passed on to DeepL unchanged;
        None (default) passes no source language
        (presumably DeepL then detects it itself - confirm in the DeepL docs)
    target_lang: str: the target language

    Returns:
    str: the translated text

    Raises:
    ValueError: if the ``DEEPL`` environment variable is missing or empty
    """
    dotenv.load_dotenv()

    deepl_api_key = os.getenv('DEEPL')
    if not deepl_api_key:
        # Fail with an actionable message instead of an opaque client error
        raise ValueError(
            "DeepL API key not found: set the 'DEEPL' environment variable "
            "(e.g. in a .env file)"
        )
    translator = deepl.Translator(deepl_api_key)

    # Translate the text
    translated = translator.translate_text(text, source_lang=source_lang, target_lang=target_lang)
    return translated.text

Define a function to detect a text's language and translate non-English texts:

In [8]:
import langdetect

def translate_non_english(text: str)-> str:
    """Translate a text to English if it is not in English

    Language detection and translation are both best-effort: if either step
    fails, the original text is returned unchanged. A failed translation
    additionally emits a RuntimeWarning so that systematic problems (such as a
    missing API key) do not go unnoticed.

    Args:
    text: str: a string to be translated

    Returns:
    str: the translated text, or the original text if it is English,
        empty, or could not be processed
    """
    # Local import keeps this cell runnable on its own
    import warnings

    # Nothing to detect or translate in missing or empty texts
    if not text:
        return text

    # Detect the language from the first 500 characters
    try:
        source_lang = langdetect.detect(text[:500]).upper()
    except Exception:
        # Catching Exception (not a bare except) lets KeyboardInterrupt through,
        # so the long-running translation cell can still be interrupted
        return text

    # langdetect reports some languages with a region suffix (e.g. 'ZH-CN');
    # only the base code is passed on as the source language
    source_lang = source_lang.split('-')[0]

    # English texts are kept as they are
    if source_lang == 'EN':
        return text

    # Translate the text if it is not in English
    try:
        return deepl_translate(text, source_lang=source_lang)
    except Exception as error:
        # RuntimeWarning is used because UserWarnings may be filtered elsewhere
        # in this notebook
        warnings.warn(
            f'Translation failed, keeping the original text '
            f'({type(error).__name__}: {error})',
            RuntimeWarning,
        )
        return text

Apply the translation:

In [9]:
from tqdm import tqdm

# Expression that replaces each abstract with its English version;
# only the 'abstract' column is translated
translated_abstract = pl.col('abstract').map_elements(
    translate_non_english, return_dtype=pl.String
)

# One progress-bar step per dataset
progress = tqdm(datasets.items(), total=len(datasets))
for subject, dataset in progress:
    datasets[subject] = dataset.with_columns(translated_abstract)

100%|██████████| 6/6 [02:12<00:00, 22.01s/it]


# Text Length
While most abstracts adhere to the common word limits between 150 and 500 words, there are also outliers:

In [10]:
import pandas as pd

# Summarise the distribution of abstract word counts for every dataset.
# Percentiles are listed as whole numbers and become the '<p>%' columns.
percentiles = (5, 10, 25, 50, 75, 90, 95, 99)
summary_columns = ['dataset', 'n', 'mean', 'std', 'min', *(f'{p}%' for p in percentiles), 'max']

# One record per dataset; the DataFrame is built once after the loop instead
# of being grown with pd.concat on every iteration
summaries = []

for subject, dataset in datasets.items():

    # NOTE(review): 'abstract_word_count' is used as loaded; it is not
    # recomputed in this notebook after HTML removal and translation - confirm
    # that this is intended.
    column = dataset['abstract_word_count']

    # All statistics are truncated to integers. The names `min` and `max` are
    # deliberately not assigned, so the Python builtins stay usable afterwards.
    summary = {
        'dataset': subject,
        'n': dataset.height,
        'mean': int(column.mean()),
        'std': int(column.std()),
        'min': int(column.min()),
    }
    for p in percentiles:
        summary[f'{p}%'] = int(column.quantile(p / 100))
    summary['max'] = int(column.max())

    summaries.append(summary)

# Passing the columns explicitly fixes their order (and keeps them even if
# there are no datasets)
word_counts = pd.DataFrame(summaries, columns=summary_columns)

word_counts

Unnamed: 0,dataset,n,mean,std,min,5%,10%,25%,50%,75%,90%,95%,99%,max
0,adhd,851,275,334,0,99,137,182,249,311,384,454,724,5411
0,animal_depression,1993,210,253,0,0,0,128,210,277,327,376,545,5414
0,atypical_antipsychotics,1120,273,376,0,68,108,171,232,295,368,446,1281,5411
0,calcium_channel_blockers,1218,291,295,0,49,118,206,278,334,414,463,921,4999
0,oral_hypoglycemics,503,296,266,0,82,129,213,288,352,433,490,684,5190
0,pancreatic_surgery,34206,223,447,0,0,43,139,225,295,360,415,583,78903


Typical abstract lengths range between 150 and 400 words. 

Does the data follow this pattern?

In [11]:
# Bounds for the length filter: the smallest first quartile and the largest
# third quartile found in any of the datasets
lower_quartiles = word_counts['25%']
upper_quartiles = word_counts['75%']

first_quartile = lower_quartiles.min()
third_quartile = upper_quartiles.max()

# Print each label and its value on separate lines
for label, bound in (('First quartile: ', first_quartile), ('Third quartile: ', third_quartile)):
    print(label, bound, sep='\n')

First quartile: 
128
Third quartile: 
352


It does!

Filter the datasets to only include abstracts whose word count lies strictly between the smallest first quartile and the largest third quartile found across all datasets:

In [12]:
# Keep only abstracts whose word count lies strictly between the two bounds
word_count = pl.col('abstract_word_count')
within_bounds = (word_count > first_quartile) & (word_count < third_quartile)

for subject, dataset in datasets.items():
    datasets[subject] = dataset.filter(within_bounds)

# Duplicates

Define a function that removes rows with duplicate values in a given column:

In [13]:
def remove_duplicates(
        dataframe: pl.DataFrame,
        column: str,
        keep_normalized: bool = True,
        ):
    """Remove rows whose normalised value in a column occurs more than once

    Values are compared after casting them to string, lower-casing them and
    stripping surrounding characters with ``str.strip_chars()``. One row is
    kept per distinct normalised value. Rows with a null value in the column
    are never treated as duplicates: they are all kept and placed before the
    de-duplicated rows in the result.

    Args:
    dataframe: pl.DataFrame: the data to de-duplicate
    column: str: name of the column that identifies duplicates
    keep_normalized: bool: if True (default, the previous behaviour), the
        returned column holds the normalised values. If False, the normalised
        values are only used for the comparison and the returned column keeps
        its original values and dtype.

    Returns:
    tuple: the de-duplicated dataframe and the number of rows removed
    """
    # NOTE(review): as written, the pattern below asks for a start-of-text
    # anchor directly after a literal '[', so it appears to never match and
    # this last step is effectively a no-op. It was presumably meant to remove
    # bracketed text - confirm the intent before changing the pattern.
    normalized = (
        pl.col(column)
        .cast(pl.String)
        .str.to_lowercase()
        .str.strip_chars()
        .str.replace_all(r"\[^()]+]", '')
    )

    if keep_normalized:
        # Overwrite the column itself with its normalised form
        key = column
        keyed = dataframe.with_columns(normalized)
    else:
        # Compare on a temporary helper column and drop it again afterwards
        key = f'__{column}_dedup_key__'
        keyed = dataframe.with_columns(normalized.alias(key))

    rows_empty = keyed.filter(pl.col(key).is_null())
    rows_not_empty = keyed.filter(pl.col(key).is_not_null())

    rows_duplicates_removed = rows_not_empty.unique(key, maintain_order=True)
    num_rows_removed = dataframe.height - (rows_empty.height + rows_duplicates_removed.height)

    result = pl.concat([rows_empty, rows_duplicates_removed])
    if not keep_normalized:
        result = result.drop(key)

    return (result, num_rows_removed)


Use the function to filter duplicate rows based on PubMed IDs, DOIs, titles and abstracts:

In [14]:
import pandas as pd

columns = ['pubmed_id', 'doi', 'title', 'abstract']

# One record per (subject, column) pair that actually removed rows; the
# DataFrame is built once after the loop instead of being grown with pd.concat
removal_log = []

for subject, dataset in datasets.items():
    # De-duplicate by each column in turn; every pass works on the result of
    # the previous one
    for column in columns:
        dataset, num_removed = remove_duplicates(dataset, column)

        if num_removed > 0:
            removal_log.append({
                'subject': subject,
                'removed': num_removed,
                'by': column,
            })

    datasets[subject] = dataset

# Passing the columns explicitly keeps them even if nothing was removed
duplicates = pd.DataFrame(removal_log, columns=['subject', 'removed', 'by'])

Inspect how many duplicates were removed and by which column:

In [15]:
# Display the table of removed duplicates (columns: subject, removed, by)
duplicates

Unnamed: 0,subject,removed,by
0,adhd,1,title
0,animal_depression,1,pubmed_id
0,atypical_antipsychotics,2,title
0,oral_hypoglycemics,2,title
0,pancreatic_surgery,6,pubmed_id
0,pancreatic_surgery,1,doi
0,pancreatic_surgery,288,title
0,pancreatic_surgery,154,abstract


# Export
Export the preprocessed data for classification:

In [16]:
# Make sure the output directory exists, so that the export does not fail on
# a fresh checkout (`os` is imported in the DeepL cell above)
os.makedirs(output_directory, exist_ok=True)

# Write one CSV file per subject
for subject, dataset in datasets.items():
    dataset.write_csv(f'{output_directory}/{subject}_preprocessed.csv')

# Similar Titles - WORK IN PROGRESS
The pancreatic surgery dataset contains many articles with identical or very similar titles.
Some of them are genuinely different articles, others are the same article published twice.
In some cases, the duplicates carry different include labels.

TODO: decide how to handle these cases.

In [61]:
# # Define the directory where the data is stored
# directory_uniform = '../../../../data/01_uniform'

# # Load the data
# datasets_uniform = data.dict_from_directory(directory_uniform, type='polars')

In [25]:
# df = datasets['pancreatic_surgery']

# # edited = df.with_columns(
# #     pl.col('title').str.to_lowercase()
# # ).with_columns(
# #     pl.col('title').str.strip_chars()
# # ).with_columns(
# #     pl.col('title').str.replace_all(r"\[^()]+]", '')
# # ).with_columns(
# #     pl.col('abstract').str.to_lowercase()
# # ).with_columns(
# #     pl.col('abstract').str.strip_chars()
# # ).with_columns(
# #     pl.col('abstract').str.replace_all(r"\[^()]+]", '')
# # )

# #duplicate_titles = edited.filter(pl.col('title').is_not_null()).filter(pl.col('title').is_duplicated()).filter(pl.col('abstract').is_duplicated()).sort('title')
# duplicate_articles = df.filter(
#     pl.col('title').is_not_null()).filter(
#         pl.col('title').is_duplicated()).filter(
#             pl.col('abstract').is_not_null()).filter(
#                 pl.col('abstract').is_duplicated()).sort('title')

# duplicate_articles

include,title,abstract,first_author,year,journal,doi,pubmed_id,authors,pubmed_type,publication_types,mesh,webofscience_id,central_id,openalex_id
bool,str,str,str,f64,str,str,f64,str,str,str,str,str,str,str


In [26]:
# from difflib import SequenceMatcher

# def similar(a, b):
#     return SequenceMatcher(None, a, b).ratio()

In [31]:
# import polars as pl

# similars = []

# def compare_titles_optimized(df):
#     titles = df['title'].to_list()
#     processed_titles = set()

#     for i, title1 in enumerate(titles):
#         if title1 in processed_titles:
#             continue

#         for j, title2 in enumerate(titles[i+1:]):
#             if title2 in processed_titles:
#                 continue
            
#             if title1 is not None and title2 is not None:
#                 similarity = similar(title1, title2)
#                 if similarity > 0.9:
#                     print(f"Similar titles found at indices {i} and {j+i+1} with similarity ({similarity}):", title1, title2, sep='\n', end='\n\n')
#                     similars.append((i, j+i+1))

#             processed_titles.add(title1)

# compare_titles_optimized(df)

Similar titles found at indices 4 and 222 with similarity (0.9926470588235294):
peer review report 3 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"
peer review report 1 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"

Similar titles found at indices 4 and 394 with similarity (0.9926470588235294):
peer review report 3 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"
peer review report 2 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"

Similar titles found at indices 6 and 949 with similarity (0.9432624113475178):
pylorus resection does not reduce delayed gastric emptying after partial pancreatoduodenectomy: a blinded randomized controlled trial.
pylorus resection does not reduce delayed gastric emptying after partial pa

KeyboardInterrupt: 

In [42]:
# indices = set([i for i, j in similars] + [j for i, j in similars])

In [41]:
# df.to_pandas().iloc[list(indices)].sort_values('title')

Unnamed: 0,include,title,abstract,first_author,year,journal,doi,pubmed_id,authors,pubmed_type,publication_types,mesh,webofscience_id,central_id,openalex_id
13780,False,[conservative treatment of acute pancreatitis].,"the incidence of acute pancreatitis per 100,00...",Mayerle J,2003.0,Med Klin (Munich),10.1007/s00063-003-1320-7,14685675.0,Mayerle J; Simon P; Kraft M; Meister T; Lerch MM,article,D016428: Journal Article; D016454: Review,D018806: APACHE; D000208: Acute Disease; D0002...,,,
20,False,[postoperative treatment of acute pancreatitis].,,Agaev BA,2009.0,Khirurgiia (Mosk),,19798777.0,Agaev BA; Mamedova NA,article,D016428: Journal Article; D016454: Review,D005765: Gastrointestinal Agents; D006801: Hum...,,,
19,False,[stem cells in diabetes treatment].,,Heinis M,2008.0,Ann Endocrinol (Paris),10.1016/j.ando.2008.02.014,18420179.0,Heinis M; Duvillié B,article,D016428: Journal Article; D016454: Review,D000818: Animals; D049109: Cell Proliferation;...,,,
40,False,"a commentary on ""endoscopic versus percutaneou...",,Wang C,2022.0,Int J Surg,10.1016/j.ijsu.2022.106719,35764252.0,Wang C; Jiang T,article,D016422: Letter; D016449: Randomized Controlle...,D004322: Drainage; D006801: Humans; D006932: H...,,,
479,False,"a commentary on ""endoscopic versus percutaneou...",,Radulović A,2021.0,Int J Surg,10.1016/j.ijsu.2021.106144,34688931.0,Radulović A; Ganesananthan S; Oakley Z,article,D016422: Letter; D016420: Comment,D004322: Drainage; D006801: Humans; D006932: H...,,,
54,True,"a multicenter, randomized, controlled trial co...",,Goh BKP,2020.0,Ann Surg Oncol,10.1245/s10434-020-08401-0,32221735.0,Goh BKP,article,D016422: Letter; D016448: Multicenter Study; D...,D006801: Humans; D010180: Pancreatectomy; D010...,,,
6804,True,"a multicenter, randomized, controlled trial co...",background: although distal pancreatectomy (dp...,Kondo N,2019.0,Ann Surg Oncol,10.1245/s10434-019-07222-0,30783854.0,Kondo N; Uemura K; Nakagawa N; Okada K; Kuroda...,article,D003160: Comparative Study; D016428: Journal A...,D000368: Aged; D005260: Female; D005500: Follo...,,,
26,True,does pancreatic duct stenting decrease the rat...,,,,,,,,,,,WOS:000236961706376,,
963,True,does pancreatic duct stenting decrease the rat...,,,,,,,,,,,WOS:000242649700016,,
19898,True,does pancreatic duct stenting decrease the rat...,pancreatic duct stenting remains an attractive...,Winter JM,2006.0,J Gastrointest Surg,10.1016/j.gassur.2006.07.020,17114014.0,Winter JM; Cameron JL; Campbell KA; Chang DC; ...,article,D016428: Journal Article; D016449: Randomized ...,"D000328: Adult; D000368: Aged; D000369: Aged, ...",,,


In [27]:
# df = datasets['pancreatic_surgery']

# similars = []

# def compare_titles(df):
#     titles = df['title'].to_list()
#     for i in range(len(titles)):
#         for j in range(i+1, len(titles)):
#             title1 = titles[i]
#             title2 = titles[j]
#             if title1 is not None and title2 is not None:
#                 similarity = similar(title1, title2)
#                 if similarity > 0.75:
#                     print(f"Similar titles found at indices {i} and {j} with similarity ({similarity}):", title1, title2, sep='\n', end='\n\n')
#                     similars.append((i, j))

# compare_titles(df)

Similar titles found at indices 4 and 222 with similarity (0.9926470588235294):
peer review report 3 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"
peer review report 1 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"

Similar titles found at indices 4 and 394 with similarity (0.9926470588235294):
peer review report 3 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"
peer review report 2 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"

Similar titles found at indices 4 and 23296 with similarity (0.8906882591093117):
peer review report 3 on "positron emission tomography modalities prevent futile radical resection of pancreatic cancer: a meta-analysis"
positron emission tomography modalities prevent futile radical resectio

KeyboardInterrupt: 