In [26]:
import pandas as pd
import re
from fractions import Fraction
import unicodedata

In [27]:
# Load both halves of the PubMed export, in file order
df_part1, df_part2 = (
    pd.read_csv(csv_path)
    for csv_path in ('pubmed_data_part1.csv', 'pubmed_data_part2.csv')
)

# Stack the halves into a single frame with a fresh, continuous row index
df = pd.concat((df_part1, df_part2), ignore_index=True)

# Show full cell contents instead of truncating long text columns
pd.set_option('display.max_colwidth', None)
df.head(2)

Unnamed: 0,PMID,Title,Abstract,Authors,Publication Date,DOI
0,24645995,α-1 antitrypsin and chronic fatigue syndrome: a case study from pathophysiology to clinical practice.,"SUMMARY Several lines of evidence support the involvement of inflammatory and immunologic abnormalities in chronic fatigue syndrome (CFS). Since recent studies have shown that α-1 antitrypsin (AAT) possesses anti-inflammatory properties, the potential therapeutic effect of AAT treatment on CFS has been investigated. A 49-year-old woman diagnosed with CFS was treated with intravenous infusions of a human plasma-derived AAT concentrate (60 mg/kg body weight weekly for 8 consecutive weeks). The patient's monocyte elastase, a regulator of inflammatory processes, was 1170 U/mg. At completion of treatment, improvement in maximal workload was observed (54.0-71.7% of predicted). Additionally, amelioration in working memory (scores: 83-94) and perceptual organization (scores: 75-83) were detected on the Wechsler Adult Intelligence Scale-III test. Monocyte elastase decreased to a normal range (<150 U/mg). Improvement in functional capacity allowed the patient to work in part-time employment. These findings suggest a possible role for AAT in the treatment of CFS.",José Alegre; Sandra Camprubí; Ana García-Quintana,2013-Mar,10.2217/pmt.12.84
1,24565439,A data-driven acute inflammation therapy.,"Acute inflammation is a severe medical condition defined as an inflammatory response of the body to an infection. Its rapid progression requires quick and accurate decisions from clinicians. Inadequate and delayed decisions makes acute inflammation the 10th leading cause of death overall in United States with the estimated cost of treatment about $17 billion annually. However, despite the need, there are limited number of methods that could assist clinicians to determine optimal therapies for acute inflammation. We developed a data-driven method for suggesting optimal therapy by using machine learning model that is learned on historical patients' behaviors. To reduce both the risk of failure and the expense for clinical trials, our method is evaluated on a virtual patients generated by a mathematical model that emulates inflammatory response. In conducted experiments, acute inflammation was handled with two complimentary pro- and anti-inflammatory medications which adequate timing and doses are crucial for the successful outcome. Our experiments show that the dosage regimen assigned with our data-driven method significantly improves the percentage of healthy patients when compared to results by other methods used in clinical practice and found in literature. Our method saved 88% of patients that would otherwise die within a week, while the best method found in literature saved only 73% of patients. At the same time, our method used lower doses of medications than alternatives. In addition, our method achieved better results than alternatives when only incomplete or noisy measurements were available over time as well as it was less affected by therapy delay. 
The presented results provide strong evidence that models from the artificial intelligence community have a potential for development of personalized treatment strategies for acute inflammation.",Vladan Radosavljevic; Kosta Ristovski; Zoran Obradovic,2013,10.1186/1755-8794-6-S3-S7


In [22]:
# Print the frame dimensions followed by the per-column count of missing values
print(df.shape, df.isnull().sum(), sep='\n')
# Leave the column index as the cell's displayed result
df.columns

(58854, 6)
PMID                  0
Title                 0
Abstract              4
Authors              88
Publication Date      4
DOI                 868
dtype: int64


Index(['PMID', 'Title', 'Abstract', 'Authors', 'Publication Date', 'DOI'], dtype='object')

In [29]:
# Clean the data

# Remove results with missing abstracts
df = df.dropna(subset=['Abstract'])

# Remove abstracts shorter than 100 characters.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[df['Abstract'].str.len() >= 100].copy()

# Replace missing values with "Unknown"
df['Authors'] = df['Authors'].fillna("Unknown")
df['Publication Date'] = df['Publication Date'].fillna("Unknown")
df['DOI'] = df['DOI'].fillna("Unknown")


# Remove special characters and normalize diacritics and accents in names
def remove_special_characters(text, keep_semicolons=False):
    """Strip diacritics and punctuation from ``text``.

    The text is decomposed with Unicode NFD and every non-spacing combining
    mark (category ``Mn``) is dropped, so accented letters fall back to their
    base letter. Afterwards every character that is not a word character or
    whitespace is deleted (not replaced by a space), which means hyphenated
    or dotted tokens are fused together. Underscores count as word characters
    and are kept.

    Args:
        text: The string to clean.
        keep_semicolons: When True, ';' is preserved in addition to word
            characters and whitespace.

    Returns:
        The cleaned string.
    """
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    without_diacritics = ''.join(base_chars)

    # Only the set of characters allowed to survive differs between the two modes
    disallowed = r'[^\w\s;]' if keep_semicolons else r'[^\w\s]'
    return re.sub(disallowed, '', without_diacritics)

# Author lists keep ';' because it is the separator between individual names
df['Authors'] = df['Authors'].apply(lambda x: remove_special_characters(x, keep_semicolons=True))
df['Abstract'] = df['Abstract'].apply(remove_special_characters)

# Replace non-breaking spaces, strip leading/trailing whitespace, and collapse
# every run of whitespace into a single space.
# regex=False: '\xa0' is a literal character, so do not depend on the
# version-specific default of str.replace.
# r'\s+': a raw string avoids the invalid-escape-sequence warning that the
# plain '\s+' literal produces on newer Python versions.
df['Abstract'] = (
    df['Abstract']
    .str.replace('\xa0', ' ', regex=False)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)


df.head(2)

Unnamed: 0,PMID,Title,Abstract,Authors,Publication Date,DOI
0,24645995,α-1 antitrypsin and chronic fatigue syndrome: a case study from pathophysiology to clinical practice.,SUMMARY Several lines of evidence support the involvement of inflammatory and immunologic abnormalities in chronic fatigue syndrome CFS Since recent studies have shown that α1 antitrypsin AAT possesses antiinflammatory properties the potential therapeutic effect of AAT treatment on CFS has been investigated A 49yearold woman diagnosed with CFS was treated with intravenous infusions of a human plasmaderived AAT concentrate 60 mgkg body weight weekly for 8 consecutive weeks The patients monocyte elastase a regulator of inflammatory processes was 1170 Umg At completion of treatment improvement in maximal workload was observed 540717 of predicted Additionally amelioration in working memory scores 8394 and perceptual organization scores 7583 were detected on the Wechsler Adult Intelligence ScaleIII test Monocyte elastase decreased to a normal range 150 Umg Improvement in functional capacity allowed the patient to work in parttime employment These findings suggest a possible role for AAT in the treatment of CFS,Jose Alegre; Sandra Camprubi; Ana GarciaQuintana,2013-Mar,10.2217/pmt.12.84
1,24565439,A data-driven acute inflammation therapy.,Acute inflammation is a severe medical condition defined as an inflammatory response of the body to an infection Its rapid progression requires quick and accurate decisions from clinicians Inadequate and delayed decisions makes acute inflammation the 10th leading cause of death overall in United States with the estimated cost of treatment about 17 billion annually However despite the need there are limited number of methods that could assist clinicians to determine optimal therapies for acute inflammation We developed a datadriven method for suggesting optimal therapy by using machine learning model that is learned on historical patients behaviors To reduce both the risk of failure and the expense for clinical trials our method is evaluated on a virtual patients generated by a mathematical model that emulates inflammatory response In conducted experiments acute inflammation was handled with two complimentary pro and antiinflammatory medications which adequate timing and doses are crucial for the successful outcome Our experiments show that the dosage regimen assigned with our datadriven method significantly improves the percentage of healthy patients when compared to results by other methods used in clinical practice and found in literature Our method saved 88 of patients that would otherwise die within a week while the best method found in literature saved only 73 of patients At the same time our method used lower doses of medications than alternatives In addition our method achieved better results than alternatives when only incomplete or noisy measurements were available over time as well as it was less affected by therapy delay The presented results provide strong evidence that models from the artificial intelligence community have a potential for development of personalized treatment strategies for acute inflammation,Vladan Radosavljevic; Kosta Ristovski; Zoran Obradovic,2013,10.1186/1755-8794-6-S3-S7


In [47]:
# Dimensions of df as (rows, columns)
df.shape

(58535, 7)

In [30]:
# Record the character count of every abstract as a helper column
df['Abstract_Length'] = df['Abstract'].map(len)

# Order the rows from shortest to longest abstract
df_sorted = df.sort_values(by='Abstract_Length')

# Preview the shortest abstracts (cleaning removes characters, so some fall below 100)
df_sorted.head(3)

Unnamed: 0,PMID,Title,Abstract,Authors,Publication Date,DOI,Abstract_Length
23822,33671343,Changing Dental Profession-Modern Forms and Challenges in Dental Practice.,In the last two decades an increasing trend towards new forms of dental practice was observed,Thomas Gerhard Wolf; Guglielmo Campus,2021-Feb-17,10.3390/ijerph18041945,93
29396,34943534,"Special Issue ""Advances in Breast MRI"".",We thank all the authors reviewers and the editorial staff who contributed to this Special Issue,Francesca Galati; Rubina Manuela Trimboli; Federica Pediconi,2021-Dec-08,10.3390/diagnostics11122297,96
29222,34972694,Integrating artificial intelligence in bedside care for covid-19 and future pandemics.,bMichael Yu and colleaguesb examine the challenges in developing AI tools for use at point of care,Michael Yu; An Tang; Kip Brown; Rima Bouchakri; Pascal StOnge; Sheng Wu; John Reeder; Louis Mullie; Michael Chasse,2021-Dec-31,10.1136/bmj-2021-068197,98


In [32]:
# Lengths of the three longest abstracts (df_sorted is in ascending Abstract_Length order)
df_sorted['Abstract_Length'].tail(3)

26424     7387
37778     7970
5992     60664
Name: Abstract_Length, dtype: int64

In [40]:
# Case-insensitive literal search for the phrase "Changing Dental" in the
# abstracts. regex=False makes it explicit that the phrase is plain text,
# not a pattern.
contains = df['Abstract'].str.contains("Changing Dental", case=False, regex=False)

# Display rows whose 'Abstract' contains the phrase (only the Abstract column
# is searched, so a match in 'Title' alone does not count)
print(df[contains])

Empty DataFrame
Columns: [PMID, Title, Abstract, Authors, Publication Date, DOI, Abstract_Length]
Index: []


In [46]:
# Save files: the cleaned frame is written out as two halves
split_index = len(df) // 2

# Rows before the midpoint go to part 1, the remainder to part 2
df_part1, df_part2 = df.iloc[:split_index], df.iloc[split_index:]

# Write each half to its own CSV, leaving out the row index
for part, filename in (
    (df_part1, 'processed_data_part1.csv'),
    (df_part2, 'processed_data_part2.csv'),
):
    part.to_csv(filename, index=False)