# Text Abstractions: Articles Skimming

In [1]:
# Load TensorFlow and report which release this kernel is running
import tensorflow as tf

tf_release = tf.__version__
print(tf_release)

2.10.1


Re-using the PubMed 20K RCT dataset published by Franck Dernoncourt. The dataset contains 20K abstracts, with every number replaced by the "@" sign.

In [2]:
# Checking all the file names in the target directory
import os

data_dir = "PubMed_20K_RCT_NumReplaced/"
# os.listdir returns entries in arbitrary order, so sort them to keep the
# listing (and anything indexed from it) reproducible across machines
filenames = [data_dir + filename for filename in sorted(os.listdir(data_dir))]
filenames

['PubMed_20K_RCT_NumReplaced/dev.txt',
 'PubMed_20K_RCT_NumReplaced/test.txt',
 'PubMed_20K_RCT_NumReplaced/train.txt']

In [4]:
# Creating function to read the lines of a text file
def get_lines(filename, encoding="utf-8"):
    """Read a text file and return its lines.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of strings, one per line of the file, each keeping its
        trailing newline character (as produced by ``readlines``).
    """
    with open(filename, "r", encoding=encoding) as f:
        return f.readlines()

In [5]:
# Load the raw training split and preview its first 20 lines
train_path = data_dir + "train.txt"
train_lines = get_lines(train_path)
train_lines[:20]

['###24293578\n',
 'OBJECTIVE\tTo investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .\n',
 'METHODS\tA total of @ patients with primary knee OA were randomized @:@ ; @ received @ mg/day of prednisolone and @ received placebo for @ weeks .\n',
 'METHODS\tOutcome measures included pain reduction and improvement in function scores and systemic inflammation markers .\n',
 'METHODS\tPain was assessed using the visual analog pain scale ( @-@ mm ) .\n',
 'METHODS\tSecondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and @-min walk distance ( @MWD ) .\n',
 'METHODS\tSerum levels of interleukin @ ( IL-@ ) , IL-@ , tumor necrosis factor ( TNF ) - , and 

In [6]:
# Checking the number of raw lines read from the train file (every entry returned by get_lines)
len(train_lines)

210040

In [7]:
# Creating function to preprocess input texts
def _abstract_to_samples(abstract_lines):
    """Turn the sentence lines of one abstract into per-sentence sample dicts."""
    last_line_index = len(abstract_lines) - 1
    samples = []
    for abstract_line_number, abstract_line in enumerate(abstract_lines):
        target_text_split = abstract_line.split("\t")
        samples.append({
            "target": target_text_split[0],  # Label before the tab
            "text": target_text_split[1].lower(),  # Sentence after the tab, lower-cased
            "line_number": abstract_line_number,  # Zero-based position in the abstract
            "total_lines": last_line_index,  # Index of the abstract's last line (count - 1)
        })
    return samples


def preprocess_text_with_line_numbers(filename):
    """Parse an abstracts file into a flat list of per-sentence dictionaries.

    The file is read with ``get_lines``. Lines starting with ``###`` are
    treated as abstract ID lines, whitespace-only lines as abstract
    separators, and every other line as ``<label>\\t<sentence>``.

    Args:
        filename: Path of the text file to parse.

    Returns:
        A list with one dict per sentence line, in file order, with keys:
        ``"target"`` (the label before the tab), ``"text"`` (the lower-cased
        sentence after the tab), ``"line_number"`` (zero-based position of the
        sentence inside its abstract) and ``"total_lines"`` (number of
        sentence lines in the abstract minus one).

    Raises:
        IndexError: If a sentence line contains no tab character.
    """
    abstract_samples = []  # Samples collected over all abstracts
    abstract_lines = []  # Sentence lines of the abstract currently being read

    for line in get_lines(filename):
        if line.startswith("###") or line.isspace():
            # An ID line or a blank line closes the current abstract. The
            # buffer is cleared right after emitting it, so repeated blank
            # lines cannot emit the same abstract twice and an abstract that
            # was not followed by a blank line is not lost.
            abstract_samples.extend(_abstract_to_samples(abstract_lines))
            abstract_lines = []
        else:
            # Keep lines in a list instead of concatenating and re-splitting
            # with str.splitlines, which would also break a sentence on
            # characters such as form feed or U+2028.
            abstract_lines.append(line.rstrip("\r\n"))

    # Emit the last abstract when the file does not end with a blank line
    abstract_samples.extend(_abstract_to_samples(abstract_lines))

    return abstract_samples

In [9]:
# Run the preprocessing over each split file and report how many samples each produced
train_samples, val_samples, test_samples = (
    preprocess_text_with_line_numbers(data_dir + split_file)
    for split_file in ("train.txt", "dev.txt", "test.txt")
)
print(len(train_samples), len(val_samples), len(test_samples))

180040 30212 30135


In [11]:
# Previewing the first 14 train samples; in the saved output the first abstract has
# total_lines == 11 (12 sentences), so this slice runs two samples into the next abstract
train_samples[:14]

[{'target': 'OBJECTIVE',
  'text': 'to investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( oa ) .',
  'line_number': 0,
  'total_lines': 11},
 {'target': 'METHODS',
  'text': 'a total of @ patients with primary knee oa were randomized @:@ ; @ received @ mg/day of prednisolone and @ received placebo for @ weeks .',
  'line_number': 1,
  'total_lines': 11},
 {'target': 'METHODS',
  'text': 'outcome measures included pain reduction and improvement in function scores and systemic inflammation markers .',
  'line_number': 2,
  'total_lines': 11},
 {'target': 'METHODS',
  'text': 'pain was assessed using the visual analog pain scale ( @-@ mm ) .',
  'line_number': 3,
  'total_lines': 11},
 {'target': 'METHODS',
  'text': 'secondary outcome measures included the western ontari