In [1]:
from collections import defaultdict, Counter
import json
from matplotlib import pyplot as plt
import numpy as np
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import os
from datasets import DatasetDict, load_dataset, Dataset


# Root folder holding the processed CSV parts. Set the NLPT_DATA_DIR
# environment variable to point at a local copy; when it is unset the
# original author's path is used, so existing setups keep working.
path_to_data_folder = os.environ.get(
    "NLPT_DATA_DIR",
    "/Users/matteom/shared-folder/nlpt_group/project/data",
)

In [2]:
# Read in abstracts: the processed corpus is stored as two CSV parts.
df_part1 = pd.read_csv(os.path.join(path_to_data_folder, "processed_data_part1.csv"))
df_part2 = pd.read_csv(os.path.join(path_to_data_folder, "processed_data_part2.csv"))

# ignore_index=True renumbers the combined rows 0..n-1. Without it each part
# keeps its own 0-based labels, so the concatenated frame carries duplicate
# index values and label-based lookups become ambiguous.
df = pd.concat([df_part1, df_part2], ignore_index=True)

# Double brackets keep a one-column DataFrame (not a Series).
abstracts = df[['Abstract']]
print(abstracts[:3])

                                            Abstract
0  SUMMARY Several lines of evidence support the ...
1  Acute inflammation is a severe medical conditi...
2  Human brain connectivity can be studied using ...


In [3]:
# Discard rows with a missing abstract, then wrap the frame as a Hugging Face
# Dataset.
abstracts = abstracts.dropna(axis="index", how="any")
abstracts_dataset = Dataset.from_pandas(abstracts)
# Request torch-formatted values when examples are read back.
abstracts_dataset.set_format(type="torch")
abstracts_dataset

Dataset({
    features: ['Abstract', '__index_level_0__'],
    num_rows: 58849
})

In [4]:
# Pretrained BioBERT tokenizer, fetched by its Hugging Face hub identifier.
biobert_checkpoint = "dmis-lab/biobert-base-cased-v1.2"
tokenizer = AutoTokenizer.from_pretrained(biobert_checkpoint)

In [5]:
# Take 70% of abstracts for train and 30% for validation
TRAIN_FRACTION = 0.7
SPLIT_SEED = 19

# Shuffle once and carve both splits out of the same permutation. Shuffling
# separately per split repeats the same work, and the two splits are only
# disjoint as long as both calls happen to use the same seed.
shuffled_abstracts = abstracts_dataset.shuffle(seed=SPLIT_SEED)

# Size the split from the dataset that is actually being sliced.
n_abstracts = len(shuffled_abstracts)
split = int(np.floor(TRAIN_FRACTION * n_abstracts))

abstracts_dataset_dict = DatasetDict(
    train=shuffled_abstracts.select(range(split)),
    val=shuffled_abstracts.select(range(split, n_abstracts)),
)
abstracts_dataset_dict.column_names

{'train': ['Abstract', '__index_level_0__'],
 'val': ['Abstract', '__index_level_0__']}

In [6]:
# Peek at the first three examples of the shuffled training split.
abstracts_dataset_dict['train'][:3]

{'Abstract': ['Radiomics is a newcomer field that has opened new windows for precision medicine It is related to extraction of a large number of quantitative features from medical images which may be difficult to detect visually Underlying tumor biology can change physical properties of tissues which affect patterns of image pixels and radiomics features The main advantage of radiomics is that it can characterize the whole tumor noninvasively even after a single sampling from an image Therefore it can be linked to a digital biopsy Physicians need to know about radiomics features to determine how their values correlate with the appearance of lesions and diseases Indeed physicians need practical references to conceive of basics and concepts of each radiomics feature without knowing their sophisticated mathematical formulas In this review commonly used radiomics features are illustrated with practical examples to help physicians in their routine diagnostic procedures',
  'Lignocellulosic 

In [7]:
MAX_TOKENS = 512  # maximum sequence length used for BioBERT inputs


def tokenize_abstracts(batch):
    """Tokenize a batch of abstracts into fixed-length BioBERT inputs.

    Args:
        batch: A mapping with an 'Abstract' entry holding a list of strings,
            as passed by ``DatasetDict.map(..., batched=True)``.

    Returns:
        The tokenizer output (input_ids, token_type_ids, attention_mask) with
        every sequence truncated or padded to ``MAX_TOKENS`` tokens.
    """
    # padding="max_length" gives every row the same length. padding=True pads
    # each map batch to its own longest sequence, so rows from different map
    # batches (e.g. across num_proc shard boundaries) end up with different
    # lengths and cannot be stacked by a default DataLoader collate.
    # return_tensors is omitted: map() stores plain lists in Arrow anyway.
    # The cost is longer stored sequences; dynamic padding through a collator
    # is the cheaper alternative if the DataLoader is changed accordingly.
    return tokenizer(
        batch["Abstract"],
        truncation=True,
        padding="max_length",
        max_length=MAX_TOKENS,
    )


tokenized_abstracts = abstracts_dataset_dict.map(
    tokenize_abstracts,
    batched=True,  # apply the function to many examples at once, not one by one
    batch_size=16,
    num_proc=4,  # number of worker processes
)
tokenized_abstracts

Map (num_proc=4):   0%|          | 0/41194 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/17655 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Abstract', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 41194
    })
    val: Dataset({
        features: ['Abstract', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17655
    })
})

In [8]:
from torch.utils.data import DataLoader

# Wrap both tokenized splits in loaders that yield 16 examples per batch.
loader_batch_size = 16
train_dataloader = DataLoader(
    tokenized_abstracts["train"],
    batch_size=loader_batch_size,
)
eval_dataloader = DataLoader(
    tokenized_abstracts["val"],
    batch_size=loader_batch_size,
)

In [None]:
from transformers import TrainingArguments, Trainer

# Load pretrained BioBERT model
# NOTE(review): AutoModel loads the bare encoder without a task head, so its
# outputs carry no loss or logits, while Trainer needs a model that returns a
# loss. Confirm the intended objective (e.g. a masked-LM or classification
# head) before training.
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

arguments = TrainingArguments(
    output_dir="abstracts_trainer",  # where to save the logs and checkpoints
    per_device_train_batch_size=16,  # batch size per GPU or CPU
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    evaluation_strategy="epoch",  # "steps" (evaluate every eval_steps) or "epoch" (evaluate at the end of each epoch)
    save_strategy="epoch",  # save the model at the end of each epoch
    # Cap the checkpoints kept on disk: saving every epoch for 20 epochs would
    # otherwise leave 20 full checkpoints. With load_best_model_at_end the
    # best checkpoint is still retained.
    save_total_limit=2,
    learning_rate=2e-5,
    load_best_model_at_end=True,  # reload the best checkpoint (by eval loss unless metric_for_best_model is set)
    seed=224,
    logging_steps=10,
)

# Known issue: running this cell crashed the kernel with
# "The Kernel crashed while executing code in the current cell or a previous cell."
# It happens when the import at the top of this cell is executed.

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class for each
            example is the index of its largest logit along the last axis.

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of predictions
        that equal their label.
    """
    scores, gold = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    is_correct = predicted_classes == gold
    return dict(accuracy=np.mean(is_correct))


In [None]:
# Train and evaluate on the tokenized splits. The raw `abstracts_dataset_dict`
# only holds the 'Abstract' text and '__index_level_0__' columns, neither of
# which is a model input; `tokenized_abstracts` adds input_ids,
# token_type_ids and attention_mask.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=tokenized_abstracts['train'],
    eval_dataset=tokenized_abstracts['val'], # change to test when you do your final evaluation!
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Launch fine-tuning with the Trainer configured above.
trainer.train()