In [1]:
from collections import defaultdict, Counter
import json
from matplotlib import pyplot as plt
import numpy as np
import torch
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from sklearn.metrics.pairwise import cosine_similarity
import os
from datasets import DatasetDict, load_dataset, Dataset
import sys
import platform
import sklearn as sk

has_gpu = torch.cuda.is_available()
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"

print(f"Python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print("NVIDIA/CUDA GPU is", "available" if has_gpu else "NOT AVAILABLE")
print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
print(f"Target device is {device}")


path_to_data_folder="/Users/matteom/shared-folder/nlpt_group/project/data"

Python Platform: macOS-14.1.1-arm64-arm-64bit
PyTorch Version: 2.1.2

Python 3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 16:35:41) 
[Clang 16.0.6 ]
Pandas 2.1.4
Scikit-Learn 1.3.2
NVIDIA/CUDA GPU is NOT AVAILABLE
MPS (Apple Metal) is AVAILABLE
Target device is mps


In [2]:
# Read in abstracts
df_part1 = pd.read_csv(os.path.join(path_to_data_folder,"processed_data_part1.csv"))
df_part2 = pd.read_csv(os.path.join(path_to_data_folder,"processed_data_part2.csv"))
df=pd.concat([df_part1,df_part2])
abstracts = df[['Abstract']] #.tolist()
print(abstracts[:3])

                                            Abstract
0  SUMMARY Several lines of evidence support the ...
1  Acute inflammation is a severe medical conditi...
2  Human brain connectivity can be studied using ...


In [19]:
abstracts = abstracts.dropna()
abstracts_dataset = Dataset.from_pandas(abstracts,preserve_index=False)
abstracts_dataset.set_format("torch")
abstracts_dataset

Dataset({
    features: ['Abstract'],
    num_rows: 58849
})

In [20]:
# Load pretrained tokenizer for BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

In [21]:
split=int(np.floor(0.7*len(abstracts)))

# Take 70% of abstracts for train and 30% for validation
abstracts_dataset_dict = DatasetDict(
    train=abstracts_dataset.shuffle(seed=19).select(range(split)),
    val=abstracts_dataset.shuffle(seed=19).select(range(split, len(abstracts))),
)
abstracts_dataset_dict.column_names

{'train': ['Abstract'], 'val': ['Abstract']}

In [22]:
abstracts_dataset_dict['train'][:3]

{'Abstract': ['Radiomics is a newcomer field that has opened new windows for precision medicine It is related to extraction of a large number of quantitative features from medical images which may be difficult to detect visually Underlying tumor biology can change physical properties of tissues which affect patterns of image pixels and radiomics features The main advantage of radiomics is that it can characterize the whole tumor noninvasively even after a single sampling from an image Therefore it can be linked to a digital biopsy Physicians need to know about radiomics features to determine how their values correlate with the appearance of lesions and diseases Indeed physicians need practical references to conceive of basics and concepts of each radiomics feature without knowing their sophisticated mathematical formulas In this review commonly used radiomics features are illustrated with practical examples to help physicians in their routine diagnostic procedures',
  'Lignocellulosic 

In [23]:
tokenized_abstracts = abstracts_dataset_dict.map(
    lambda example: tokenizer(example['Abstract'], return_tensors="pt", truncation=True, padding=True, max_length=512),
    batched=True,  #so the function is applied to multiple elements of our dataset at once, and not on each element separately,
    batch_size=16,
    num_proc=4# number of workers
)
tokenized_abstracts

Map (num_proc=4):   0%|          | 0/41194 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/17655 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Abstract', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 41194
    })
    val: Dataset({
        features: ['Abstract', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17655
    })
})

In [24]:
#train_dataloader = DataLoader(tokenized_abstracts['train'], batch_size=16)
#eval_dataloader = DataLoader(tokenized_abstracts['val'], batch_size=16)

In [29]:
# Load pretrained BioBert model
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

In [26]:
def compute_metrics(eval_pred):
    """Called at the end of validation. Gives accuracy"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # calculates the accuracy
    return {"accuracy": np.mean(predictions == labels)}

In [30]:
arguments = TrainingArguments(
    output_dir="abstracts_trainer", #where to save the logs and checkpoints
    per_device_train_batch_size=8,# batch size per GPU or CPU
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    evaluation_strategy="epoch",# steps" (evaluate every eval_steps) or "epoch" (evaluate at the end of each epoch)
    save_strategy="epoch", # save the model at the end of each epoch
    learning_rate=2e-5,
    load_best_model_at_end=True, # the best model based on the metric
    seed=224,
    logging_steps=10
)

In [36]:
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=tokenized_abstracts['train'],
    eval_dataset=tokenized_abstracts['val'], # change to test when you do your final evaluation!
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

In [37]:
trainer.train()

  0%|          | 0/103000 [00:00<?, ?it/s]

AttributeError: 'list' object has no attribute 'items'