In [1]:
import pandas as pd
import datasets
from datasets import Dataset, load_dataset
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast,TrainingArguments, Trainer
import torch
from tqdm import tqdm
import pickle
import numpy as np
import utils
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `utils` module) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

# Seed the random number generators for reproducibility.
# NOTE(review): which libraries get seeded, and with what value, is decided
# inside utils.seed_everything, which is not visible here — confirm.
utils.seed_everything()

[nltk_data] Downloading package punkt to /home/jbdlb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic).
# NOTE(review): presumably used to inspect the training logs written under
# ../logs by the fine-tuning cell — confirm.
%load_ext tensorboard

In [3]:
# Read the three CSV splits of the Journal-Finder data into one DatasetDict
# keyed by split name ("train", "valid", "test").
dataset = load_dataset(
    "csv",
    data_files=dict(
        train="../Journal-Finder/train.csv",
        valid="../Journal-Finder/validation.csv",
        test="../Journal-Finder/test.csv",
    ),
)

# NOTE(review): the original comment here read "The validation set is not
# given", yet validation.csv is loaded as the "valid" split above — confirm
# what was meant (e.g. whether the validation file was created locally).

Using custom data configuration default-5b03ef3722775437
Reusing dataset csv (/home/jbdlb/.cache/huggingface/datasets/csv/default-5b03ef3722775437/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


In [4]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU and say so.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    print('No GPU available, using the CPU instead.')

In [5]:
# Build the model input as "<article_title> : <article_abstract>" for every
# example, then drop the two source columns that are no longer needed.
dataset = dataset.map(
    lambda example: {"input": " : ".join((example["title"], example["abstract"]))},
    remove_columns=["abstract", "title"],
)

Loading cached processed dataset at /home/jbdlb/.cache/huggingface/datasets/csv/default-5b03ef3722775437/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-7c0ae1447394e741.arrow
Loading cached processed dataset at /home/jbdlb/.cache/huggingface/datasets/csv/default-5b03ef3722775437/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-704263e8686b6139.arrow
Loading cached processed dataset at /home/jbdlb/.cache/huggingface/datasets/csv/default-5b03ef3722775437/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-6d5c6f797e87293d.arrow


In [6]:
# Map each class label to its journal_id, using the training split.
# The pairs are read straight from the rows, so the mapping is deterministic:
# no random sampling and no dependence on pandas groupby.apply semantics.
# NOTE(review): assumes each label corresponds to exactly one journal_id; if a
# label ever appears with several journal_ids, the last row seen wins — confirm.
labels_jid_map = dict(zip(dataset["train"]["labels"], dataset["train"]["journal_id"]))

# One output class per distinct label present in the training data.
num_labels = len(labels_jid_map)


In [7]:
# Load the journal_id -> journal_name lookup table from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this against a mapping file from a trusted source.
with open('../Journal-Finder/mapping_id_journal_name.pickle', mode='rb') as mapping_file:
    jid_jname_map = pickle.load(mapping_file)

In [8]:
# Build the ScilitBERT tokenizer from its local vocabulary and merges files.
# `model_max_length` is the documented parameter for the maximum input length;
# the previous `max_len` spelling is only a legacy alias for it.
scilitbert_tokenizer = RobertaTokenizerFast(
    vocab_file="../ScilitBERT/ScilitBERT_tokenizer/scilitBERT_tok-vocab.json",
    merges_file="../ScilitBERT/ScilitBERT_tokenizer/scilitBERT_tok-merges.txt",
    model_max_length=512,
)


In [9]:
# Instantiate a sequence classifier from the local ScilitBERT checkpoint with
# one output per label. As the load log shows, the checkpoint's lm_head weights
# are not used and the classifier head is newly initialised.
ScilitBERT = RobertaForSequenceClassification.from_pretrained(
    "../ScilitBERT/ScilitBERT_cased",
    num_labels=num_labels,
)

Some weights of the model checkpoint at ../ScilitBERT/ScilitBERT_cased were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../ScilitBERT/ScilitBERT_cased and are newly initialized: ['classifier.dense.weight', 'classifie

In [10]:
# Launch fine-tuning through the project helper and keep the returned trainer
# and predictions.
# NOTE(review): utils.fine_tune is defined in the local utils module and is not
# visible here; the meaning of the positional run name and of the returned
# `preds` should be confirmed there.
trainer, preds = utils.fine_tune(
    scilitbert_tokenizer,
    ScilitBERT,
    dataset,
    "fine_tuning_demo_journal_finder",
    output_dir="../results/fine_tuning_demo_journal_finder/",
    log_dir="../logs/fine_tuning_demo_journal_finder",
    num_labels=num_labels,
)

Loading cached processed dataset at /home/jbdlb/.cache/huggingface/datasets/csv/default-5b03ef3722775437/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-d00f590d94f54987.arrow
Loading cached processed dataset at /home/jbdlb/.cache/huggingface/datasets/csv/default-5b03ef3722775437/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-af1c3e7aa2c9da9a.arrow
Loading cached processed dataset at /home/jbdlb/.cache/huggingface/datasets/csv/default-5b03ef3722775437/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-194cbcf3655cfbca.arrow
  0%|          | 100/73340 [01:34<19:14:50,  1.06it/s]

{'loss': 4.5619, 'learning_rate': 2.9979533360622187e-05, 'epoch': 0.0}


  0%|          | 200/73340 [03:09<19:13:27,  1.06it/s]

{'loss': 3.6587, 'learning_rate': 2.993860008186656e-05, 'epoch': 0.0}


  0%|          | 300/73340 [04:44<19:16:44,  1.05it/s]

{'loss': 3.384, 'learning_rate': 2.989766680311093e-05, 'epoch': 0.0}


  1%|          | 400/73340 [06:18<19:02:38,  1.06it/s]

{'loss': 3.2515, 'learning_rate': 2.9856733524355304e-05, 'epoch': 0.01}


  1%|          | 500/73340 [07:52<18:59:13,  1.07it/s]

{'loss': 3.0769, 'learning_rate': 2.9815800245599673e-05, 'epoch': 0.01}


  1%|          | 600/73340 [09:27<18:58:32,  1.06it/s]

{'loss': 3.0276, 'learning_rate': 2.9774866966844045e-05, 'epoch': 0.01}


  1%|          | 700/73340 [11:02<18:57:10,  1.06it/s]

{'loss': 2.9505, 'learning_rate': 2.9733933688088418e-05, 'epoch': 0.01}


  1%|          | 800/73340 [12:36<18:54:41,  1.07it/s]

{'loss': 2.7945, 'learning_rate': 2.969300040933279e-05, 'epoch': 0.01}


  1%|          | 900/73340 [14:10<18:58:30,  1.06it/s]

{'loss': 2.6154, 'learning_rate': 2.965206713057716e-05, 'epoch': 0.01}


  1%|▏         | 939/73340 [14:47<18:55:44,  1.06it/s]