# Interview task

In [1]:
RANDOM_STATE = 20
MODEL_NAME = "google-t5/t5-small"

## check GPU

In [2]:
import torch
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')

device: cuda:0


## Check dataset

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('substance_interactions.csv')

In [5]:
df.head()

Unnamed: 0,PREDICATION_ID,PMID,PREDICATE,INDICATOR_TYPE,PREDICATE_START_INDEX,PREDICATE_END_INDEX,SUBJECT_TEXT,SUBJECT_SEMTYPE,SUBJECT_START_INDEX,SUBJECT_END_INDEX,...,OBJECT_START_INDEX,OBJECT_END_INDEX,OBJECT_SCORE,OBJECT_DIST,OBJECT_MAXDIST,OBJECT_CUI,OBJECT_NOVELTY,TYPE,SENTENCE,LABEL
0,P3100,6499897,INTERACTS_WITH,NOM,1298,1304,SA,orch,1235,1237,...,1329,1332,1000,2,2,C0004057,1,ab,"Nor did administration of SA, diflunisal or AS...",n
1,P3101,8369307,INHIBITS,VERB,890,899,rHF,aapp,785,788,...,912,919,888,1,15,C0242417,1,ab,A comparative study of recombinant L-cha...,n
2,P3102,3711333,INHIBITS,VERB,1527,1534,alkaloids,orch,1508,1517,...,1541,1550,1000,1,1,C0003805,1,ab,These findings suggest that some nicotinic alk...,y
3,P3103,11742534,INTERACTS_WITH,NOM,746,753,amino acids,aapp,703,714,...,741,745,694,0,4,C0169658|3716,1,ab,With a truncated chimaeric IL-5Rbeta-gp1...,y
4,P3104,244385,STIMULATES,ADJ,410,419,Neutral endopeptidase,aapp,374,401,...,480,491,1000,3,5,C0039815,1,ab,"Neutral endopeptidase, a zinc-dependent ...",n


In [6]:
df.columns

Index(['PREDICATION_ID', 'PMID', 'PREDICATE', 'INDICATOR_TYPE',
       'PREDICATE_START_INDEX', 'PREDICATE_END_INDEX', 'SUBJECT_TEXT',
       'SUBJECT_SEMTYPE', 'SUBJECT_START_INDEX', 'SUBJECT_END_INDEX',
       'SUBJECT_SCORE', 'SUBJECT_DIST', 'SUBJECT_MAXDIST', 'SUBJECT_CUI',
       'SUBJECT_NOVELTY', 'OBJECT_TEXT', 'OBJECT_SEMTYPE',
       'OBJECT_START_INDEX', 'OBJECT_END_INDEX', 'OBJECT_SCORE', 'OBJECT_DIST',
       'OBJECT_MAXDIST', 'OBJECT_CUI', 'OBJECT_NOVELTY', 'TYPE', 'SENTENCE',
       'LABEL'],
      dtype='object')

In [7]:
def pre_processing(example):
    sentence = example['SENTENCE']
    subject = example['SUBJECT_TEXT']
    object = example['OBJECT_TEXT']
    relation = example['PREDICATE']
    # text = f"{subject} [SEP] {relation} [SEP] {object} [SEP] {sentence}"
    text = f"{sentence} [SEP] {subject} , {relation} , {object}"
    return text

df['triple_with_sentence'] = df.apply(pre_processing,axis=1)
df.head()

Unnamed: 0,PREDICATION_ID,PMID,PREDICATE,INDICATOR_TYPE,PREDICATE_START_INDEX,PREDICATE_END_INDEX,SUBJECT_TEXT,SUBJECT_SEMTYPE,SUBJECT_START_INDEX,SUBJECT_END_INDEX,...,OBJECT_END_INDEX,OBJECT_SCORE,OBJECT_DIST,OBJECT_MAXDIST,OBJECT_CUI,OBJECT_NOVELTY,TYPE,SENTENCE,LABEL,triple_with_sentence
0,P3100,6499897,INTERACTS_WITH,NOM,1298,1304,SA,orch,1235,1237,...,1332,1000,2,2,C0004057,1,ab,"Nor did administration of SA, diflunisal or AS...",n,"Nor did administration of SA, diflunisal or AS..."
1,P3101,8369307,INHIBITS,VERB,890,899,rHF,aapp,785,788,...,919,888,1,15,C0242417,1,ab,A comparative study of recombinant L-cha...,n,A comparative study of recombinant L-cha...
2,P3102,3711333,INHIBITS,VERB,1527,1534,alkaloids,orch,1508,1517,...,1550,1000,1,1,C0003805,1,ab,These findings suggest that some nicotinic alk...,y,These findings suggest that some nicotinic alk...
3,P3103,11742534,INTERACTS_WITH,NOM,746,753,amino acids,aapp,703,714,...,745,694,0,4,C0169658|3716,1,ab,With a truncated chimaeric IL-5Rbeta-gp1...,y,With a truncated chimaeric IL-5Rbeta-gp1...
4,P3104,244385,STIMULATES,ADJ,410,419,Neutral endopeptidase,aapp,374,401,...,491,1000,3,5,C0039815,1,ab,"Neutral endopeptidase, a zinc-dependent ...",n,"Neutral endopeptidase, a zinc-dependent ..."


## Tokenizer

In [8]:

from transformers import AutoTokenizer

# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

  from .autonotebook import tqdm as notebook_tqdm


Loading tokenizer...


#### test tokenizer for a triple with corresponding sentence

In [9]:
example = df['triple_with_sentence'].iloc[0]
example

'Nor did administration of SA, diflunisal or ASA itself impair the       anti-aggregatory effect of a fresh test dose of ASA. [SEP] SA , INTERACTS_WITH , ASA'

In [10]:
tokenizer(example, return_tensors='pt', truncation=True)

{'input_ids': tensor([[ 7005,   410,  3602,    13,  4646,     6,  1227,  6947,    29,   159,
           138,    42,     3, 21245,  1402,  4840,  2256,     8,  1181,    18,
         31761,   127,    63,  1504,    13,     3,     9,  1434,   794,  6742,
            13,     3, 21245,     5,   784,   134,  8569,   908,  4646,     3,
             6,     3, 21342, 22034,  4578,   834, 16785,  4611,     3,     6,
             3, 21245,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}

In [11]:
def processing(example):
    res = tokenizer(example['triple_with_sentence'])
    # res['label'] = example['LABEL']
    res['label'] = 1 if example['LABEL']=='y' else 0
    return res
df['data'] = df.apply(processing, axis=1)

In [12]:
df['data'][0]

{'input_ids': [7005, 410, 3602, 13, 4646, 6, 1227, 6947, 29, 159, 138, 42, 3, 21245, 1402, 4840, 2256, 8, 1181, 18, 31761, 127, 63, 1504, 13, 3, 9, 1434, 794, 6742, 13, 3, 21245, 5, 784, 134, 8569, 908, 4646, 3, 6, 3, 21342, 22034, 4578, 834, 16785, 4611, 3, 6, 3, 21245, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0}

In [13]:
df.columns

Index(['PREDICATION_ID', 'PMID', 'PREDICATE', 'INDICATOR_TYPE',
       'PREDICATE_START_INDEX', 'PREDICATE_END_INDEX', 'SUBJECT_TEXT',
       'SUBJECT_SEMTYPE', 'SUBJECT_START_INDEX', 'SUBJECT_END_INDEX',
       'SUBJECT_SCORE', 'SUBJECT_DIST', 'SUBJECT_MAXDIST', 'SUBJECT_CUI',
       'SUBJECT_NOVELTY', 'OBJECT_TEXT', 'OBJECT_SEMTYPE',
       'OBJECT_START_INDEX', 'OBJECT_END_INDEX', 'OBJECT_SCORE', 'OBJECT_DIST',
       'OBJECT_MAXDIST', 'OBJECT_CUI', 'OBJECT_NOVELTY', 'TYPE', 'SENTENCE',
       'LABEL', 'triple_with_sentence', 'data'],
      dtype='object')

## split the data, training set 70%, validation set 15%, test set 15%

In [14]:
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(df, test_size=0.3, random_state=RANDOM_STATE)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=RANDOM_STATE)

In [15]:
print(len(train_data),len(val_data),len(test_data))

2100 450 450


In [16]:
train_data = train_data.reset_index()
val_data = val_data.reset_index()
test_data = test_data.reset_index()

In [17]:
from transformers import DataCollatorWithPadding
# import evaluate

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# accuracy = evaluate.load('accuracy')

In [18]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


def compute_metrics(eval_pred):
	    predictions, labels = eval_pred
	    predictions = np.argmax(predictions, axis=1)
	    
	    # Calculate precision, recall, and F1 score
	    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
	    
	    return {
	        'accuracy': accuracy_score(labels, predictions),
	        'precision': precision,
	        'recall': recall,
	        'f1': f1
	    }

## BERT model

In [19]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import CTRLForSequenceClassification

labels = ['n', 'y']
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in id2label.items()}

print('id2label:', id2label)
print('label2id:', label2id)

model = CTRLForSequenceClassification.from_pretrained(MODEL_NAME, ignore_mismatched_sizes=True, num_labels=len(labels), id2label=id2label, label2id=label2id)

# Only train last classifier layer
# for param in model.base_model.parameters():
#     param.requires_grad = False
# resize model embedding to match new tokenizer
# model.resize_token_embeddings(len(tokenizer))

# # fix model padding token id
# model.config.pad_token_id = model.config.eos_token_id

You are using a model of type t5 to instantiate a model of type ctrl. This is not supported for all configurations of models and can yield errors.


id2label: {0: 'n', 1: 'y'}
label2id: {'n': 0, 'y': 1}


Some weights of CTRLForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classifier.weight', 'h.0.ffn.0.bias', 'h.0.ffn.0.weight', 'h.0.ffn.2.bias', 'h.0.ffn.2.weight', 'h.0.layernorm1.bias', 'h.0.layernorm1.weight', 'h.0.layernorm2.bias', 'h.0.layernorm2.weight', 'h.0.multi_head_attention.Wk.bias', 'h.0.multi_head_attention.Wk.weight', 'h.0.multi_head_attention.Wq.bias', 'h.0.multi_head_attention.Wq.weight', 'h.0.multi_head_attention.Wv.bias', 'h.0.multi_head_attention.Wv.weight', 'h.0.multi_head_attention.dense.bias', 'h.0.multi_head_attention.dense.weight', 'h.1.ffn.0.bias', 'h.1.ffn.0.weight', 'h.1.ffn.2.bias', 'h.1.ffn.2.weight', 'h.1.layernorm1.bias', 'h.1.layernorm1.weight', 'h.1.layernorm2.bias', 'h.1.layernorm2.weight', 'h.1.multi_head_attention.Wk.bias', 'h.1.multi_head_attention.Wk.weight', 'h.1.multi_head_attention.Wq.bias', 'h.1.multi_head_attention.Wq.weight', 'h.1.multi_head_attention.Wv.bias', 'h.1.m

In [20]:
model

CTRLForSequenceClassification(
  (transformer): CTRLModel(
    (w): Embedding(32128, 1280)
    (dropout): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x EncoderLayer(
        (multi_head_attention): MultiHeadAttention(
          (Wq): Linear(in_features=1280, out_features=1280, bias=True)
          (Wk): Linear(in_features=1280, out_features=1280, bias=True)
          (Wv): Linear(in_features=1280, out_features=1280, bias=True)
          (dense): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (ffn): Sequential(
          (0): Linear(in_features=1280, out_features=8192, bias=True)
          (1): ReLU()
          (2): Linear(in_features=8192, out_features=1280, bias=True)
        )
        (layernorm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (layernorm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )


In [21]:
# Freeze all layers except the last one
for param in model.base_model.parameters():
    param.requires_grad = False

# for param in model.bert.pooler.dense.parameters():
#     param.requires_grad = True

# # Unfreeze the last three layers
# for param in model.transformer.ln_f.parameters():
#     param.requires_grad = True

for param in model.parameters():
    print(param.requires_grad)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

In [22]:
sum(p.numel() for p in model.parameters())

1363280896

In [23]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

2560

##  Training

In [24]:
training_args = TrainingArguments(
    output_dir='my_best_model',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data['data'],
    eval_dataset=val_data['data'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.706951,0.533333,0.537313,0.9,0.672897
2,No log,0.698585,0.544444,0.544529,0.891667,0.676145
3,No log,0.694433,0.546667,0.546154,0.8875,0.67619
4,No log,0.692667,0.548889,0.547315,0.891667,0.678288
5,No log,0.691918,0.542222,0.54359,0.883333,0.673016
6,No log,0.687324,0.542222,0.545455,0.85,0.664495
7,No log,0.69044,0.54,0.542636,0.875,0.669856
8,0.722800,0.685457,0.548889,0.550409,0.841667,0.665568
9,0.722800,0.685279,0.548889,0.550409,0.841667,0.665568
10,0.722800,0.685112,0.548889,0.550409,0.841667,0.665568


Could not locate the best model at my_best_model/checkpoint-660/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=660, training_loss=0.7180439226555102, metrics={'train_runtime': 1108.113, 'train_samples_per_second': 18.951, 'train_steps_per_second': 0.596, 'total_flos': 2.680753818189005e+16, 'train_loss': 0.7180439226555102, 'epoch': 10.0})

In [26]:
trainer.evaluate(train_data['data'])

{'eval_loss': 0.6789799332618713,
 'eval_accuracy': 0.5652380952380952,
 'eval_precision': 0.5636257989540965,
 'eval_recall': 0.8568904593639576,
 'eval_f1': 0.6799859796705222,
 'eval_runtime': 60.4345,
 'eval_samples_per_second': 34.748,
 'eval_steps_per_second': 1.092,
 'epoch': 10.0}

In [27]:
trainer.evaluate(val_data['data'])

{'eval_loss': 0.6851117610931396,
 'eval_accuracy': 0.5488888888888889,
 'eval_precision': 0.5504087193460491,
 'eval_recall': 0.8416666666666667,
 'eval_f1': 0.6655683690280065,
 'eval_runtime': 11.9094,
 'eval_samples_per_second': 37.785,
 'eval_steps_per_second': 1.26,
 'epoch': 10.0}

In [28]:
trainer.evaluate(test_data['data'])

{'eval_loss': 0.6999768018722534,
 'eval_accuracy': 0.5311111111111111,
 'eval_precision': 0.5295698924731183,
 'eval_recall': 0.8454935622317596,
 'eval_f1': 0.6512396694214876,
 'eval_runtime': 11.0532,
 'eval_samples_per_second': 40.712,
 'eval_steps_per_second': 1.357,
 'epoch': 10.0}

In [29]:
!rm -r my_best_model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
