# Interview task

In [53]:
RANDOM_STATE = 20
MODEL_NAME = "microsoft/deberta-large-mnli"

## check GPU

In [54]:
import torch
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')

device: cuda:0


## Check dataset

In [55]:
import pandas as pd

In [56]:
df = pd.read_csv('substance_interactions.csv')

In [57]:
df.head()

Unnamed: 0,PREDICATION_ID,PMID,PREDICATE,INDICATOR_TYPE,PREDICATE_START_INDEX,PREDICATE_END_INDEX,SUBJECT_TEXT,SUBJECT_SEMTYPE,SUBJECT_START_INDEX,SUBJECT_END_INDEX,...,OBJECT_START_INDEX,OBJECT_END_INDEX,OBJECT_SCORE,OBJECT_DIST,OBJECT_MAXDIST,OBJECT_CUI,OBJECT_NOVELTY,TYPE,SENTENCE,LABEL
0,P3100,6499897,INTERACTS_WITH,NOM,1298,1304,SA,orch,1235,1237,...,1329,1332,1000,2,2,C0004057,1,ab,"Nor did administration of SA, diflunisal or AS...",n
1,P3101,8369307,INHIBITS,VERB,890,899,rHF,aapp,785,788,...,912,919,888,1,15,C0242417,1,ab,A comparative study of recombinant L-cha...,n
2,P3102,3711333,INHIBITS,VERB,1527,1534,alkaloids,orch,1508,1517,...,1541,1550,1000,1,1,C0003805,1,ab,These findings suggest that some nicotinic alk...,y
3,P3103,11742534,INTERACTS_WITH,NOM,746,753,amino acids,aapp,703,714,...,741,745,694,0,4,C0169658|3716,1,ab,With a truncated chimaeric IL-5Rbeta-gp1...,y
4,P3104,244385,STIMULATES,ADJ,410,419,Neutral endopeptidase,aapp,374,401,...,480,491,1000,3,5,C0039815,1,ab,"Neutral endopeptidase, a zinc-dependent ...",n


In [58]:
df.columns

Index(['PREDICATION_ID', 'PMID', 'PREDICATE', 'INDICATOR_TYPE',
       'PREDICATE_START_INDEX', 'PREDICATE_END_INDEX', 'SUBJECT_TEXT',
       'SUBJECT_SEMTYPE', 'SUBJECT_START_INDEX', 'SUBJECT_END_INDEX',
       'SUBJECT_SCORE', 'SUBJECT_DIST', 'SUBJECT_MAXDIST', 'SUBJECT_CUI',
       'SUBJECT_NOVELTY', 'OBJECT_TEXT', 'OBJECT_SEMTYPE',
       'OBJECT_START_INDEX', 'OBJECT_END_INDEX', 'OBJECT_SCORE', 'OBJECT_DIST',
       'OBJECT_MAXDIST', 'OBJECT_CUI', 'OBJECT_NOVELTY', 'TYPE', 'SENTENCE',
       'LABEL'],
      dtype='object')

In [59]:
def pre_processing(example):
    sentence = example['SENTENCE']
    subject = example['SUBJECT_TEXT']
    object = example['OBJECT_TEXT']
    relation = example['PREDICATE']
    # text = f"{subject} [SEP] {relation} [SEP] {object} [SEP] {sentence}"
    text = f"{sentence} [SEP] {subject} , {relation} , {object}"
    return text

df['triple_with_sentence'] = df.apply(pre_processing,axis=1)
df.head()

Unnamed: 0,PREDICATION_ID,PMID,PREDICATE,INDICATOR_TYPE,PREDICATE_START_INDEX,PREDICATE_END_INDEX,SUBJECT_TEXT,SUBJECT_SEMTYPE,SUBJECT_START_INDEX,SUBJECT_END_INDEX,...,OBJECT_END_INDEX,OBJECT_SCORE,OBJECT_DIST,OBJECT_MAXDIST,OBJECT_CUI,OBJECT_NOVELTY,TYPE,SENTENCE,LABEL,triple_with_sentence
0,P3100,6499897,INTERACTS_WITH,NOM,1298,1304,SA,orch,1235,1237,...,1332,1000,2,2,C0004057,1,ab,"Nor did administration of SA, diflunisal or AS...",n,"Nor did administration of SA, diflunisal or AS..."
1,P3101,8369307,INHIBITS,VERB,890,899,rHF,aapp,785,788,...,919,888,1,15,C0242417,1,ab,A comparative study of recombinant L-cha...,n,A comparative study of recombinant L-cha...
2,P3102,3711333,INHIBITS,VERB,1527,1534,alkaloids,orch,1508,1517,...,1550,1000,1,1,C0003805,1,ab,These findings suggest that some nicotinic alk...,y,These findings suggest that some nicotinic alk...
3,P3103,11742534,INTERACTS_WITH,NOM,746,753,amino acids,aapp,703,714,...,745,694,0,4,C0169658|3716,1,ab,With a truncated chimaeric IL-5Rbeta-gp1...,y,With a truncated chimaeric IL-5Rbeta-gp1...
4,P3104,244385,STIMULATES,ADJ,410,419,Neutral endopeptidase,aapp,374,401,...,491,1000,3,5,C0039815,1,ab,"Neutral endopeptidase, a zinc-dependent ...",n,"Neutral endopeptidase, a zinc-dependent ..."


## Tokenizer

In [60]:

from transformers import AutoTokenizer

# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


Loading tokenizer...


#### test tokenizer for a triple with corresponding sentence

In [61]:
example = df['triple_with_sentence'].iloc[0]
example

'Nor did administration of SA, diflunisal or ASA itself impair the       anti-aggregatory effect of a fresh test dose of ASA. [SEP] SA , INTERACTS_WITH , ASA'

In [62]:
tokenizer(example, return_tensors='pt', truncation=True)

{'input_ids': tensor([[    1, 29723,   222,   942,     9,  5531,     6,   385,  1594,   462,
           879, 26860,    50, 36356,  1495, 29210,     5,  1437,  1437,  1437,
          1437,  1437,  1437,  1475,    12,  7165,  4950,  5257,  1683,     9,
            10,  2310,  1296, 12234,     9, 36356,     4,  1437,     2,  5531,
          2156, 20281,  2562,  2685,  1215,   771, 27698,  2156, 36356,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}

In [63]:
def processing(example):
    res = tokenizer(example['triple_with_sentence'])
    # res['label'] = example['LABEL']
    res['label'] = 1 if example['LABEL']=='y' else 0
    return res
df['data'] = df.apply(processing, axis=1)

In [64]:
df['data'][0]

{'input_ids': [1, 29723, 222, 942, 9, 5531, 6, 385, 1594, 462, 879, 26860, 50, 36356, 1495, 29210, 5, 1437, 1437, 1437, 1437, 1437, 1437, 1475, 12, 7165, 4950, 5257, 1683, 9, 10, 2310, 1296, 12234, 9, 36356, 4, 1437, 2, 5531, 2156, 20281, 2562, 2685, 1215, 771, 27698, 2156, 36356, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0}

In [65]:
df.columns

Index(['PREDICATION_ID', 'PMID', 'PREDICATE', 'INDICATOR_TYPE',
       'PREDICATE_START_INDEX', 'PREDICATE_END_INDEX', 'SUBJECT_TEXT',
       'SUBJECT_SEMTYPE', 'SUBJECT_START_INDEX', 'SUBJECT_END_INDEX',
       'SUBJECT_SCORE', 'SUBJECT_DIST', 'SUBJECT_MAXDIST', 'SUBJECT_CUI',
       'SUBJECT_NOVELTY', 'OBJECT_TEXT', 'OBJECT_SEMTYPE',
       'OBJECT_START_INDEX', 'OBJECT_END_INDEX', 'OBJECT_SCORE', 'OBJECT_DIST',
       'OBJECT_MAXDIST', 'OBJECT_CUI', 'OBJECT_NOVELTY', 'TYPE', 'SENTENCE',
       'LABEL', 'triple_with_sentence', 'data'],
      dtype='object')

## split the data, training set 70%, validation set 15%, test set 15%

In [66]:
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(df, test_size=0.3, random_state=RANDOM_STATE)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=RANDOM_STATE)

In [67]:
print(len(train_data),len(val_data),len(test_data))

2100 450 450


In [68]:
train_data = train_data.reset_index()
val_data = val_data.reset_index()
test_data = test_data.reset_index()

In [69]:
from transformers import DataCollatorWithPadding
# import evaluate

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# accuracy = evaluate.load('accuracy')

In [70]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


def compute_metrics(eval_pred):
	    predictions, labels = eval_pred
	    predictions = np.argmax(predictions, axis=1)
	    
	    # Calculate precision, recall, and F1 score
	    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
	    
	    return {
	        'accuracy': accuracy_score(labels, predictions),
	        'precision': precision,
	        'recall': recall,
	        'f1': f1
	    }

## BERT model

In [71]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

labels = ['n', 'y']
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in id2label.items()}

print('id2label:', id2label)
print('label2id:', label2id)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

# Only train last classifier layer
# for param in model.base_model.parameters():
#     param.requires_grad = False
# resize model embedding to match new tokenizer
# model.resize_token_embeddings(len(tokenizer))

# # fix model padding token id
# model.config.pad_token_id = model.config.eos_token_id

id2label: {0: 'n', 1: 'y'}
label2id: {'n': 0, 'y': 1}


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large-mnli and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size(

In [72]:
model

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
   

In [73]:
# Freeze all layers except the last one
for param in model.base_model.parameters():
    param.requires_grad = False

# for param in model.bert.pooler.dense.parameters():
#     param.requires_grad = True

# # Unfreeze the last three layers
# for param in model.transformer.ln_f.parameters():
#     param.requires_grad = True

for param in model.parameters():
    print(param.requires_grad)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

##  Training

In [74]:
training_args = TrainingArguments(
    output_dir='my_best_model',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

In [75]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data['data'],
    eval_dataset=val_data['data'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.617111,0.671111,0.649351,0.833333,0.729927
2,No log,0.59813,0.684444,0.670139,0.804167,0.731061
3,No log,0.593626,0.677778,0.657807,0.825,0.731978
4,No log,0.593252,0.682222,0.656958,0.845833,0.739526
5,No log,0.606164,0.68,0.642857,0.9,0.75
6,No log,0.592966,0.691111,0.660317,0.866667,0.74955
7,No log,0.583731,0.677778,0.655738,0.833333,0.733945
8,0.611200,0.579852,0.68,0.661074,0.820833,0.732342
9,0.611200,0.585483,0.688889,0.660256,0.858333,0.746377
10,0.611200,0.584271,0.686667,0.660194,0.85,0.743169


Checkpoint destination directory my_best_model/checkpoint-66 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_best_model/checkpoint-132 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_best_model/checkpoint-198 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_best_model/checkpoint-264 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_best_model/checkpoint-330 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_best_model/checkpoint-396 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_best_model/checkpoint-462 already exists and is non-empty. Saving will 

TrainOutput(global_step=660, training_loss=0.6058517456054687, metrics={'train_runtime': 334.8742, 'train_samples_per_second': 62.71, 'train_steps_per_second': 1.971, 'total_flos': 7526117193318096.0, 'train_loss': 0.6058517456054687, 'epoch': 10.0})

In [76]:
trainer.evaluate(train_data['data'])

{'eval_loss': 0.5725005865097046,
 'eval_accuracy': 0.6976190476190476,
 'eval_precision': 0.6802030456852792,
 'eval_recall': 0.8286219081272085,
 'eval_f1': 0.7471127041019514,
 'eval_runtime': 22.8023,
 'eval_samples_per_second': 92.096,
 'eval_steps_per_second': 2.894,
 'epoch': 10.0}

In [77]:
trainer.evaluate(val_data['data'])

{'eval_loss': 0.5798524022102356,
 'eval_accuracy': 0.68,
 'eval_precision': 0.6610738255033557,
 'eval_recall': 0.8208333333333333,
 'eval_f1': 0.7323420074349443,
 'eval_runtime': 4.3727,
 'eval_samples_per_second': 102.911,
 'eval_steps_per_second': 3.43,
 'epoch': 10.0}

In [78]:
trainer.evaluate(test_data['data'])

{'eval_loss': 0.5823975801467896,
 'eval_accuracy': 0.6755555555555556,
 'eval_precision': 0.6505190311418685,
 'eval_recall': 0.8068669527896996,
 'eval_f1': 0.7203065134099617,
 'eval_runtime': 4.1466,
 'eval_samples_per_second': 108.522,
 'eval_steps_per_second': 3.617,
 'epoch': 10.0}