In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Init

In [2]:
import tensorflow as tf

In [3]:
from IPython.display import clear_output
!pip install transformers
clear_output()
import transformers
transformers.logging.set_verbosity_error()

In [4]:
from IPython.display import clear_output
!pip install ir_datasets
clear_output()

# Process Dataset

## Training

In [5]:
import ir_datasets
import pandas as pd
topic = ir_datasets.load("nfcorpus/train/nontopic")
doc = pd.DataFrame(topic.docs_iter()).set_index("doc_id").drop(["url", "title"], axis=1)
t_query = pd.DataFrame(topic.queries_iter()).set_index('query_id')
t_qrels = pd.DataFrame(topic.qrels_iter()).drop(["iteration"], axis=1)
video = ir_datasets.load("nfcorpus/train/video")
v_query = pd.DataFrame(video.queries_iter()).set_index('query_id')
v_qrels = pd.DataFrame(video.qrels_iter()).drop(["iteration"], axis=1)
train = {"topic":t_query, "video":v_query, "v_qrels":v_qrels, "t_qrels":t_qrels}

[INFO] If you have a local copy of https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/49c061fbadc52ba4d35d0e42e2d742fd
[INFO] [starting] https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz
[INFO] [finished] https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz: [00:00] [31.0MB] [82.2MB/s]


In [6]:
doc.head(2)

Unnamed: 0_level_0,abstract
doc_id,Unnamed: 1_level_1
MED-1,Abstract Background: Acrylamide is a common di...
MED-2,Abstract Human exposure to acrylamide (AA) thr...


In [7]:
train["video"].head(2)

Unnamed: 0_level_0,title,desc
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PLAIN-2427,heart of gold : turmeric vs. exercise,diet and exercise synergize to improve endothe...
PLAIN-2428,does fiber really prevent diverticulosis ?,the parable of the tiny parachute explains the...


In [8]:
train["t_qrels"].head(2)

Unnamed: 0,query_id,doc_id,relevance
0,PLAIN-3,MED-2436,3
1,PLAIN-3,MED-2437,3


## Validation

In [9]:
import ir_datasets
import pandas as pd
topic = ir_datasets.load("nfcorpus/dev/nontopic")
t_query = pd.DataFrame(topic.queries_iter()).set_index('query_id')
t_qrels = pd.DataFrame(topic.qrels_iter()).drop(["iteration"], axis=1)
video = ir_datasets.load("nfcorpus/dev/video")
v_query = pd.DataFrame(video.queries_iter()).set_index('query_id')
v_qrels = pd.DataFrame(video.qrels_iter()).drop(["iteration"], axis=1)
val = {"topic":t_query, "video":v_query, "v_qrels":v_qrels, "t_qrels":t_qrels}

In [10]:
val["topic"].head(2)

Unnamed: 0_level_0,text
query_id,Unnamed: 1_level_1
PLAIN-1,why deep fried foods may cause cancer
PLAIN-101,how to treat multiple sclerosis with diet


In [11]:
val["v_qrels"].head(2)

Unnamed: 0,query_id,doc_id,relevance
0,PLAIN-2429,MED-974,3
1,PLAIN-2429,MED-975,3


In [12]:
temp = train['topic']["text"].str.split().apply(len)

print(temp.mean())
print(temp.median())
print(temp.max())

6.042068361086766
6.0
24


In [13]:
temp = train['video']["title"].str.split().apply(len)

print(temp.mean())
print(temp.median())
print(temp.max())

5.541871921182266
5.0
14


In [14]:
temp = train['video']["desc"].str.split().apply(len)

print(temp.mean())
print(temp.median())
print(temp.max())

24.523399014778324
22.0
72


In [15]:
temp = doc["abstract"].str.split().apply(len)

print(temp.mean())
print(temp.median())
print(temp.max())

223.65909514056972
228.0
1461


# Tokenizer

In [16]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('allenai/biomed_roberta_base')
encoded_input = tokenizer("query", "doc", padding=True, truncation=True, max_length=320, return_tensors='tf')

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/185 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/430 [00:00<?, ?B/s]

In [17]:
tokenizer.decode(encoded_input['input_ids'].numpy()[0])

'<s>query</s></s>doc</s>'

# Model

In [18]:
from transformers import TFRobertaModel
from keras.layers import Dropout, Dense
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy

class MonoRoberta(tf.keras.Model):

    def __init__(self, model_name, dropout_prob=0.3):
        super().__init__(name="reranker")
        self.roberta = TFRobertaModel.from_pretrained(model_name, from_pt=True)
        self.dropout = Dropout(dropout_prob)
        weight_initializer = tf.keras.initializers.GlorotNormal() 
        self.classifier = Dense(3, name="classifier", 
                                kernel_initializer = weight_initializer,  
                                bias_initializer = 'zeros')

    def call(self, inputs, **kwargs):
        # two outputs from BERT
        trained_roberta = self.roberta(inputs, **kwargs)
        pooled_output = trained_roberta.pooler_output
        
        # pooled_output for intent classification
        pooled_output = self.dropout(pooled_output,
                                     training=kwargs.get("training", False))
        logits = self.classifier(pooled_output)
        return logits

In [19]:
mono_roberta = MonoRoberta("allenai/biomed_roberta_base")

Downloading:   0%|          | 0.00/656M [00:00<?, ?B/s]

In [20]:
mono_roberta(encoded_input)

<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[ 0.30508298, -0.25781658,  0.17354578]], dtype=float32)>

In [21]:
# mono_roberta.load_weights(f"/content/drive/MyDrive/IR/mono roberta/checkpoint/19.h5")

In [22]:
def loss(model, x, y, training):
  # training=training is needed only if there are layers with different
  # behavior during training versus inference (e.g. Dropout).
  y_ = model(x, training=training)
  return loss_object(y_true=y, y_pred=y_)
def grad(model, inputs, targets):
  with tf.GradientTape() as tape:
    loss_value = loss(model, inputs, targets, training=True)
  return loss_value, tape.gradient(loss_value, model.trainable_variables)

In [23]:
import random
def get_random_data(data, doc):
  x = random.randint(1,3)
  if (x == 1):
    idx = random.randint(0, len(data["t_qrels"]) - 1)
    query = data["topic"].loc[data["t_qrels"].query_id[idx]].text
    doc = doc.loc[data["t_qrels"].doc_id[idx]].abstract
    y = data["t_qrels"].loc[idx].relevance
  elif (x == 2):
    idx = random.randint(0, len(data["v_qrels"]) - 1)
    query = data["video"].loc[data["v_qrels"].query_id[idx]].title
    doc = doc.loc[data["v_qrels"].doc_id[idx]].abstract
    y = data["v_qrels"].loc[idx].relevance
  elif (x == 3):
    idx = random.randint(0, len(data["v_qrels"]) - 1)
    query = data["video"].loc[data["v_qrels"].query_id[idx]].desc
    doc = doc.loc[data["v_qrels"].doc_id[idx]].abstract
    y = data["v_qrels"].loc[idx].relevance

  x = tokenizer(query, doc, padding="max_length", truncation=True, max_length=320, return_tensors='tf')
  return x, y

In [None]:
## Note: Rerunning this cell uses the same model variables
from tqdm import tqdm
import time
import random

# Keep results for plotting
optimizer = Adam(learning_rate=1e-5, epsilon=1e-08)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_loss_results = []
train_accuracy_results = []

epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

max_epoch = 20
curr = 0.785
curr_epc = 22
for epoch in range(max_epoch):
  epc = curr_epc + epoch + 1
  # Training loop - using batches of 32
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

  with tqdm(range(1000), unit="batch") as tepoch:
    for i in tepoch:
      tepoch.set_description(f"Train {epc}")
      x, y = get_random_data(train, doc)
      y -= 1

      # Optimize the model
      loss_value, grads = grad(mono_roberta, x, y)
      optimizer.apply_gradients(zip(grads, mono_roberta.trainable_variables))

      # Track progress
      epoch_loss_avg.update_state(loss_value)  # Add current batch loss
      # Compare predicted label to actual label
      # training=True is needed only if there are layers with different
      # behavior during training versus inference (e.g. Dropout).
      epoch_accuracy.update_state(y, mono_roberta(x, training=True))
      tepoch.set_postfix(loss= epoch_loss_avg.result().numpy(), accuracy= epoch_accuracy.result().numpy())
      time.sleep(0.1)
  
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

  with tqdm(range(200), unit="batch") as tepoch:
    for i in tepoch:
      tepoch.set_description(f"Val {epc}")
      x, y = get_random_data(val, doc)
      y -= 1

      # Optimize the model
      loss_value, grads = grad(mono_roberta, x, y)

      # Track progress
      epoch_loss_avg.update_state(loss_value)  # Add current batch loss
      # Compare predicted label to actual label
      # training=True is needed only if there are layers with different
      # behavior during training versus inference (e.g. Dropout).
      epoch_accuracy.update_state(y, mono_roberta(x, training=True))
      tepoch.set_postfix(loss= epoch_loss_avg.result().numpy(), accuracy= epoch_accuracy.result().numpy())
      time.sleep(0.1)
  if (curr <  epoch_accuracy.result().numpy() or epc & 10 == 0):
    curr = epoch_accuracy.result().numpy()
    mono_roberta.save_weights(f"/content/drive/MyDrive/IR/mono roberta/checkpoint/v2_{epc}_{curr:.3f}.h5")
  mono_roberta.save_weights(f"/content/drive/MyDrive/IR/mono roberta/checkpoint/final.h5")

# End epoch
print("Finished: Loss: {:.3f}, Best Acc: {:.3%}".format(epoch_loss_avg.result().numpy(), curr))

Train 23: 100%|██████████| 1000/1000 [10:38<00:00,  1.57batch/s, accuracy=0.856, loss=0.429]
Val 23: 100%|██████████| 200/200 [01:57<00:00,  1.70batch/s, accuracy=0.785, loss=0.743]
Train 24: 100%|██████████| 1000/1000 [10:35<00:00,  1.57batch/s, accuracy=0.859, loss=0.425]
Val 24: 100%|██████████| 200/200 [01:54<00:00,  1.75batch/s, accuracy=0.76, loss=0.662]
Train 25: 100%|██████████| 1000/1000 [10:26<00:00,  1.60batch/s, accuracy=0.867, loss=0.415]
Val 25: 100%|██████████| 200/200 [01:55<00:00,  1.73batch/s, accuracy=0.68, loss=1.04]
Train 26: 100%|██████████| 1000/1000 [10:27<00:00,  1.59batch/s, accuracy=0.84, loss=0.456]
Val 26: 100%|██████████| 200/200 [01:52<00:00,  1.78batch/s, accuracy=0.775, loss=0.714]
Train 27: 100%|██████████| 1000/1000 [10:28<00:00,  1.59batch/s, accuracy=0.867, loss=0.414]
Val 27: 100%|██████████| 200/200 [01:53<00:00,  1.77batch/s, accuracy=0.745, loss=0.912]
Train 28:  38%|███▊      | 379/1000 [03:59<06:14,  1.66batch/s, accuracy=0.855, loss=0.507]

In [25]:
mono_roberta.save_weights(f"/content/drive/MyDrive/IR/mono roberta/checkpoint/v2_{epc}_{curr:.3f}.h5")