In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Init

In [2]:
import tensorflow as tf

In [3]:
from IPython.display import clear_output
!pip install transformers
clear_output()
import transformers
transformers.logging.set_verbosity_error()

In [4]:
from IPython.display import clear_output
!pip install ir_datasets
clear_output()

# Process Dataset

## Training

In [5]:
import ir_datasets
import pandas as pd
topic = ir_datasets.load("nfcorpus/train/nontopic")
doc = pd.DataFrame(topic.docs_iter()).set_index("doc_id").drop(["url", "title"], axis=1)
t_query = pd.DataFrame(topic.queries_iter()).set_index('query_id')
t_qrels = pd.DataFrame(topic.qrels_iter()).drop(["iteration"], axis=1)
video = ir_datasets.load("nfcorpus/train/video")
v_query = pd.DataFrame(video.queries_iter()).set_index('query_id')
v_qrels = pd.DataFrame(video.qrels_iter()).drop(["iteration"], axis=1)
train = {"topic":t_query, "video":v_query, "v_qrels":v_qrels, "t_qrels":t_qrels}

[INFO] If you have a local copy of https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/49c061fbadc52ba4d35d0e42e2d742fd
[INFO] [starting] https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz
[INFO] [finished] https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/nfcorpus.tar.gz: [00:03] [31.0MB] [9.59MB/s]


In [6]:
doc.head(2)

Unnamed: 0_level_0,abstract
doc_id,Unnamed: 1_level_1
MED-1,Abstract Background: Acrylamide is a common di...
MED-2,Abstract Human exposure to acrylamide (AA) thr...


In [7]:
train["video"].head(2)

Unnamed: 0_level_0,title,desc
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PLAIN-2427,heart of gold : turmeric vs. exercise,diet and exercise synergize to improve endothe...
PLAIN-2428,does fiber really prevent diverticulosis ?,the parable of the tiny parachute explains the...


In [8]:
train["t_qrels"].head(2)

Unnamed: 0,query_id,doc_id,relevance
0,PLAIN-3,MED-2436,3
1,PLAIN-3,MED-2437,3


## Validation

In [9]:
import ir_datasets
import pandas as pd
topic = ir_datasets.load("nfcorpus/dev/nontopic")
t_query = pd.DataFrame(topic.queries_iter()).set_index('query_id')
t_qrels = pd.DataFrame(topic.qrels_iter()).drop(["iteration"], axis=1)
video = ir_datasets.load("nfcorpus/dev/video")
v_query = pd.DataFrame(video.queries_iter()).set_index('query_id')
v_qrels = pd.DataFrame(video.qrels_iter()).drop(["iteration"], axis=1)
val = {"topic":t_query, "video":v_query, "v_qrels":v_qrels, "t_qrels":t_qrels}

In [10]:
val["topic"].head(2)

Unnamed: 0_level_0,text
query_id,Unnamed: 1_level_1
PLAIN-1,why deep fried foods may cause cancer
PLAIN-101,how to treat multiple sclerosis with diet


In [11]:
val["v_qrels"].head(2)

Unnamed: 0,query_id,doc_id,relevance
0,PLAIN-2429,MED-974,3
1,PLAIN-2429,MED-975,3


In [12]:
temp = train['topic']["text"].str.split().apply(len)

print(temp.mean())
print(temp.median())
print(temp.max())

6.042068361086766
6.0
24


In [13]:
temp = train['video']["title"].str.split().apply(len)

print(temp.mean())
print(temp.median())
print(temp.max())

5.541871921182266
5.0
14


In [14]:
temp = train['video']["desc"].str.split().apply(len)

print(temp.mean())
print(temp.median())
print(temp.max())

24.523399014778324
22.0
72


In [15]:
temp = doc["abstract"].str.split().apply(len)

print(temp.mean())
print(temp.median())
print(temp.max())

223.65909514056972
228.0
1461


# Tokenizer

In [16]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('allenai/biomed_roberta_base')
encoded_query =  tokenizer("query", padding="max_length", truncation=True, max_length=64, return_tensors='tf')
encoded_doc =  tokenizer("doc", padding="max_length", truncation=True, max_length=256, return_tensors='tf')

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/185 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/430 [00:00<?, ?B/s]

In [17]:
print(tokenizer.decode(encoded_query['input_ids'].numpy()[0]))
print(tokenizer.decode(encoded_doc['input_ids'].numpy()[0]))

<s>query</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<s>doc</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

# Model

In [18]:
from transformers import TFRobertaModel
from keras.layers import Dropout, Dense, Dot
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy

class DPRRoberta(tf.keras.Model):

    def __init__(self, model_name, dropout_prob=0.3):
        super().__init__(name="reranker")
        self.roberta_query = TFRobertaModel.from_pretrained(model_name, from_pt=True)
        self.roberta_doc = TFRobertaModel.from_pretrained(model_name, from_pt=True)
        self.dropout = Dropout(dropout_prob)
        self.dot = Dot(axes=1)


    def call(self, query, doc, **kwargs):
        # two outputs from BERT
        trained_query = self.roberta_query(query, **kwargs)
        trained_doc= self.roberta_doc(doc, **kwargs)
        pooled_query = trained_query.pooler_output
        pooled_doc = trained_doc.pooler_output
        return self.dot([pooled_query, pooled_doc])

In [19]:
dpr_roberta = DPRRoberta("allenai/biomed_roberta_base")

Downloading:   0%|          | 0.00/656M [00:00<?, ?B/s]

In [20]:
dpr_roberta(encoded_query, encoded_doc)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[38.952766]], dtype=float32)>

In [21]:
# mono_roberta.load_weights(f"/content/drive/MyDrive/IR/mono roberta/checkpoint/19.h5")

In [22]:
def loss(model, x_q, x_d, y, training):
  # training=training is needed only if there are layers with different
  # behavior during training versus inference (e.g. Dropout).
  y_ = model(x_q, x_d, training=training)
  return loss_object(y_true=y, y_pred=y_)
def grad(model, x_q, x_d, targets):
  with tf.GradientTape() as tape:
    loss_value = loss(model, x_q, x_d, targets, training=True)
  return loss_value, tape.gradient(loss_value, model.trainable_variables)

In [23]:
import random
def get_random_data(data, doc):
  x = random.randint(1,3)
  if (x == 1):
    idx = random.randint(0, len(data["t_qrels"]) - 1)
    query = data["topic"].loc[data["t_qrels"].query_id[idx]].text
    doc = doc.loc[data["t_qrels"].doc_id[idx]].abstract
    y = data["t_qrels"].loc[idx].relevance
  elif (x == 2):
    idx = random.randint(0, len(data["v_qrels"]) - 1)
    query = data["video"].loc[data["v_qrels"].query_id[idx]].title
    doc = doc.loc[data["v_qrels"].doc_id[idx]].abstract
    y = data["v_qrels"].loc[idx].relevance
  elif (x == 3):
    idx = random.randint(0, len(data["v_qrels"]) - 1)
    query = data["video"].loc[data["v_qrels"].query_id[idx]].desc
    doc = doc.loc[data["v_qrels"].doc_id[idx]].abstract
    y = data["v_qrels"].loc[idx].relevance

  x_q = tokenizer(query, padding="max_length", truncation=True, max_length=64, return_tensors='tf')
  x_d = tokenizer(doc, padding="max_length", truncation=True, max_length=256, return_tensors='tf')
  return x_q, x_d, y

In [None]:
## Note: Rerunning this cell uses the same model variables
from tqdm import tqdm
import time

# Keep results for plotting
optimizer = Adam(learning_rate=1e-5, epsilon=1e-08, )
loss_object = tf.keras.losses.MeanSquaredError()
train_loss_results = []
train_mae_results = []

max_epoch = 20
curr = 13
least_mae = 0.576
for epoch in range(max_epoch):
  epc = curr + epoch + 1
  # Training loop - using batches of 32
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_mae = tf.keras.metrics.RootMeanSquaredError()

  with tqdm(range(1000), unit="batch") as tepoch:
    for i in tepoch:
      tepoch.set_description(f"Train {epc}")
      x_q, x_d, y = get_random_data(train, doc)
      y -= 1

      # Optimize the model
      loss_value, grads = grad(dpr_roberta, x_q, x_d, y)
      optimizer.apply_gradients(zip(grads, dpr_roberta.trainable_variables))

      # Track progress
      epoch_loss_avg.update_state(loss_value)  # Add current batch loss
      # Compare predicted label to actual label
      # training=True is needed only if there are layers with different
      # behavior during training versus inference (e.g. Dropout).
      epoch_mae.update_state(y, dpr_roberta(x_q, x_d, training=True))
      tepoch.set_postfix(loss= epoch_loss_avg.result().numpy(), mae= epoch_mae.result().numpy())
      time.sleep(0.1)
  
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_mae = tf.keras.metrics.RootMeanSquaredError()

  with tqdm(range(200), unit="batch") as tepoch:
    for i in tepoch:
      tepoch.set_description(f"Val {epc}")
      x_q, x_d, y = get_random_data(val, doc)
      y -= 1

      # Optimize the model
      loss_value, grads = grad(dpr_roberta, x_q, x_d, y)

      # Track progress
      epoch_loss_avg.update_state(loss_value)  # Add current batch loss
      # Compare predicted label to actual label
      # training=True is needed only if there are layers with different
      # behavior during training versus inference (e.g. Dropout).
      epoch_mae.update_state(y, dpr_roberta(x_q, x_d, training=True))
      tepoch.set_postfix(loss= epoch_loss_avg.result().numpy(), mae= epoch_mae.result().numpy())
      time.sleep(0.1)
  if (least_mae >  epoch_mae.result().numpy() or epc & 10 == 0):
    least_mae = epoch_mae.result().numpy()
    dpr_roberta.save_weights(f"/content/drive/MyDrive/IR/DPR Roberta/checkpoint/v2_{epc}_{least_mae:.3f}.h5")
  dpr_roberta.save_weights(f"/content/drive/MyDrive/IR/DPR Roberta/checkpoint/final.h5")

# End epoch
print("Finished: Loss: {:.3f}, Least Mae: {:.3%}".format(epoch_loss_avg.result().numpy(), least_mae))

Train 14: 100%|██████████| 1000/1000 [19:02<00:00,  1.14s/batch, loss=0.523, mae=0.768]
Val 14: 100%|██████████| 200/200 [03:23<00:00,  1.02s/batch, loss=0.58, mae=0.65]
Train 15: 100%|██████████| 1000/1000 [19:00<00:00,  1.14s/batch, loss=0.405, mae=0.639]
Val 15: 100%|██████████| 200/200 [03:27<00:00,  1.04s/batch, loss=0.311, mae=0.571]
Train 16:  26%|██▌       | 259/1000 [05:06<13:46,  1.12s/batch, loss=0.418, mae=0.62]

In [None]:
dpr_roberta.save_weights(f"/content/drive/MyDrive/IR/DPR Roberta/checkpoint/v2_{epc}_{least_mae:.3f}.h5") 