In [2]:
!pip install -q transformers datasets docs-ranking-metrics

In [4]:
import pandas as pd
import torch
import random
import numpy as np
import pickle

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
from tqdm.notebook import tqdm

# by default next import fails, need manually fix import
# inside ranking_metrics.py to from .evaluation_metrics import ...
from ranking_metrics.ranking_metrics import RankingMetrics, Bm25, LaBSE  

In [5]:
SEED = 42
NUM_QUERIES = 100
METRICS = [Bm25(), LaBSE()]
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    print(torch.cuda.get_device_name())

Downloading (…)be010/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)168ebbe010/README.md:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading (…)8ebbe010/config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)be010/tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)168ebbe010/vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading (…)ebbe010/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

Tesla T4


In [9]:
def load_tokenizer_and_model(model_name_or_path):
    return GPT2Tokenizer.from_pretrained(model_name_or_path), GPT2LMHeadModel.from_pretrained(model_name_or_path).to(DEVICE)


def preprocess_generated(text: str) -> str:
  return ' '.join(text.split())


def generate(model, tok, text, **kwargs):
  input_ids = tok.encode(text, return_tensors="pt").to(DEVICE)
  out = model.generate(input_ids.to(DEVICE), pad_token_id=50256, **kwargs)
  return list(map(tok.decode, out))


def run_experiment(model, tok, df, **kwargs):
  rm = RankingMetrics(METRICS)
  for row in tqdm(df.loc[:NUM_QUERIES].itertuples(index=False), total=NUM_QUERIES):
    passage_text, is_selected, query = row.passages['passage_text'].tolist(), row.passages['is_selected'].tolist(), row.query
    generated = generate(model, tok, query + ' ', **kwargs)[0][len(query):]

    passage_text.append(preprocess_generated(generated))
    is_selected.append(2)  # not sure about it

    rm.update(query, passage_text, is_selected)

  return rm.get()

In [10]:
dataset = load_dataset('ms_marco', 'v1.1', split='test').to_pandas()
tok, model = load_tokenizer_and_model("sberbank-ai/rugpt3medium_based_on_gpt2")

Downloading builder script:   0%|          | 0.00/8.52k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.15k [00:00<?, ?B/s]

Downloading and preparing dataset ms_marco/v1.1 to /root/.cache/huggingface/datasets/ms_marco/v1.1/1.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/111M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

Dataset ms_marco downloaded and prepared to /root/.cache/huggingface/datasets/ms_marco/v1.1/1.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84. Subsequent calls will reuse this data.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

In [144]:
# res = run_experiment(model, tok, dataset, max_new_tokens=20)

  0%|          | 0/100 [00:00<?, ?it/s]

# Parameters that control the length of the output


## `max_length` (int, defaults to 20) 

In [11]:
for max_length in tqdm([20, 40, 60], leave=False):
  res = run_experiment(model, tok, dataset, max_length=max_length)
  print(f'max_length={max_length}')
  print(res)
  print('_'*100)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 25, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 21, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


max_length=20
{'Bm25_AverageLoc': 6.8316831683168315, 'LaBSE_AverageLoc': 4.2772277227722775, 'Bm25_Top@1': 0.1782178217821782, 'Bm25_Top@3': 0.27722772277227725, 'Bm25_Top@5': 0.3465346534653465, 'LaBSE_Top@1': 0.40594059405940597, 'LaBSE_Top@3': 0.5643564356435643, 'LaBSE_Top@5': 0.6336633663366337, 'Bm25_FDARO': 0.2871287128712871, 'LaBSE_FDARO': 0.6435643564356436}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

max_length=40
{'Bm25_AverageLoc': 5.792079207920792, 'LaBSE_AverageLoc': 4.376237623762377, 'Bm25_Top@1': 0.297029702970297, 'Bm25_Top@3': 0.38613861386138615, 'Bm25_Top@5': 0.45544554455445546, 'LaBSE_Top@1': 0.43564356435643564, 'LaBSE_Top@3': 0.5247524752475248, 'LaBSE_Top@5': 0.6534653465346535, 'Bm25_FDARO': 0.37623762376237624, 'LaBSE_FDARO': 0.6237623762376238}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

max_length=60
{'Bm25_AverageLoc': 5.673267326732673, 'LaBSE_AverageLoc': 4.0495049504950495, 'Bm25_Top@1': 0.3069306930693069, 'Bm25_Top@3': 0.40594059405940597, 'Bm25_Top@5': 0.46534653465346537, 'LaBSE_Top@1': 0.48514851485148514, 'LaBSE_Top@3': 0.6039603960396039, 'LaBSE_Top@5': 0.6732673267326733, 'Bm25_FDARO': 0.38613861386138615, 'LaBSE_FDARO': 0.6633663366336634}
____________________________________________________________________________________________________


In [12]:
MAX_LENGTH = 60

## `early_stopping` (bool, defaults to False)

In [13]:
for early_stopping in tqdm([True, False], leave=False):
  res = run_experiment(model, tok, dataset, max_length=MAX_LENGTH, early_stopping=True)
  print(f'early_stopping={early_stopping}')
  print(res)
  print('_'*100)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

early_stopping=True
{'Bm25_AverageLoc': 5.673267326732673, 'LaBSE_AverageLoc': 4.0495049504950495, 'Bm25_Top@1': 0.3069306930693069, 'Bm25_Top@3': 0.40594059405940597, 'Bm25_Top@5': 0.46534653465346537, 'LaBSE_Top@1': 0.48514851485148514, 'LaBSE_Top@3': 0.6039603960396039, 'LaBSE_Top@5': 0.6732673267326733, 'Bm25_FDARO': 0.38613861386138615, 'LaBSE_FDARO': 0.6633663366336634}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

early_stopping=False
{'Bm25_AverageLoc': 5.673267326732673, 'LaBSE_AverageLoc': 4.0495049504950495, 'Bm25_Top@1': 0.3069306930693069, 'Bm25_Top@3': 0.40594059405940597, 'Bm25_Top@5': 0.46534653465346537, 'LaBSE_Top@1': 0.48514851485148514, 'LaBSE_Top@3': 0.6039603960396039, 'LaBSE_Top@5': 0.6732673267326733, 'Bm25_FDARO': 0.38613861386138615, 'LaBSE_FDARO': 0.6633663366336634}
____________________________________________________________________________________________________


In [14]:
EARLY_STOPPING = False

# Parameters that control the generation strategy used

## `do_sample` (bool, defaults to False)

In [15]:
for do_sample in tqdm([True, False], leave=False):
  res = run_experiment(model, tok, dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=do_sample)
  print(f'do_sample={do_sample}')
  print(res)
  print('_'*100)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

do_sample=True
{'Bm25_AverageLoc': 7.435643564356436, 'LaBSE_AverageLoc': 2.277227722772277, 'Bm25_Top@1': 0.07920792079207921, 'Bm25_Top@3': 0.16831683168316833, 'Bm25_Top@5': 0.2376237623762376, 'LaBSE_Top@1': 0.6732673267326733, 'LaBSE_Top@3': 0.8217821782178217, 'LaBSE_Top@5': 0.8613861386138614, 'Bm25_FDARO': 0.19801980198019803, 'LaBSE_FDARO': 0.8613861386138614}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

do_sample=False
{'Bm25_AverageLoc': 5.673267326732673, 'LaBSE_AverageLoc': 4.0495049504950495, 'Bm25_Top@1': 0.3069306930693069, 'Bm25_Top@3': 0.40594059405940597, 'Bm25_Top@5': 0.46534653465346537, 'LaBSE_Top@1': 0.48514851485148514, 'LaBSE_Top@3': 0.6039603960396039, 'LaBSE_Top@5': 0.6732673267326733, 'Bm25_FDARO': 0.38613861386138615, 'LaBSE_FDARO': 0.6633663366336634}
____________________________________________________________________________________________________


In [16]:
DO_SAMPLE = True

## `num_beams` (int, defaults to 1)

In [17]:
for num_beams in tqdm([1, 3, 5, 7], leave=False):
  res = run_experiment(model, tok, dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=num_beams)
  print(f'num_beams={num_beams}')
  print(res)
  print('_'*100)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

num_beams=1
{'Bm25_AverageLoc': 7.653465346534653, 'LaBSE_AverageLoc': 2.108910891089109, 'Bm25_Top@1': 0.07920792079207921, 'Bm25_Top@3': 0.1188118811881188, 'Bm25_Top@5': 0.21782178217821782, 'LaBSE_Top@1': 0.6831683168316832, 'LaBSE_Top@3': 0.8316831683168316, 'LaBSE_Top@5': 0.9207920792079208, 'Bm25_FDARO': 0.18811881188118812, 'LaBSE_FDARO': 0.8910891089108911}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

num_beams=3
{'Bm25_AverageLoc': 6.00990099009901, 'LaBSE_AverageLoc': 3.742574257425743, 'Bm25_Top@1': 0.24752475247524752, 'Bm25_Top@3': 0.36633663366336633, 'Bm25_Top@5': 0.44554455445544555, 'LaBSE_Top@1': 0.48514851485148514, 'LaBSE_Top@3': 0.6336633663366337, 'LaBSE_Top@5': 0.693069306930693, 'Bm25_FDARO': 0.40594059405940597, 'LaBSE_FDARO': 0.6831683168316832}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

num_beams=5
{'Bm25_AverageLoc': 6.6138613861386135, 'LaBSE_AverageLoc': 3.881188118811881, 'Bm25_Top@1': 0.22772277227722773, 'Bm25_Top@3': 0.2871287128712871, 'Bm25_Top@5': 0.3564356435643564, 'LaBSE_Top@1': 0.5148514851485149, 'LaBSE_Top@3': 0.6039603960396039, 'LaBSE_Top@5': 0.6633663366336634, 'Bm25_FDARO': 0.3069306930693069, 'LaBSE_FDARO': 0.6633663366336634}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

num_beams=7
{'Bm25_AverageLoc': 6.257425742574258, 'LaBSE_AverageLoc': 4.346534653465347, 'Bm25_Top@1': 0.297029702970297, 'Bm25_Top@3': 0.3465346534653465, 'Bm25_Top@5': 0.36633663366336633, 'LaBSE_Top@1': 0.504950495049505, 'LaBSE_Top@3': 0.5643564356435643, 'LaBSE_Top@5': 0.6138613861386139, 'Bm25_FDARO': 0.37623762376237624, 'LaBSE_FDARO': 0.6138613861386139}
____________________________________________________________________________________________________


In [18]:
NUM_BEAMS = 5

## Parameters for manipulation of the model output logits

## `temperature` (float, defaults to 1.0)

In [19]:
for temperature in tqdm([1., .95, .9], leave=False):
  res = run_experiment(model, tok, dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=NUM_BEAMS,
                       temperature=temperature)
  print(f'temperature={temperature}')
  print(res)
  print('_'*100)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

temperature=1.0
{'Bm25_AverageLoc': 5.6138613861386135, 'LaBSE_AverageLoc': 4.524752475247524, 'Bm25_Top@1': 0.3465346534653465, 'Bm25_Top@3': 0.40594059405940597, 'Bm25_Top@5': 0.45544554455445546, 'LaBSE_Top@1': 0.4752475247524752, 'LaBSE_Top@3': 0.5346534653465347, 'LaBSE_Top@5': 0.594059405940594, 'Bm25_FDARO': 0.43564356435643564, 'LaBSE_FDARO': 0.6039603960396039}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

temperature=0.95
{'Bm25_AverageLoc': 6.267326732673268, 'LaBSE_AverageLoc': 4.445544554455446, 'Bm25_Top@1': 0.2871287128712871, 'Bm25_Top@3': 0.3069306930693069, 'Bm25_Top@5': 0.36633663366336633, 'LaBSE_Top@1': 0.49504950495049505, 'LaBSE_Top@3': 0.5643564356435643, 'LaBSE_Top@5': 0.594059405940594, 'Bm25_FDARO': 0.3564356435643564, 'LaBSE_FDARO': 0.6039603960396039}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

temperature=0.9
{'Bm25_AverageLoc': 6.198019801980198, 'LaBSE_AverageLoc': 4.148514851485149, 'Bm25_Top@1': 0.26732673267326734, 'Bm25_Top@3': 0.31683168316831684, 'Bm25_Top@5': 0.38613861386138615, 'LaBSE_Top@1': 0.504950495049505, 'LaBSE_Top@3': 0.5841584158415841, 'LaBSE_Top@5': 0.6336633663366337, 'Bm25_FDARO': 0.32673267326732675, 'LaBSE_FDARO': 0.6633663366336634}
____________________________________________________________________________________________________


In [20]:
TEMPERATURE = 1.

## `top_k` (int, defaults to 50)

In [None]:
for top_k in tqdm([5, 10, 50, 100], leave=False):
  res = run_experiment(model, tok, dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=NUM_BEAMS, temperature=TEMPERATURE,
                       top_k=top_k)
  print(f'top_k={top_k}')
  print(res)
  print('_'*100)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

top_k=5
{'Bm25_AverageLoc': 6.158415841584159, 'LaBSE_AverageLoc': 4.128712871287129, 'Bm25_Top@1': 0.3069306930693069, 'Bm25_Top@3': 0.3564356435643564, 'Bm25_Top@5': 0.38613861386138615, 'LaBSE_Top@1': 0.5247524752475248, 'LaBSE_Top@3': 0.5544554455445545, 'LaBSE_Top@5': 0.6336633663366337, 'Bm25_FDARO': 0.37623762376237624, 'LaBSE_FDARO': 0.6435643564356436}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

top_k=10
{'Bm25_AverageLoc': 6.465346534653466, 'LaBSE_AverageLoc': 3.792079207920792, 'Bm25_Top@1': 0.25742574257425743, 'Bm25_Top@3': 0.3069306930693069, 'Bm25_Top@5': 0.3564356435643564, 'LaBSE_Top@1': 0.5445544554455446, 'LaBSE_Top@3': 0.6237623762376238, 'LaBSE_Top@5': 0.693069306930693, 'Bm25_FDARO': 0.32673267326732675, 'LaBSE_FDARO': 0.7029702970297029}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

top_k=50
{'Bm25_AverageLoc': 5.900990099009901, 'LaBSE_AverageLoc': 4.475247524752476, 'Bm25_Top@1': 0.297029702970297, 'Bm25_Top@3': 0.3564356435643564, 'Bm25_Top@5': 0.42574257425742573, 'LaBSE_Top@1': 0.45544554455445546, 'LaBSE_Top@3': 0.5148514851485149, 'LaBSE_Top@5': 0.5841584158415841, 'Bm25_FDARO': 0.39603960396039606, 'LaBSE_FDARO': 0.6435643564356436}
____________________________________________________________________________________________________


  0%|          | 0/100 [00:00<?, ?it/s]

top_k=100
{'Bm25_AverageLoc': 6.445544554455446, 'LaBSE_AverageLoc': 4.0, 'Bm25_Top@1': 0.26732673267326734, 'Bm25_Top@3': 0.297029702970297, 'Bm25_Top@5': 0.3564356435643564, 'LaBSE_Top@1': 0.504950495049505, 'LaBSE_Top@3': 0.594059405940594, 'LaBSE_Top@5': 0.6534653465346535, 'Bm25_FDARO': 0.31683168316831684, 'LaBSE_FDARO': 0.6831683168316832}
____________________________________________________________________________________________________


In [None]:
TOP_K = 10

## `top_p` (float, defaults to 1.0)

In [None]:
for top_p in tqdm([1., .95, .9], leave=False):
  res = run_experiment(model, tok, dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=NUM_BEAMS, temperature=TEMPERATURE, top_k=TOP_K,
                       top_p=top_p)
  print(f'top_p={top_p}')
  print(res)
  print('_'*100)

In [None]:
TOP_P = 1.

## `no_repeat_ngram_size` (int, defaults to 0)

In [None]:
for no_repeat_ngram_size in tqdm(range(4), leave=False):
  res = run_experiment(model, tok, dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=NUM_BEAMS, temperature=TEMPERATURE, top_k=TOP_K, top_p=TOP_P,
                       no_repeat_ngram_size=no_repeat_ngram_size)
  print(f'no_repeat_ngram_size={no_repeat_ngram_size}')
  print(res)
  print('_'*100)

In [None]:
NO_REPEAT_NGRAM_SIZE = 3