In [1]:
!pip install -q transformers datasets docs-ranking-metrics xformers
!gdown 1D_Hz-4BK3tJB0zG4-SyEyF8b3S6Sqx0z

Downloading...
From: https://drive.google.com/uc?id=1D_Hz-4BK3tJB0zG4-SyEyF8b3S6Sqx0z
To: /content/data_for_rk_model.tsv
100% 3.92G/3.92G [00:20<00:00, 188MB/s]


In [1]:
import torch
import random
import numpy as np
import pandas as pd

from transformers import pipeline  # GPT2LMHeadModel, GPT2Tokenizer,
# from datasets import load_dataset
from tqdm.notebook import tqdm

from docs_ranking_metrics.ranking_metrics import RankingMetrics, Bm25, MsMarcoCE, LaBSE

In [2]:
SEED = 42
NUM_QUERIES = 100
# RANKING_MODELS = [Bm25(), MsMarcoCE()]  # , LaBSE(), MsMarcoST()
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
RANKING_MODEL_NAME = 'LaBSE'
RANKING_METRIC_NAME = 'FDARO@v2'

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    print(torch.cuda.get_device_name())

Tesla T4


In [9]:
def load_tokenizer_and_model(model_name_or_path):
    return GPT2Tokenizer.from_pretrained(model_name_or_path), GPT2LMHeadModel.from_pretrained(model_name_or_path).to(DEVICE)


def preprocess_generated(text: str) -> str:
  # text = text.replace('\n', ' ').replace('\xa0', ' ')
  return ' '.join(text.split())


# def generate(model, text, **kwargs):
#   out = model(text, **kwargs)
#   return out[0]['generated_text']


def run_experiment(df, **kwargs):
  global model
  rm = RankingMetrics([Bm25(), LaBSE()])
  for row in tqdm(df.itertuples(index=False), total=NUM_QUERIES):
    # passage_text, is_selected, query = row.passages['passage_text'].tolist(), row.passages['is_selected'].tolist(), row.query
    passage_text, is_selected, query = row.body, row.label, row.query

    # generated = generate(model, query + ' ', **kwargs)[len(query):]
    generated = model(query + ' ', pad_token_id=50256, **kwargs)[0]['generated_text'][len(query):]

    passage_text.append(preprocess_generated(generated))
    is_selected.append(RankingMetrics.FAKE_DOC_LABEL)

    rm.update(query, passage_text, is_selected)
  rm.show_metrics()
  return rm.get()

def select_best_param(ranking_model_name, ranking_metric_name, list_of_results):
  evaluation_function = min if 'Loc' in ranking_metric_name else max

  return evaluation_function(list_of_results, key=lambda metrics: metrics[1][f'{ranking_model_name}_{ranking_metric_name}'])[0]

In [4]:
# dataset = load_dataset('ms_marco', 'v1.1', split='test').to_pandas()
# tok, model = load_tokenizer_and_model("sberbank-ai/rugpt3medium_based_on_gpt2")
dataset = pd.read_csv('data_for_rk_model.tsv', delimiter='\t')\
          .groupby('query').agg({'body': list, 'label': list})\
          .reset_index()\
          .sample(NUM_QUERIES, random_state=SEED)  #.loc[:NUM_QUERIES]
model = pipeline(model="sberbank-ai/rugpt3small_based_on_gpt2", task='text-generation')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# res = run_experiment(dataset, max_new_tokens=20)

In [7]:
preprocess_generated(model('сгенерируй продолжение текста')[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'сгенерируй продолжение текста, а потом добавь в текст.'

# Parameters that control the length of the output


## `max_length` (int, defaults to 20)

In [10]:
results = []
for max_length in tqdm([20, 40, 60], desc='max_length', leave=False):
  print(f'max_length={max_length}')
  res = run_experiment(dataset, max_length=max_length)
  results.append((max_length, res))
  print('_'*100)

max_length:   0%|          | 0/3 [00:00<?, ?it/s]

max_length=20


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 31, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 35, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 55, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 24.59   LaBSE_AverageLoc: 18.14   
-----------------------------
Bm25_AverageRelLoc: 0.98   LaBSE_AverageRelLoc: 0.72   
-----------------------------
Bm25_Top@1: 0.01   LaBSE_Top@1: 0.08   
Bm25_Top@3: 0.01   LaBSE_Top@3: 0.13   
Bm25_Top@5: 0.02   LaBSE_Top@5: 0.19   
-----------------------------
Bm25_FDARO@v1: 0.03   LaBSE_FDARO@v1: 0.16   
Bm25_FDARO@v2: 0.1   LaBSE_FDARO@v2: 0.65   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
max_length=40


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 49.25   LaBSE_AverageLoc: 36.57   
-----------------------------
Bm25_AverageRelLoc: 1.88   LaBSE_AverageRelLoc: 1.4   
-----------------------------
Bm25_Top@1: 0.01   LaBSE_Top@1: 0.12   
Bm25_Top@3: 0.04   LaBSE_Top@3: 0.25   
Bm25_Top@5: 0.05   LaBSE_Top@5: 0.39   
-----------------------------
Bm25_FDARO@v1: 0.04   LaBSE_FDARO@v1: 0.19   
Bm25_FDARO@v2: 0.14   LaBSE_FDARO@v2: 0.69   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
max_length=60


  0%|          | 0/100 [00:00<?, ?it/s]

Bm25_AverageLoc: 74.69   LaBSE_AverageLoc: 56.42   
-----------------------------
Bm25_AverageRelLoc: 2.74   LaBSE_AverageRelLoc: 2.08   
-----------------------------
Bm25_Top@1: 0.02   LaBSE_Top@1: 0.12   
Bm25_Top@3: 0.06   LaBSE_Top@3: 0.33   
Bm25_Top@5: 0.1   LaBSE_Top@5: 0.52   
-----------------------------
Bm25_FDARO@v1: 0.06   LaBSE_FDARO@v1: 0.19   
Bm25_FDARO@v2: 0.14   LaBSE_FDARO@v2: 0.69   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________


In [11]:
MAX_LENGTH = select_best_param(RANKING_MODEL_NAME, RANKING_METRIC_NAME, results)
MAX_LENGTH

40

## `early_stopping` (bool, defaults to False)

In [12]:
results = []
for early_stopping in tqdm([True, False], desc='early_stopping', leave=False):
  print(f'early_stopping={early_stopping}')
  res = run_experiment(dataset, max_length=MAX_LENGTH, early_stopping=early_stopping)
  results.append((early_stopping, res))
  print('_'*100)

early_stopping:   0%|          | 0/2 [00:00<?, ?it/s]

early_stopping=True


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 101.64   LaBSE_AverageLoc: 76.85   
-----------------------------
Bm25_AverageRelLoc: 3.59   LaBSE_AverageRelLoc: 2.72   
-----------------------------
Bm25_Top@1: 0.02   LaBSE_Top@1: 0.12   
Bm25_Top@3: 0.07   LaBSE_Top@3: 0.37   
Bm25_Top@5: 0.11   LaBSE_Top@5: 0.66   
-----------------------------
Bm25_FDARO@v1: 0.05   LaBSE_FDARO@v1: 0.19   
Bm25_FDARO@v2: 0.13   LaBSE_FDARO@v2: 0.69   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
early_stopping=False


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 129.62   LaBSE_AverageLoc: 98.28   
-----------------------------
Bm25_AverageRelLoc: 4.42   LaBSE_AverageRelLoc: 3.36   
-----------------------------
Bm25_Top@1: 0.02   LaBSE_Top@1: 0.12   
Bm25_Top@3: 0.05   LaBSE_Top@3: 0.38   
Bm25_Top@5: 0.07   LaBSE_Top@5: 0.74   
-----------------------------
Bm25_FDARO@v1: 0.04   LaBSE_FDARO@v1: 0.19   
Bm25_FDARO@v2: 0.14   LaBSE_FDARO@v2: 0.69   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________


In [13]:
EARLY_STOPPING = select_best_param(RANKING_MODEL_NAME, RANKING_METRIC_NAME, results)
EARLY_STOPPING

True

# Parameters that control the generation strategy used

## `do_sample` (bool, defaults to False)

In [14]:
results = []
for do_sample in tqdm([False, True], desc='do_sample', leave=False):
  print(f'do_sample={do_sample}')
  res = run_experiment(dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=do_sample)
  results.append((do_sample, res))
  print('_'*100)

do_sample:   0%|          | 0/2 [00:00<?, ?it/s]

do_sample=False


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 158.42   LaBSE_AverageLoc: 120.71   
-----------------------------
Bm25_AverageRelLoc: 5.22   LaBSE_AverageRelLoc: 3.99   
-----------------------------
Bm25_Top@1: 0.02   LaBSE_Top@1: 0.12   
Bm25_Top@3: 0.04   LaBSE_Top@3: 0.38   
Bm25_Top@5: 0.09   LaBSE_Top@5: 0.78   
-----------------------------
Bm25_FDARO@v1: 0.04   LaBSE_FDARO@v1: 0.19   
Bm25_FDARO@v2: 0.14   LaBSE_FDARO@v2: 0.69   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
do_sample=True


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 187.19   LaBSE_AverageLoc: 144.41   
-----------------------------
Bm25_AverageRelLoc: 5.96   LaBSE_AverageRelLoc: 4.62   
-----------------------------
Bm25_Top@1: 0.05   LaBSE_Top@1: 0.17   
Bm25_Top@3: 0.1   LaBSE_Top@3: 0.49   
Bm25_Top@5: 0.15   LaBSE_Top@5: 0.88   
-----------------------------
Bm25_FDARO@v1: 0.1   LaBSE_FDARO@v1: 0.24   
Bm25_FDARO@v2: 0.31   LaBSE_FDARO@v2: 0.8   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________


In [15]:
DO_SAMPLE = select_best_param(RANKING_MODEL_NAME, RANKING_METRIC_NAME, results)
DO_SAMPLE

True

## `num_beams` (int, defaults to 1)

In [18]:
# results = []
for num_beams in tqdm([1, 2, 3], desc='num_beams', leave=False):
  print(f'num_beams={num_beams}')
  res = run_experiment(dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=num_beams)
  results.append((num_beams, res))
  print('_'*100)

num_beams:   0%|          | 0/1 [00:00<?, ?it/s]

num_beams=2


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


RuntimeError: ignored

In [19]:
NUM_BEAMS = select_best_param(RANKING_MODEL_NAME, RANKING_METRIC_NAME, results)
NUM_BEAMS

1

## Parameters for manipulation of the model output logits

## `temperature` (float, defaults to 1.0)

In [20]:
results = []
for temperature in tqdm([1., .95, .9], desc='temperature', leave=False):
  print(f'temperature={temperature}')
  res = run_experiment(dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=NUM_BEAMS,
                       temperature=temperature)
  results.append((temperature, res))
  print('_'*100)

temperature:   0%|          | 0/3 [00:00<?, ?it/s]

temperature=1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 331.39   LaBSE_AverageLoc: 265.27   
-----------------------------
Bm25_AverageRelLoc: 9.21   LaBSE_AverageRelLoc: 7.42   
-----------------------------
Bm25_Top@1: 0.18   LaBSE_Top@1: 0.41   
Bm25_Top@3: 0.33   LaBSE_Top@3: 0.88   
Bm25_Top@5: 0.39   LaBSE_Top@5: 1.3   
-----------------------------
Bm25_FDARO@v1: 0.25   LaBSE_FDARO@v1: 0.51   
Bm25_FDARO@v2: 0.54   LaBSE_FDARO@v2: 0.92   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
temperature=0.95


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 366.26   LaBSE_AverageLoc: 294.65   
-----------------------------
Bm25_AverageRelLoc: 9.9   LaBSE_AverageRelLoc: 8.02   
-----------------------------
Bm25_Top@1: 0.21   LaBSE_Top@1: 0.42   
Bm25_Top@3: 0.37   LaBSE_Top@3: 0.92   
Bm25_Top@5: 0.44   LaBSE_Top@5: 1.36   
-----------------------------
Bm25_FDARO@v1: 0.27   LaBSE_FDARO@v1: 0.55   
Bm25_FDARO@v2: 0.59   LaBSE_FDARO@v2: 0.94   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
temperature=0.9


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 401.78   LaBSE_AverageLoc: 325.15   
-----------------------------
Bm25_AverageRelLoc: 10.58   LaBSE_AverageRelLoc: 8.61   
-----------------------------
Bm25_Top@1: 0.22   LaBSE_Top@1: 0.45   
Bm25_Top@3: 0.4   LaBSE_Top@3: 0.96   
Bm25_Top@5: 0.46   LaBSE_Top@5: 1.42   
-----------------------------
Bm25_FDARO@v1: 0.26   LaBSE_FDARO@v1: 0.56   
Bm25_FDARO@v2: 0.61   LaBSE_FDARO@v2: 0.95   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________


In [21]:
TEMPERATURE = select_best_param(RANKING_MODEL_NAME, RANKING_METRIC_NAME, results)
TEMPERATURE

0.9

## `top_k` (int, defaults to 50)

In [22]:
results = []
for top_k in tqdm([5, 10, 50, 100], desc='top_k', leave=False):
  print(f'top_k={top_k}')
  res = run_experiment(dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=NUM_BEAMS, temperature=TEMPERATURE,
                       top_k=top_k)
  results.append((top_k, res))
  print('_'*100)

top_k:   0%|          | 0/4 [00:00<?, ?it/s]

top_k=5


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 437.7   LaBSE_AverageLoc: 355.8   
-----------------------------
Bm25_AverageRelLoc: 11.23   LaBSE_AverageRelLoc: 9.17   
-----------------------------
Bm25_Top@1: 0.24   LaBSE_Top@1: 0.46   
Bm25_Top@3: 0.45   LaBSE_Top@3: 1.05   
Bm25_Top@5: 0.55   LaBSE_Top@5: 1.53   
-----------------------------
Bm25_FDARO@v1: 0.31   LaBSE_FDARO@v1: 0.57   
Bm25_FDARO@v2: 0.66   LaBSE_FDARO@v2: 0.96   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
top_k=10


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 474.82   LaBSE_AverageLoc: 387.66   
-----------------------------
Bm25_AverageRelLoc: 11.88   LaBSE_AverageRelLoc: 9.74   
-----------------------------
Bm25_Top@1: 0.26   LaBSE_Top@1: 0.48   
Bm25_Top@3: 0.48   LaBSE_Top@3: 1.1   
Bm25_Top@5: 0.63   LaBSE_Top@5: 1.58   
-----------------------------
Bm25_FDARO@v1: 0.32   LaBSE_FDARO@v1: 0.58   
Bm25_FDARO@v2: 0.68   LaBSE_FDARO@v2: 0.96   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
top_k=50


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 512.67   LaBSE_AverageLoc: 421.3   
-----------------------------
Bm25_AverageRelLoc: 12.51   LaBSE_AverageRelLoc: 10.33   
-----------------------------
Bm25_Top@1: 0.28   LaBSE_Top@1: 0.5   
Bm25_Top@3: 0.53   LaBSE_Top@3: 1.17   
Bm25_Top@5: 0.72   LaBSE_Top@5: 1.68   
-----------------------------
Bm25_FDARO@v1: 0.34   LaBSE_FDARO@v1: 0.61   
Bm25_FDARO@v2: 0.68   LaBSE_FDARO@v2: 0.97   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
top_k=100


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 552.46   LaBSE_AverageLoc: 455.0   
-----------------------------
Bm25_AverageRelLoc: 13.15   LaBSE_AverageRelLoc: 10.89   
-----------------------------
Bm25_Top@1: 0.28   LaBSE_Top@1: 0.51   
Bm25_Top@3: 0.56   LaBSE_Top@3: 1.25   
Bm25_Top@5: 0.75   LaBSE_Top@5: 1.77   
-----------------------------
Bm25_FDARO@v1: 0.36   LaBSE_FDARO@v1: 0.62   
Bm25_FDARO@v2: 0.69   LaBSE_FDARO@v2: 0.97   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________


In [23]:
TOP_K = select_best_param(RANKING_MODEL_NAME, RANKING_METRIC_NAME, results)
TOP_K

50

## `top_p` (float, defaults to 1.0)

In [24]:
results = []
for top_p in tqdm([1., .95, .9], desc='top_p', leave=False):
  print(f'top_p={top_p}')
  res = run_experiment(dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=NUM_BEAMS, temperature=TEMPERATURE, top_k=TOP_K,
                       top_p=top_p)
  results.append((top_p, res))
  print('_'*100)

top_p:   0%|          | 0/3 [00:00<?, ?it/s]

top_p=1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 592.65   LaBSE_AverageLoc: 489.52   
-----------------------------
Bm25_AverageRelLoc: 13.78   LaBSE_AverageRelLoc: 11.44   
-----------------------------
Bm25_Top@1: 0.28   LaBSE_Top@1: 0.52   
Bm25_Top@3: 0.61   LaBSE_Top@3: 1.3   
Bm25_Top@5: 0.82   LaBSE_Top@5: 1.83   
-----------------------------
Bm25_FDARO@v1: 0.36   LaBSE_FDARO@v1: 0.65   
Bm25_FDARO@v2: 0.7   LaBSE_FDARO@v2: 0.98   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
top_p=0.95


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 634.48   LaBSE_AverageLoc: 525.33   
-----------------------------
Bm25_AverageRelLoc: 14.42   LaBSE_AverageRelLoc: 12.0   
-----------------------------
Bm25_Top@1: 0.28   LaBSE_Top@1: 0.53   
Bm25_Top@3: 0.61   LaBSE_Top@3: 1.32   
Bm25_Top@5: 0.85   LaBSE_Top@5: 1.89   
-----------------------------
Bm25_FDARO@v1: 0.36   LaBSE_FDARO@v1: 0.65   
Bm25_FDARO@v2: 0.69   LaBSE_FDARO@v2: 0.98   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
top_p=0.9


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 677.49   LaBSE_AverageLoc: 561.9   
-----------------------------
Bm25_AverageRelLoc: 15.05   LaBSE_AverageRelLoc: 12.55   
-----------------------------
Bm25_Top@1: 0.31   LaBSE_Top@1: 0.56   
Bm25_Top@3: 0.64   LaBSE_Top@3: 1.36   
Bm25_Top@5: 0.85   LaBSE_Top@5: 1.98   
-----------------------------
Bm25_FDARO@v1: 0.38   LaBSE_FDARO@v1: 0.68   
Bm25_FDARO@v2: 0.7   LaBSE_FDARO@v2: 0.98   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________


In [25]:
TOP_P = select_best_param(RANKING_MODEL_NAME, RANKING_METRIC_NAME, results)
TOP_P

1.0

## `no_repeat_ngram_size` (int, defaults to 0)

In [26]:
results = []
for no_repeat_ngram_size in tqdm(range(4), leave=False):
  print(f'no_repeat_ngram_size={no_repeat_ngram_size}')
  res = run_experiment(dataset, max_length=MAX_LENGTH, early_stopping=EARLY_STOPPING, do_sample=DO_SAMPLE, num_beams=NUM_BEAMS, temperature=TEMPERATURE, top_k=TOP_K, top_p=TOP_P,
                       no_repeat_ngram_size=no_repeat_ngram_size)
  results.append((no_repeat_ngram_size, res))
  print('_'*100)

  0%|          | 0/4 [00:00<?, ?it/s]

no_repeat_ngram_size=0


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 721.23   LaBSE_AverageLoc: 600.28   
-----------------------------
Bm25_AverageRelLoc: 15.68   LaBSE_AverageRelLoc: 13.11   
-----------------------------
Bm25_Top@1: 0.32   LaBSE_Top@1: 0.57   
Bm25_Top@3: 0.65   LaBSE_Top@3: 1.38   
Bm25_Top@5: 0.88   LaBSE_Top@5: 2.03   
-----------------------------
Bm25_FDARO@v1: 0.39   LaBSE_FDARO@v1: 0.71   
Bm25_FDARO@v2: 0.73   LaBSE_FDARO@v2: 0.98   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
no_repeat_ngram_size=1


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 767.69   LaBSE_AverageLoc: 640.57   
-----------------------------
Bm25_AverageRelLoc: 16.33   LaBSE_AverageRelLoc: 13.69   
-----------------------------
Bm25_Top@1: 0.33   LaBSE_Top@1: 0.57   
Bm25_Top@3: 0.64   LaBSE_Top@3: 1.4   
Bm25_Top@5: 0.88   LaBSE_Top@5: 2.06   
-----------------------------
Bm25_FDARO@v1: 0.39   LaBSE_FDARO@v1: 0.72   
Bm25_FDARO@v2: 0.73   LaBSE_FDARO@v2: 0.98   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
no_repeat_ngram_size=2


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 814.44   LaBSE_AverageLoc: 681.38   
-----------------------------
Bm25_AverageRelLoc: 16.97   LaBSE_AverageRelLoc: 14.26   
-----------------------------
Bm25_Top@1: 0.33   LaBSE_Top@1: 0.58   
Bm25_Top@3: 0.65   LaBSE_Top@3: 1.42   
Bm25_Top@5: 0.87   LaBSE_Top@5: 2.11   
-----------------------------
Bm25_FDARO@v1: 0.39   LaBSE_FDARO@v1: 0.73   
Bm25_FDARO@v2: 0.73   LaBSE_FDARO@v2: 0.98   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________
no_repeat_ngram_size=3


  0%|          | 0/100 [00:00<?, ?it/s]

Input length of input_ids is 55, but `max_length` is set to 40. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Bm25_AverageLoc: 861.25   LaBSE_AverageLoc: 723.47   
-----------------------------
Bm25_AverageRelLoc: 17.58   LaBSE_AverageRelLoc: 14.82   
-----------------------------
Bm25_Top@1: 0.35   LaBSE_Top@1: 0.6   
Bm25_Top@3: 0.67   LaBSE_Top@3: 1.47   
Bm25_Top@5: 0.9   LaBSE_Top@5: 2.17   
-----------------------------
Bm25_FDARO@v1: 0.41   LaBSE_FDARO@v1: 0.75   
Bm25_FDARO@v2: 0.74   LaBSE_FDARO@v2: 0.99   
-----------------------------
Bm25_UpQuartile: 0.0   LaBSE_UpQuartile: 0.0   


____________________________________________________________________________________________________


In [27]:
NO_REPEAT_NGRAM_SIZE = select_best_param(RANKING_MODEL_NAME, RANKING_METRIC_NAME, results)
NO_REPEAT_NGRAM_SIZE

3

In [28]:
print(
f'''max_length={MAX_LENGTH}, early_stopping={EARLY_STOPPING}, do_sample={DO_SAMPLE}, num_beams={NUM_BEAMS}, temperature={TEMPERATURE},
  top_k={TOP_K}, top_p={TOP_P}, no_repeat_ngram_size={NO_REPEAT_NGRAM_SIZE}
''')

max_length=40, early_stopping=True, do_sample=True, num_beams=1, temperature=0.9,
  top_k=50, top_p=1.0, no_repeat_ngram_size=3



In [31]:
round(100*(0.75 - 0.16 ) / 0.16)

369