In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install rank_bm25

In [None]:
!pip install sentence_transformers

In [7]:
import pandas as pd
import torch
import numpy as np
import pickle

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util

In [8]:
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f063eb873d0>

In [9]:
dataset = load_dataset('ms_marco', 'v1.1', split='train')



In [10]:
df_train_0 = dataset.to_pandas()

In [11]:
def preprocess_df(df):
    ans_norm = []
    selected = []
    for row in df.iterrows():
        try:
            ans_norm.append(row[1]["answers"][0])
        except IndexError:
            ans_norm.append(None)
        selected.append(row[1]["passages"]["is_selected"])
    df["answers_norm"] = ans_norm
    df["selected"] = selected
    df.dropna(inplace=True)
    return df

In [12]:
df_train = preprocess_df(df_train_0)

In [13]:
df_train.head()

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers,answers_norm,selected
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba,19699,description,[],Results-Based Accountability is a disciplined ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat,19700,description,[],Yes,"[0, 1, 0, 0, 0, 0, 0]"
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...,19701,numeric,[],20-25 minutes,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower,19702,numeric,[],$11 to $22 per square foot,"[0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body,19703,description,[],Due to symptoms in the body,"[0, 0, 1, 0, 0, 0, 0, 0]"


In [14]:
if torch.has_mps:
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [15]:
device

device(type='cuda')

In [16]:
NUM_ROWS = 5000

queries = df_train["query"].values[0:NUM_ROWS]

In [17]:
def load_tokenizer_and_model(model_name_or_path):
    return GPT2Tokenizer.from_pretrained(model_name_or_path), GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)


def generate(
    model, tok, text,
    do_sample=True, max_length=60, repetition_penalty=5.0,
    top_k=5, top_p=0.95, temperature=1,
    num_beams=None,
    no_repeat_ngram_size=3
):
    input_ids = tok.encode(text, return_tensors="pt").to(device)
    out = model.generate(
        input_ids.to(device), 
        min_length=2, 
        max_length=max_length, 
        eos_token_id=5, 
        # pad_token=1,
        top_k = top_k,
        top_p = top_p,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams
      )
    return list(map(tok.decode, out))

In [15]:
tok, model = load_tokenizer_and_model("sberbank-ai/rugpt3medium_based_on_gpt2")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

In [None]:
generated_answers = []

for query in queries:
    generated = generate(model, tok, query + " ", num_beams=5, max_length=60)
    generated_answers.append(generated[0][len(query):])

In [29]:
def save_gen_text(queries, generated_answers):
    query_ans_dict = {k: v for k, v in zip(queries, generated_answers)}
    with open(f'generated_text_large.pickle', 'wb') as f:
        pickle.dump(query_ans_dict, f)

In [30]:
save_gen_text(queries, generated_answers)

In [18]:
with open('generated_text_large.pickle', 'rb') as f:
    generated_text = pickle.load(f)

In [19]:
generated_answers = list(generated_text.values())

In [20]:
def simple_tok(sent: str):
    return sent.split()


def preprocess_get_text(sent: str):
    sent = sent.replace("?/A:", "")
    sent = sent.replace('\n', '')
    return sent

In [21]:
gen_ans_proc = [preprocess_get_text(s) for s in generated_answers]

In [35]:
def count_upper_quartile(scores):
    upper_quartile = np.quantile(scores, 0.75)
    if scores[-1] >= upper_quartile:
        return 1.
    return 0.


def count_upper_than_selected(scores, selected, flag="at_least_one"):
    selected = np.array(selected)
    scores = np.array(scores)
    if selected.sum() == 0:
        return None
    res = np.where(selected == 1.)
    selected_idxs = res[0]

    upper_or_not = scores[-1] > scores[selected_idxs]
    upper_or_not = upper_or_not.astype(int)

    if flag == "at_least_one":
        if upper_or_not.sum() > 0:
            return 1.
        return 0.
    else:
        if upper_or_not.sum() == len(upper_or_not):
            return 1.
        return 0.


def get_scores_texts(df, gen_ans_proc):
    corpus = []
    metric_upper_quant = []
    metric_upper_th_selected = []
    metric_upper_th_all_selected = []

    metric_upper_quant_bse = []
    metric_upper_th_selected_bse = []
    metric_upper_th_all_selected_bse = []
    df_tmp = df.copy()
    df_tmp["gen_answers"] = gen_ans_proc
    for row in df_tmp.iterrows():
        # corpus.append(row[1]["passages"]["passage_text"])
        texts = row[1]["passages"]["passage_text"].tolist()
        texts.append(row[1]["gen_answers"])
        corpus.append(texts)
        tok_texts = [simple_tok(s) for s in texts]

        bm25 = BM25Okapi(tok_texts)
        query = simple_tok(row[1]["query"])
        scores = bm25.get_scores(query)
        best_texts = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

        # query_embedding = model_bse.encode(query)
        # scores_bse = []
        # for text in texts:
        #     passage_embedding = model_bse.encode(simple_tok(text))
        #     scores_bse.append(util.dot_score(query_embedding, passage_embedding))

        metric_upper_quant.append(count_upper_quartile(scores))
        metric_upper_th_selected.append(count_upper_than_selected(scores, row[1]["selected"]))
        metric_upper_th_all_selected.append(count_upper_than_selected(scores, row[1]["selected"], "all_sel"))

        # print(scores_bse)
        # metric_upper_quant_bse.append(count_upper_quartile(scores_bse))
        # metric_upper_th_selected_bse.append(count_upper_than_selected(scores_bse, row[1]["selected"]))
        # metric_upper_th_all_selected_bse .append(count_upper_than_selected(scores_bse, row[1]["selected"], "all_sel"))
        
    df_tmp["metric_upper_quant"] = metric_upper_quant
    df_tmp["metric_upper_th_selected"] = metric_upper_th_selected
    df_tmp["metric_upper_th_all_selected"] = metric_upper_th_all_selected

    # df_tmp["metric_upper_quant_bse"] = metric_upper_quant_bse
    # df_tmp["metric_upper_th_selected_bse"] = metric_upper_th_selected_bse
    # df_tmp["metric_upper_th_all_selected_bse"] = metric_upper_th_all_selected_bse
    df_tmp["corpus"] = corpus
    return df_tmp

In [36]:
df_train_metrics = get_scores_texts(df_train[:len(gen_ans_proc)], gen_ans_proc)

In [37]:
df_train_metrics.head()

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers,answers_norm,selected,gen_answers,metric_upper_quant,metric_upper_th_selected,metric_upper_th_all_selected,corpus
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba,19699,description,[],Results-Based Accountability is a disciplined ...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",You can use the a href=https://en.wikipedia.o...,0.0,0.0,0.0,"[Since 2007, the RBA's outstanding reputation ..."
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat,19700,description,[],Yes,"[0, 1, 0, 0, 0, 0, 0]","ically elected president.In the United States,...",0.0,1.0,1.0,"[In his younger years, Ronald Reagan was a mem..."
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...,19701,numeric,[],20-25 minutes,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","I'm not sure what you're looking for, but I'v...",0.0,0.0,0.0,"[Sydney, New South Wales, Australia is located..."
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower,19702,numeric,[],$11 to $22 per square foot,"[0, 0, 0, 0, 0, 0, 0, 0, 1]",. */ public static void main(String[] a...,0.0,0.0,0.0,"[In regards to tile installation costs, consum..."
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body,19703,description,[],Due to symptoms in the body,"[0, 0, 1, 0, 0, 0, 0, 0]",is not the same as the conversion of the mind...,1.0,1.0,1.0,"[Conclusions: In adult body CT, dose to an org..."


In [38]:
print(f"Доля текстов, попавших в первую квартиль по релевантности: {df_train_metrics['metric_upper_quant'].mean()}")
print(f"Доля текстов, выше хотя бы одного релевантного: {np.round(df_train_metrics['metric_upper_th_selected'].mean(), 4)}")
print(f"Доля текстов, выше всех релевантных: {np.round(df_train_metrics['metric_upper_th_all_selected'].mean(), 4)}")

Доля текстов, попавших в первую квартиль по релевантности: 0.1576
Доля текстов, выше хотя бы одного релевантного: 0.1929
Доля текстов, выше всех релевантных: 0.1828


In [24]:
sentences = ["This is an example sentence", "Each sentence is converted"]

model_bse = SentenceTransformer('sentence-transformers/LaBSE')
embeddings = model_bse.encode(sentences)
print(embeddings)

[[ 0.02882476 -0.0060238  -0.05947006 ... -0.03002251 -0.02960703
   0.00067478]
 [-0.05550232  0.02546488 -0.02157256 ...  0.02932104  0.01150039
  -0.00848788]]
