In [1]:
from datasets import load_dataset
import numpy as np
import torch
from p_tqdm import p_map
# Load model directly
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, pipeline
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


#### Use the FIQA-PL dataset that was used in lab 1 and lab 2 (so we need the passages, the questions and their relations).

I got confused with FIQA-PL, because it does not provide relations between the questions and the corpus,
so instead I used truthful_qa (a small English dataset of questions, each with multiple possible answers).

In [2]:
from datasets import load_dataset

# TruthfulQA in its "generation" configuration; the bare `ds` on the last line
# displays the loaded splits and their features.
ds = load_dataset(path="truthfulqa/truthful_qa", name="generation")
ds

DatasetDict({
    validation: Dataset({
        features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source'],
        num_rows: 817
    })
})

#### Create a dataset of positive and negative sentence pairs.

- In each pair the first element is a question and the second element is a passage, i.e. "{question} {separator} {passage}", where separator should be a separator taken from the model's tokenizer.
- Use the relations to mark the positive pairs (i.e. pairs where the question is answered by the passage).
- Use your own strategy to mark negative pairs (i.e. you can draw the negative examples, but there are better strategies to define the negative examples). The number of negative examples should be larger than the number of positive examples.

In [3]:
# Start with two independent, empty containers; later cells fill them in.
positive_pairs = []
negative_pairs = []

In [4]:
# Load the pretrained tokenizer and the language-model-head model from the 'gpt2' checkpoint.
# NOTE(review): a later cell rebinds the name `model` to a SentenceTransformer, so once
# that cell has run this GPT-2 model is no longer reachable through `model`.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [5]:
# Use the tokenizer's own separator token when one is configured; otherwise fall back
# to the literal string "[SEP]". The recorded output shows the fallback was taken.
# NOTE(review): a fallback string that is not a registered special token is presumably
# split into ordinary sub-tokens by this tokenizer — confirm that is acceptable for the
# classifier that will consume the pairs.
separator = tokenizer.sep_token or "[SEP]"
separator

'[SEP]'

In [None]:
def generate_pairs(question, answers):
    """Build one "question <separator> answer" string for every answer.

    Args:
        question: The question text placed first in each pair.
        answers: Iterable of answer texts; each yields one pair.

    Returns:
        A list of strings, in the same order as `answers`, each made of the
        question, the module-level `separator` and the answer joined by single
        spaces.
    """
    return [' '.join((question, separator, answer)) for answer in answers]

# One list of pair strings per question, built from that question's reference
# correct answers. This runs sequentially because the p_map variant did not work.
positive_pairs = list(map(
    generate_pairs,
    ds['validation']['question'],
    ds['validation']['correct_answers'],
))

In [13]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Sentence-embedding model whose `encode` output is used below to score answer similarity
# (author's note: about 72 million downloads in the last month at the time of writing).
# NOTE(review): this rebinds `model`, which an earlier cell assigned to a GPT2LMHeadModel.
model = SentenceTransformer('all-MiniLM-L6-v2')


# Embeddings already computed by the module-level `model`, keyed by the exact text.
# The pool of candidate answers is almost identical from one question to the next,
# so caching avoids re-encoding the same strings on every call.
_EMBEDDING_CACHE = {}


def _encode_cached(texts):
    """Return one embedding row per text, encoding each distinct string at most once."""
    missing = [text for text in dict.fromkeys(texts) if text not in _EMBEDDING_CACHE]
    if missing:
        for text, vector in zip(missing, model.encode(missing)):
            _EMBEDDING_CACHE[text] = vector
    return np.array([_EMBEDDING_CACHE[text] for text in texts])


def get_top_n_similar_answers(correct_answers, incorrect_answers, candidate_incorrect_answers, top_n=None):
    """Pick the candidates most similar to a question's own incorrect answers.

    The reference point is the mean embedding of `incorrect_answers`; candidates
    are ranked by cosine similarity to that mean, most similar first.

    Args:
        correct_answers: Correct answers of the question; a candidate equal to
            any of them is never returned.
        incorrect_answers: The question's own incorrect answers (the reference
            set). Candidates equal to one of them are skipped as well, because
            they would add nothing new to the question's negatives.
        candidate_incorrect_answers: Pool of answer strings to choose from.
            Repeated strings are considered only once.
        top_n: Maximum number of candidates to return; `None` returns every
            eligible candidate.

    Returns:
        A list of distinct candidate strings ordered by decreasing similarity.
        The list is empty when there is no reference answer or no eligible
        candidate, since no similarity can be computed in those cases.
    """
    references = list(incorrect_answers)
    excluded = set(correct_answers) | set(references)

    # dict.fromkeys removes repeated candidates while keeping first-seen order.
    candidates = [
        candidate
        for candidate in dict.fromkeys(candidate_incorrect_answers)
        if candidate not in excluded
    ]
    if not references or not candidates:
        return []

    combined_reference_vector = np.mean(_encode_cached(references), axis=0)
    candidate_vectors = _encode_cached(candidates)

    # cosine similarities between the combined reference vector and all candidates
    cos_similarities = cosine_similarity([combined_reference_vector], candidate_vectors).flatten()

    ranked_indices = np.argsort(cos_similarities)[::-1][:top_n]
    return [candidates[idx] for idx in ranked_indices]



def generate_incorrect_answers(index, top_n=3):
    """Build the negative answers for the question at `index`.

    The result combines up to `top_n` incorrect answers borrowed from *other*
    questions (chosen by `get_top_n_similar_answers`) with the question's own
    incorrect answers.

    Args:
        index: Position of the question in `ds['validation']`.
        top_n: How many borrowed answers to request; defaults to 3.

    Returns:
        A list of distinct answer strings: the borrowed answers first, followed
        by the question's own incorrect answers.
    """
    validation = ds['validation']
    incorrect_answers = validation['incorrect_answers']
    own_incorrect_answers = list(incorrect_answers[index])
    # Only this question's row is needed, so read the row instead of the whole column.
    own_correct_answers = validation[index]['correct_answers']

    # Flattened incorrect answers of every other question.
    candidate_incorrect_answers = [
        answer
        for idx, answers in enumerate(incorrect_answers)
        if idx != index
        for answer in answers
    ]

    similar_incorrect_answers = get_top_n_similar_answers(
        own_correct_answers,
        own_incorrect_answers,
        candidate_incorrect_answers,
        top_n
    )
    # A borrowed answer may coincide with one the question already has; keep each
    # string once (first occurrence wins) so no negative pair is duplicated.
    return list(dict.fromkeys(similar_incorrect_answers + own_incorrect_answers))


# Number of questions in the validation split.
ds_len = len(ds['validation']['correct_answers'])

# Build the negatives for every question index, one after another.
# The parallel p_map version stopped working, so this stays sequential.
new_incorrect_answers = list(map(generate_incorrect_answers, range(ds_len)))

In [None]:
# Pair every question with its list of negative answers (one list of strings per question).
negative_pairs = list(map(generate_pairs, ds['validation']['question'], new_incorrect_answers))

In [45]:
def _flatten(nested):
    """Concatenate the inner sequences of `nested` into one flat list, keeping order."""
    flat = []
    for inner in nested:
        flat.extend(inner)
    return flat


flatten_negative_pairs = _flatten(negative_pairs)
flatten_positive_pairs = _flatten(positive_pairs)

In [47]:
# Report how many positive and negative pair strings exist after flattening.
print(f'positive pairs {len(flatten_positive_pairs)} and negative pairs {len(flatten_negative_pairs)}')

positive pairs 2600 and negative pairs 5437


Added three incorrect answers to each question, drawn from the incorrect answers of other questions and selected by cosine similarity to ensure relevance.

#### The dataset from point 2 should be split into training, evaluation and testing subsets.

In [48]:

# Use the flattened lists so that every example is a single
# "question <separator> answer" string rather than a per-question list of strings.
ordered_texts = flatten_positive_pairs + flatten_negative_pairs

# Create labels: 1 for positive pairs, 0 for negative pairs.
# They are built while the texts are still in positive-then-negative order,
# i.e. BEFORE shuffling, so each label is attached to the right text.
ordered_labels = [1] * len(flatten_positive_pairs) + [0] * len(flatten_negative_pairs)

# Shuffle texts and labels with one shared permutation so they stay aligned;
# the fixed seed makes the split reproducible across kernel restarts.
SPLIT_SEED = 42
shuffled_order = np.random.default_rng(SPLIT_SEED).permutation(len(ordered_texts))
all_pairs = [ordered_texts[i] for i in shuffled_order]
labels = [ordered_labels[i] for i in shuffled_order]

#  split 70% train, 15% eval, 15% test
total_size = len(all_pairs)
train_size = int(0.7 * total_size)
eval_size = int(0.15 * total_size)
test_size = total_size - train_size - eval_size  # Ensure all data is used

# Create a dataset
data = {'text': all_pairs, 'label': labels}
dataset = Dataset.from_dict(data)

train_dataset = dataset.select(range(train_size))
eval_dataset = dataset.select(range(train_size, train_size + eval_size))
test_dataset = dataset.select(range(train_size + eval_size, total_size))

# Sanity check: the three subsets partition the whole dataset.
assert len(train_dataset) + len(eval_dataset) + len(test_dataset) == total_size


#### Train a text classifier using the Transformers library
The classifier should distinguish between the positive and the negative pairs. To make the process manageable, use models of size base and a runtime providing GPU/TPU acceleration. Consult the discussions related to fine-tuning Transformer models to select a sensible set of parameters. You can also run several trainings with different hyper-parameters, if you have access to large computing resources.

Top similar incorrect candidates: ['Vienna', 'Hambourg', 'Freedom', 'hamburger', 'I am a big boy']
