# QA Dataset

In [1]:
# Load QA dataset subset (change product segment in qa_{segment}.json)

#!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon/qa/qa_Electronics.json.gz

In [2]:
# This data loading code is provided with the dataset information:
# https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/qa/

import pandas as pd
import gzip

def parse(path):
  """Yield one record per non-blank line of a gzip-compressed dump.

  Each line is expected to hold a single Python literal (the original loader
  evaluated lines as Python expressions). Lines are parsed with
  ``ast.literal_eval`` instead of ``eval`` so that a crafted line in the
  downloaded file cannot execute arbitrary code.

  Args:
    path: Path to the ``.gz`` file.

  Yields:
    The object described by each line (a dict for the QA dataset).

  Raises:
    ValueError, SyntaxError: If a line is not a valid Python literal.
  """
  import ast  # local import keeps this cell self-contained

  # The context manager closes the file even if the generator is abandoned early.
  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for l in g:
      if l.strip():  # tolerate blank lines (e.g. a trailing newline)
        yield ast.literal_eval(l)

def getDF(path):
  """Load every record yielded by ``parse(path)`` into a DataFrame.

  Rows are labelled 0..n-1 in file order; columns come from the record keys.
  """
  records_by_row = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_row, orient='index')

# Load the Electronics QA file into a DataFrame
QA = getDF('qa_Electronics.json.gz')
# Drop the rows whose answerType is the literal '?'
QA = QA[QA['answerType'] != '?']

In [3]:
# Number of unique products (distinct 'asin' values) included in the QA dataset

len(QA['asin'].unique())

39284

In [4]:
# Number of questions of each type (row count per distinct 'questionType' value)

QA['questionType'].value_counts()

questionType
open-ended    148665
yes/no         87536
Name: count, dtype: int64

In [None]:
# Example questions of each type: 10 rows drawn from a full random shuffle
# of the yes/no questions (no seed is set, so the sample changes between runs)

QA[QA['questionType'] == 'yes/no'].sample(frac=1).reset_index(drop=True).head(10)

In [None]:
# Same random preview for the open-ended questions
QA[QA['questionType'] == 'open-ended'].sample(frac=1).reset_index(drop=True).head(10)

In [5]:
#!pip install datasets
from datasets import Dataset
# Convert the filtered DataFrame into a Hugging Face Dataset. The name QA is
# rebound from a DataFrame to a Dataset, so this cell cannot be re-run on its own.
# NOTE(review): QA was row-filtered above, so its index is no longer a plain
# 0..n-1 range; check whether from_pandas carries it over as an extra column.
QA = Dataset.from_pandas(QA)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import numpy as np

# Materialise the question-type column once, then record the row positions of
# each question type.
question_types = np.array(QA['questionType'])

yn_indices, = np.where(question_types == 'yes/no')
open_indices, = np.where(question_types == 'open-ended')

In [7]:
# Split the QA dataset into one dataset per question type, using the row
# positions computed above
QA_yn = QA.select(yn_indices)
QA_open = QA.select(open_indices)

# Reviews Dataset

In [8]:
# Load reviews dataset. Change product segment in "raw_review_{segment}"

from datasets import load_dataset

reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Electronics", trust_remote_code=True)
# Work on the 'full' split and drop the columns that are not used downstream
reviews = reviews['full'].remove_columns(['rating','images','parent_asin','user_id','timestamp'])

# Keep only reviews whose text has at least 100 characters
from tqdm import tqdm
long_review_indices = [
    row
    for row, review_text in enumerate(tqdm(reviews['text']))
    if len(review_text) >= 100
]

reviews = reviews.select(long_review_indices)

100%|██████████| 43886944/43886944 [00:42<00:00, 1031724.42it/s]


In [9]:
# Keep only products with more than 100 reviews

from collections import Counter
# Number of (already length-filtered) reviews per product ID
counter_reviews = Counter(reviews['asin'])

plus100reviews_indices = [
    row
    for row, product in enumerate(tqdm(reviews['asin']))
    if counter_reviews[product] > 100
]

reviews = reviews.select(plus100reviews_indices)

100%|██████████| 24870555/24870555 [00:31<00:00, 792858.08it/s]


# Mutual products

* Check number of mutual products

In [10]:
# Product IDs present in both QA and review datasets

ids_QA, ids_reviews = set(QA['asin']), set(reviews['asin'])

# Set intersection: products that have both questions and reviews
common_ids = ids_QA & ids_reviews

print(f'There are {len(common_ids)} common products')

There are 8571 common products


* Filter datasets so that only common products are kept

In [11]:
# Row positions, in each dataset, of the entries whose product is shared by the
# QA and review datasets. The ID list is built once and reused for all three.
common_id_list = list(common_ids)

QA_yn_common_indices, = np.where(np.isin(QA_yn['asin'], common_id_list))
QA_open_common_indices, = np.where(np.isin(QA_open['asin'], common_id_list))
reviews_common_indices, = np.where(np.isin(reviews['asin'], common_id_list))

In [12]:
# Restrict all three datasets to the common products. Each name is rebound to
# its filtered version, so the index arrays above are only valid for one run.
QA_yn = QA_yn.select(QA_yn_common_indices)
QA_open = QA_open.select(QA_open_common_indices)
reviews = reviews.select(reviews_common_indices)

* Total number of questions after filtering out the ones about products with 100 or fewer reviews

In [14]:
# Number of questions of each type that remain after the filtering above
total_open= len(QA_open)
total_yn = len(QA_yn)

print(f'There is a total of {total_yn} YN questions\nThere is a total of {total_open} open questions')

There is a total of 21789 YN questions
There is a total of 38895 open questions


* Count mean number of reviews and questions

In [27]:
# Occurrences of each product ID in the three filtered datasets
counter_QA_yn = Counter(QA_yn['asin'])
counter_QA_open = Counter(QA_open['asin'])
counter_reviews = Counter(reviews['asin'])

# One entry per common product; a Counter yields 0 for a product that has no
# rows in a given dataset. All three lists follow the same iteration order.
QA_yn_num_per_product = [counter_QA_yn[product] for product in common_ids]
QA_open_num_per_product = [counter_QA_open[product] for product in common_ids]
reviews_num_per_product = [counter_reviews[product] for product in common_ids]

In [None]:
# Per-product means over the common products; ':.0f' rounds each mean to a whole number
print(f'Mean number of YN questions per product: {np.mean(QA_yn_num_per_product):.0f}\nMean number of open questions per product: {np.mean(QA_open_num_per_product):.0f}\nMean number of reviews per product: {np.mean(reviews_num_per_product):.0f}')

Mean number of YN questions per product: 3
Mean number of open questions per product: 5
Mean number of reviews per product: 364


# Review texts segmentation

In [None]:
#%pip install -qU langchain-text-splitters

In [21]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used below to cut each review text into chunks
# (chunk_size=500 with a chunk_overlap of 50 between consecutive chunks)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

# If len(text) < chunk_size, it's kept as is (inside a list)

In [22]:
# Sanity check: split the first review and show the length of its first chunk
dummy_text = reviews['text'][0]
dummy_split = text_splitter.split_text(dummy_text)
len(dummy_split[0])

187

In [None]:
# Flat list with one {"product id", "content"} dict per review chunk
SEGMENTED_CHUNKS = []

for original_review in reviews:
  product_id = original_review['asin']

  # Split the review text and tag every resulting chunk with its product ID
  SEGMENTED_CHUNKS.extend(
      {"product id": f"{product_id}", "content": piece}
      for piece in text_splitter.split_text(original_review['text'])
  )

In [None]:
import random
# Show one randomly chosen chunk as a spot check (unseeded, varies between runs)
random.choice(SEGMENTED_CHUNKS)

{'product id': 'B0038JED6M',
 'content': '(I don\'t use a set-top box with my cable).  The scan locked up at 84%.  I gave it about 15 minutes, hoping it would snap out of it, but it never budged past that.  So I canceled out of it and tried again.  Again, locked up at 84%.  Another try, another lock-up.  Finally, I ran through the "factory reset" option then tried again.  Finally, it made it through the scan.  My kids cheered.<br /><br />After that it was okay, but I was disappointed in the lip sync.  I might have missed a setting'}

In [68]:
import pickle

# Persist the chunk list inside /root/data/, next to the preprocessed CSV files,
# so that the "Load preprocessed datasets and chunks" section can read it back.
# (The previous path '/root/datasegmented_chunks.pkl' was missing the '/' between
# the directory and the file name.)
with open('/root/data/segmented_chunks.pkl', 'wb') as f:
    pickle.dump(SEGMENTED_CHUNKS, f)


---
# Load preprocessed datasets and chunks
* **Skips above steps**
---

In [14]:
from datasets import load_dataset

# Reload the preprocessed tables from CSV. The results are indexed with ['train']
# further down before being used.
QA_open = load_dataset('csv', data_files='/root/data/QA_open_filtered.csv')
QA_yn = load_dataset('csv', data_files='/root/data/QA_yn.csv')
reviews = load_dataset('csv', data_files='/root/data/reviews.csv')

In [15]:
import pickle

# Absolute path, consistent with the CSV files loaded from /root/data/ above; the
# previous relative 'root/data/...' only resolved when the kernel's working
# directory happened to be '/'.
chunks_pickle_filepath = '/root/data/segmented_chunks.pkl'

# NOTE: pickle.load executes code embedded in the file; only load pickles
# produced by this notebook.
with open(chunks_pickle_filepath, 'rb') as f:
    SEGMENTED_CHUNKS = pickle.load(f)
    print(len(SEGMENTED_CHUNKS))

4614196


In [16]:
# Inspect the first chunk: a dict with 'product id' and 'content' keys
SEGMENTED_CHUNKS[0]

{'product id': 'B00CB7W1GG',
 'content': 'Works well, but the corner of the plastic cracked and is broken after a month or so. I think at least 2 of the plastic corners are cracked now. still works but wish it was higher quality.'}

# SentenceTransformer doc encoding & embeddings similarity calculation

In [17]:
# Load embeddings model

from sentence_transformers import SentenceTransformer
import torch

# trust_remote_code=True is passed when loading this checkpoint; .cuda() moves
# the model to the GPU, so this cell requires a CUDA device.
embedding_model = SentenceTransformer("dunzhang/stella_en_400M_v5", trust_remote_code=True).cuda()

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Collect all review chunks of the same product

from tqdm import tqdm

from datasets import DatasetDict
from collections import defaultdict

# product ID -> list of chunk texts, in the order they appear in SEGMENTED_CHUNKS
asin_to_texts = defaultdict(list)

for segment in tqdm(SEGMENTED_CHUNKS):
    segment_asin = segment['product id']
    asin_to_texts[segment_asin].append(segment['content'])

 17%|█▋        | 797859/4614196 [00:01<00:05, 730038.22it/s]

100%|██████████| 4614196/4614196 [00:06<00:00, 724844.22it/s]


In [19]:
def add_texts(sample):
    """Attach to a QA sample the review chunks stored for its product.

    Looks up ``sample['asin']`` in the module-level ``asin_to_texts`` mapping and
    stores the result (an empty list when the product is unknown) under
    ``sample['reviews']``. The sample is modified in place and returned.
    """
    product_chunks = asin_to_texts.get(sample['asin'], [])
    sample['reviews'] = product_chunks
    return sample

# Take the 'train' split, add the 'reviews' column and shuffle with a fixed seed
# so that row order is reproducible. QA_yn / QA_open are rebound to the resulting
# datasets, so this cell cannot be run twice: ['train'] is no longer a split
# lookup on the second run.
QA_yn = QA_yn['train'].map(add_texts).shuffle(seed=42)
QA_open = QA_open['train'].map(add_texts).shuffle(seed=42)

In [20]:
import random

# Prompt name passed to embedding_model.encode() when embedding questions
query_prompt_name = "s2p_query"

# Pick one open-ended question at random (unseeded) together with the review
# chunks of its product; the question is wrapped in a list for encode()
dummy_sample = random.choice(QA_open)
dummy_query = [dummy_sample['question']]
dummy_docs = dummy_sample['reviews']

## Dummy search

In [11]:
# Embed the question (with the query prompt) and the review chunks (without it)
query_embeddings = embedding_model.encode(dummy_query, prompt_name=query_prompt_name, device='cuda')
doc_embeddings = embedding_model.encode(dummy_docs, device='cuda')
print(query_embeddings.shape, doc_embeddings.shape)

# One row of similarity scores: the single query against every chunk
similarities = embedding_model.similarity(query_embeddings, doc_embeddings)
print(similarities)

(1, 1024) (266, 1024)
tensor([[0.5626, 0.6324, 0.3837, 0.3875, 0.4220, 0.4016, 0.4795, 0.5213, 0.4936,
         0.4200, 0.4957, 0.5182, 0.5372, 0.5580, 0.4823, 0.4742, 0.5098, 0.4955,
         0.4860, 0.4250, 0.3059, 0.4953, 0.5476, 0.4414, 0.5582, 0.5829, 0.6952,
         0.5458, 0.6511, 0.6519, 0.4489, 0.5376, 0.5053, 0.5437, 0.5258, 0.4626,
         0.5856, 0.5781, 0.4961, 0.6483, 0.5353, 0.5813, 0.5913, 0.4258, 0.5311,
         0.4337, 0.5365, 0.5716, 0.5378, 0.4130, 0.4468, 0.6127, 0.3792, 0.4789,
         0.4849, 0.4935, 0.5438, 0.5084, 0.6238, 0.3253, 0.4248, 0.4267, 0.6513,
         0.6210, 0.4319, 0.4100, 0.3766, 0.4811, 0.5781, 0.3317, 0.5169, 0.4820,
         0.4892, 0.5029, 0.4563, 0.5014, 0.4633, 0.5103, 0.4945, 0.4597, 0.5205,
         0.5484, 0.6537, 0.5270, 0.5833, 0.4977, 0.5343, 0.4971, 0.6056, 0.5605,
         0.5640, 0.5095, 0.5023, 0.4459, 0.4084, 0.5889, 0.4660, 0.6096, 0.4272,
         0.5112, 0.4582, 0.5511, 0.4865, 0.5823, 0.4909, 0.4709, 0.5011, 0.4249,
      

In [12]:
import numpy as np

# Positions of the 5 highest-scoring chunks for the single query row
# NOTE(review): k = 5 presumably requires at least 5 chunks in dummy_docs — confirm
_, top_indices = torch.topk(similarities[0], k = 5)
# Map the positions back to the chunk texts, most similar first
top_similar_reviews = [dummy_docs[idx] for idx in top_indices]
top_similar_reviews

["/><br />- It's easy to use and very intuitive for first timers.<br /><br />Now, its drawbacks. Realize that I am rating this device from the viewpoint of a geocacher, and we're probably the most demanding users of GPS receivers out there:<br /><br />- It is not an accurate device if you're attempting to get a definitive lock on a waypoint within 50' of your goal. Once in the general search area, the signal will likely bounce around and give you wildly varying readings. For example, if you've",
 "I have had this model for several years and am now considering upgrading.  It does a great job of pointing me in the right direction when I'm headed somewhere new.  Good for geocaching, although under heavy tree cover it can be far off the mark.  To get the most out of it, you really need to hook it up to a computer so you can download/upload data.  For the PC, I recommend ExpertGPS.",
 'This little gps has everything you need for geocaching if you are as technically challenged as I am.  We h

# FAISS index method
* Not used

In [13]:
#%pip install flash_attn
#%pip install -qU langchain-community faiss-gpu
#%pip install sentence-transformers
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
import faiss
import torch
import numpy as np

In [None]:
from sentence_transformers import SentenceTransformer

# Same checkpoint as the SentenceTransformer section, wrapped for LangChain
model_name = "dunzhang/stella_en_400M_v5"
encode_kwargs = {'batch_size': 32}
model_kwargs = {'trust_remote_code': True}

# Embedding function handed to the FAISS vector store below
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    multi_process = True,
    show_progress = True
)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Initialize similarity search index
# 1024 matches the embedding width printed by the dummy search above: (1, 1024)
index = faiss.IndexFlatL2(1024)

# Empty vector store: no documents and no index-to-docstore mapping yet
dummy_vector_store = FAISS(
    embedding_function=hf,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [26]:
# Wrap chunks as LangChain Documents, keeping the product ID as metadata 'ID'
# NOTE(review): the slice [500:] keeps every chunk AFTER the first 500 (almost the
# whole corpus); for a "dummy" subset [:500] may have been intended — confirm.
dummy_chunks = [Document(page_content=chunk['content'], metadata = {'ID': chunk['product id']}) for chunk in SEGMENTED_CHUNKS[500:]]

## Dummy search

In [23]:
# Product IDs covered by the dummy documents
dummy_ids = {doc.metadata['ID'] for doc in dummy_chunks}
# Keep only the yes/no questions about those products
dummy_QA_yn = QA_yn.filter(lambda example: example['asin'] in dummy_ids)

Filter: 100%|██████████| 21789/21789 [00:00<00:00, 56360.72 examples/s]


### Add documents to index

In [None]:
# Embed the dummy documents with `hf` and add them to the FAISS index
dummy_vector_store.add_documents(documents = dummy_chunks)

In [58]:
# Delete docs from vector store
# NOTE(review): in top-to-bottom order this cell runs before the search cell
# below, which would then query an emptied store — confirm the intended order.

# Every docstore ID currently referenced by the index
doc_ids = list(dummy_vector_store.index_to_docstore_id.values())

dummy_vector_store.delete(ids=doc_ids)

True

In [25]:
import random

# QA_yn has already been reduced to its 'train' split earlier in the notebook, so
# the filtered copy is a plain Dataset and has no 'train' key. Only index into the
# split when dummy_QA_yn is still a DatasetDict (a dict subclass).
dummy_candidates = dummy_QA_yn['train'] if isinstance(dummy_QA_yn, dict) else dummy_QA_yn
dummy_query = random.choice(dummy_candidates)

# Retrieve the 5 nearest chunks, restricted to the product the question is about
dummy_search_results = dummy_vector_store.similarity_search(query=dummy_query['question'], k=5, filter = {'ID': dummy_query['asin']})
print(f"Question: {dummy_query['question']} | Product ID: {dummy_query['asin']}")
for doc in dummy_search_results:
    print(f"* {doc.page_content} [{doc.metadata}]")

Question: is it wireless | Product ID: B008CXTX7S


# RAG

## YN questions

In [21]:
import transformers

# Hub identifier of the instruction-tuned model used for answer generation
llm = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Text-generation pipeline loaded in bfloat16 with device_map="auto".
# `torch` comes from the embeddings-model cell above.
pipeline = transformers.pipeline(
    'text-generation',
    model = llm,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Cache of chunk embeddings per product, so each product is embedded only once
review_embeddings_by_asin = {}

# Number of questions to answer in this run
max_generated_anwers = 100

# generated_answers[i] / retrieved_reviews[i] correspond to QA_yn[i]
generated_answers = []
retrieved_reviews = []

system_prompt = "You are an assistant in an online store. Your task is to answer questions sent in by customers about products they want to buy. "\
"Your answer should be based on texts containing reviews from customers who have bought the product before. "\
"The right answer is either YES or NO. "\
"Analyze the review texts provided and identify relevant pieces of information that can help answer the question. "\
"After analysing, you must classify each relevant piece of information between that which supports the YES response and that which supports the NO response. Keep a score of the number of supporting information for each answer. "\
f"Your answer should be formatted according to the following structure. The [Final Answer] field must be filled and must be either 'YES' or 'NO', according to the one that gets the highest supporting information score. If the scoring results in a draw, [Final Answer] must be 'Inconclusive'\n\n"\
f"[Analysis]\n"\
f"[YES | NO scores]\n"\
"[Final Answer]"


def _save_yn_outputs():
    """Write the answers and retrieved chunks generated so far to disk."""
    with open('/root/outputs/run03/generated_answers.pkl', 'wb') as f:
        pickle.dump(generated_answers, f)

    with open('/root/outputs/run03/retrieved_reviews.pkl', 'wb') as f:
        pickle.dump(retrieved_reviews, f)


for idx, sample in enumerate(QA_yn):
    if idx == max_generated_anwers:
        break

    asin = sample['asin']
    question = [sample['question']]
    # Deliberately not named `reviews`: that would overwrite the reviews dataset
    # loaded earlier in the notebook.
    review_chunks = sample['reviews']

    if review_chunks:
        if asin not in review_embeddings_by_asin:
            review_embeddings_by_asin[asin] = embedding_model.encode(review_chunks)

        query_embeddings = embedding_model.encode(question, prompt_name = query_prompt_name)
        review_embeddings = review_embeddings_by_asin[asin]

        similarities = embedding_model.similarity(query_embeddings, review_embeddings)

        # topk raises when k exceeds the number of candidates, so cap it
        k = min(7, len(review_chunks))
        sim_values, top_indices = torch.topk(similarities[0], k = k)

        top_similar_reviews = [review_chunks[i] for i in top_indices.tolist()]
    else:
        # No chunk is known for this product; still generate an answer so that
        # generated_answers stays index-aligned with QA_yn.
        top_similar_reviews = []

    # NOTE(review): `question` is a one-element list, so it is rendered in the
    # prompt as "['...']". Left unchanged to keep prompts comparable across runs.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {question}\nReview texts: {top_similar_reviews}"}
    ]

    outputs = pipeline(
        messages,
        max_new_tokens = 1500
    )

    # Last message of the returned conversation is the assistant's reply
    generated_answer = outputs[0]["generated_text"][-1]['content']

    print(f"Finished generating answer number {idx} of {max_generated_anwers}")

    generated_answers.append(generated_answer)
    retrieved_reviews.append(top_similar_reviews)

    # Periodic checkpoint so a crash does not lose the whole run
    if idx%10 == 0:
        _save_yn_outputs()

# Final save: the periodic checkpoint alone stops at the last multiple of 10 and
# would leave the answers generated after it unsaved.
_save_yn_outputs()


* Extract final YES/NO answer and calculate metrics

In [16]:
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Parallel lists of predicted and ground-truth labels (1 = yes, 0 = no),
# filled by the loop below
generated_answerTypes = []
true_answerTypes = []

def parse_outputs(generated_answer):
    """Extract the binary verdict that follows the "[Final Answer]" tag.

    The tag and the verdict are matched case-insensitively, and the verdict must
    be a whole word, so "NO" inside "NOT" or "NOTE" is not mistaken for an answer.

    Args:
        generated_answer: Full text produced by the model.

    Returns:
        1 for YES, 0 for NO, or the string 'failed generation' when no
        "[Final Answer]" tag followed by a YES/NO word is found.
    """
    match = re.search(r'\[FINAL ANSWER\].*?\b(YES|NO)\b', generated_answer, re.IGNORECASE | re.DOTALL)

    if match:
        # The search ignores case, so normalise before comparing; otherwise
        # "Yes" / "no" would fall through both branches and return None.
        return 1 if match.group(1).upper() == "YES" else 0

    return 'failed generation'
    
# Build the prediction / ground-truth label lists. Answers that could not be
# parsed are left out of both lists so the two stay aligned.
for i, raw_answer in enumerate(generated_answers):
    generated_answerType = parse_outputs(raw_answer)
    if generated_answerType == 'failed generation':
        continue
    generated_answerTypes.append(generated_answerType)

    # Ground truth: 'Y' maps to 1, anything else to 0
    true_answer = QA_yn[i]['answerType']
    true_answerType = 1 if true_answer == 'Y' else 0
    true_answerTypes.append(true_answerType)

accuracy = accuracy_score(true_answerTypes, generated_answerTypes)
precision = precision_score(true_answerTypes, generated_answerTypes)
recall = recall_score(true_answerTypes, generated_answerTypes)

print(f"Y|N questions\n\nAccuracy: {accuracy:.2f}\nPrecision: {precision:.2f}\nRecall: {recall:.2f}")

Y|N questions

Accuracy: 0.67
Precision: 0.90
Recall: 0.69


## Y|N examples

In [17]:
# Print the first 20 questions next to their original answer, the generated
# answer and the retrieved chunks (requires at least 20 generated answers)
for i in range(20):
    sample = QA_yn[i]
    generation_sample = (f"Question:\n{sample['question']}\n\nOriginal answer:\n{sample['answer']}\n\nGenerated answer: {generated_answers[i]}\n\n"
    f"Retrieved reviews: {retrieved_reviews[i]}\n----------------------------------------------------------------------------------------")
    print(generation_sample)

Question:
does this work in my sony cyber-shot DSC-H3

Original answer:
According to the manual, yes. I might also point out, however, that you can get a 32 GB microSD stick with a Memory Stick Pro Dual adapter for half the price, and when your camera dies you can use it in something else. Nothing uses the Sony Memory stick any more, because they were way too expensive, being proprietary.

Generated answer: [Analysis]
- The customer who bought the product for their Sony Cybershot-DSC-H1 camera was told it would work by Sony tech support, but it didn't fit.
- The customer who bought the product for their Sony V3 camera mentioned it works fine.
- The customer who bought the product for their Sony Cybershot mentioned it works with their camera.
- The customer who bought the product for their Sonycyber Shot DSC-G3 mentioned it's not compatible with their camera.
- The customer who bought the product for their Sony cyber shot P -73 mentioned it works fine with their camera.
- The customer w

## Open questions

In [None]:
# Reuse the per-product embedding cache from the Y/N run when it exists; start a
# fresh one otherwise so that this section also runs on its own.
try:
    review_embeddings_by_asin
except NameError:
    review_embeddings_by_asin = {}

# Number of questions to answer in this run
max_generated_anwers = 100

# generated_answers[i] / retrieved_reviews[i] correspond to QA_open[i]
generated_answers = []
retrieved_reviews = []

system_prompt = "You are an assistant in an online store. Your task is to answer questions sent in by customers about products they want to buy. "\
"Your answer should be based on texts containing reviews from customers who have bought the product before. "\
"Analyze the review texts provided and identify relevant pieces of information that can help answer the question. "\
f"Your answer should be formatted according to the following structure. The [Final Answer] field must be filled and must concisely present the answer you developed as a conclusion of the analysis:\n\n"\
f"[Analysis]\n"\
"[Final Answer]"

for idx, sample in enumerate(QA_open):
    # Enforce the limit announced in the progress message below; without this
    # check the loop would run over every open question in the dataset.
    if idx == max_generated_anwers:
        break

    asin = sample['asin']
    question = [sample['question']]
    # Deliberately not named `reviews`: that would overwrite the reviews dataset
    # loaded earlier in the notebook.
    review_chunks = sample['reviews']

    if review_chunks:
        if asin not in review_embeddings_by_asin:
            review_embeddings_by_asin[asin] = embedding_model.encode(review_chunks)

        query_embeddings = embedding_model.encode(question, prompt_name = query_prompt_name)
        review_embeddings = review_embeddings_by_asin[asin]

        similarities = embedding_model.similarity(query_embeddings, review_embeddings)

        # topk raises when k exceeds the number of candidates, so cap it
        k = min(7, len(review_chunks))
        sim_values, top_indices = torch.topk(similarities[0], k = k)

        top_similar_reviews = [review_chunks[i] for i in top_indices.tolist()]
    else:
        # No chunk is known for this product; still generate an answer so that
        # generated_answers stays index-aligned with QA_open.
        top_similar_reviews = []

    # NOTE(review): `question` is a one-element list, so it is rendered in the
    # prompt as "['...']". Left unchanged to keep prompts comparable across runs.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {question}\nReview texts: {top_similar_reviews}"}
    ]

    outputs = pipeline(
        messages,
        max_new_tokens = 1000
    )

    # Last message of the returned conversation is the assistant's reply
    generated_answer = outputs[0]["generated_text"][-1]['content']

    print(f"Finished generating answer number {idx} of {max_generated_anwers}")

    generated_answers.append(generated_answer)
    retrieved_reviews.append(top_similar_reviews)

with open('/root/generated_answers_open.pkl', 'wb') as f:
    pickle.dump(generated_answers, f)

with open('/root/retrieved_reviews_open.pkl', 'wb') as f:
    pickle.dump(retrieved_reviews, f)


In [30]:
# Print the first 20 open questions next to their original answer, the generated
# answer and the retrieved chunks (requires at least 20 generated answers)
for i in range(20):
    sample = QA_open[i]
    generation_sample = (f"Question:\n{sample['question']}\n\nOriginal answer:\n{sample['answer']}\n\nGenerated answer: {generated_answers[i]}\n\n"
    f"Retrieved reviews: {retrieved_reviews[i]}\n----------------------------------------------------------------------------------------")
    print(generation_sample)

Question:
If I wanted to hook this CB up to the cigarette lighter socket, what kind of cord do I need to purchase?

Original answer:
The CB has two wires built in so all you would need is a cigarette lighter socket adapter and wire it up. The wires are about 4' long so you should have no problems where you mount it. The wiring is very easy - that's the set up I have so I can unplug and remove the CB when I'm just driving around locally.

Generated answer: [Analysis]
The customer is asking about the type of cord needed to hook the CB up to the cigarette lighter socket. To answer this question, I will look for information in the review texts about the power cable that comes with the CB and any modifications that customers have made to it.

From the review texts, I see that some customers have mentioned the power cable that comes with the CB. For example, one customer mentions that the power cable was only about 3 feet long, which was not long enough to reach their car battery. Another cu

# RAGAS

## Answer relevancy

In [22]:
# Load previously generated open-question answers.
# NOTE(review): this reads from /root/outputs/run01/, whereas the generation cell
# above writes to /root/generated_answers_open.pkl — confirm the file was moved.
# pickle.load executes code embedded in the file; only load trusted pickles.
with open('/root/outputs/run01/generated_answers_open.pkl', 'rb') as f:
    generated_answers_open = pickle.load(f)

In [42]:
def get_final_answer(text):
    """Return the text that follows the first "[Final Answer]" tag.

    The tag is matched case-insensitively and everything after it, across line
    breaks, is returned with surrounding whitespace stripped. Returns None when
    the tag does not occur in ``text``.
    """
    found = re.search(r'\[FINAL ANSWER\]\s*(.*)', text, re.IGNORECASE | re.DOTALL)
    if found is None:
        return None
    final_section = found.group(1)
    return final_section.strip()

# Spot check on one of the loaded answers
get_final_answer(generated_answers_open[4])

"The antenna is approximately 23 feet long and can be used indoors by running the wire outdoors through a window or doorway, clipping it to the radio's telescopic antenna, or hanging it out of a window or on a curtain/blinds."

In [30]:
# System prompt asking the model for three questions that the given answer could
# be answering, formatted as a list literal that ast.literal_eval can parse.
# Each sentence ends with punctuation and a space so that the concatenated
# pieces do not run together.
answer_relevancy_prompt = "Generate 3 different questions for the given answer. "\
"The questions can be similar to one another, but they can't be identical. "\
"Use the following format:\n\n"\
"[\"QUESTION 1\", \"QUESTION 2\", \"QUESTION 3\"]"

In [None]:
import ast

# One float per answer that could be scored
answer_relevancy_scores = []

for idx, answer in enumerate(generated_answers_open):
    answer = get_final_answer(answer)
    if answer is None:
        # The generation has no "[Final Answer]" section, so there is nothing
        # to derive questions from.
        print(f"Answer {idx}: skipped (no final answer found)")
        continue
    # The chained answer.replace(...) call that used to follow discarded its
    # result (strings are immutable), so it had no effect and was removed.

    # Relies on generated_answers_open[idx] having been generated for QA_open[idx]
    original_question = QA_open[idx]['question']

    messages = [
        {"role": "system", "content": answer_relevancy_prompt},
        {"role": "user", "content": f"Answer: {answer}"}
    ]

    outputs = pipeline(
        messages,
        max_new_tokens = 1000
    )

    generated_questions = outputs[0]["generated_text"][-1]['content']
    try:
        generated_questions = ast.literal_eval(generated_questions.strip())
    except (ValueError, SyntaxError):
        # The model did not return a bare Python list literal
        print(f"Answer {idx}: skipped (could not parse generated questions)")
        continue
    if not isinstance(generated_questions, list) or not generated_questions:
        print(f"Answer {idx}: skipped (could not parse generated questions)")
        continue

    gen_embs = embedding_model.encode(generated_questions)

    original_embs = embedding_model.encode(original_question)

    # Mean similarity between the original question and the generated questions
    similarities = embedding_model.similarity(original_embs, gen_embs)
    # .item() stores a plain float rather than a tensor, so the scores can be
    # averaged with numpy regardless of the device the tensor lives on.
    score = similarities.mean().item()
    answer_relevancy_scores.append(score)

    print(f"Answer {idx}: {score} relevance score")

In [53]:
import numpy as np

# Average answer-relevancy score over all scored answers
np.mean(answer_relevancy_scores)

0.7710869

In [39]:
import ast

# Show the questions generated for the last scored answer. Once the scoring loop
# has run, `generated_questions` is already a parsed list, and passing a list to
# ast.literal_eval raises; only parse when it is still the raw model string.
lst = generated_questions if isinstance(generated_questions, list) else ast.literal_eval(generated_questions)
lst

["What additional component is required to connect a CB to a vehicle's electrical system?",
 "What type of adapter is needed to power a CB from a vehicle's cigarette lighter socket?",
 "What adapter is necessary to link a CB to a vehicle's 12V power outlet?"]