In [1]:
# https://github.com/DataTalksClub/llm-zoomcamp/blob/main/03-vector-search/eval/evaluation-metrics.md

In [2]:
2+2

4

In [3]:
import pandas as pd
from tqdm.auto import tqdm
import json
import sys
import os
from dotenv import load_dotenv
# Load environment variables from .env
load_dotenv("/home/jovyan/.envrc")

True

In [4]:
from sentence_transformers import SentenceTransformer

In [5]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [6]:
from elasticsearch import Elasticsearch, ConnectionError

try:
    es = Elasticsearch(['http://elasticsearch:9200'])  # Use the service name
    if es.ping():
        print("Connected to Elasticsearch")
    else:
        print("Could not connect to Elasticsearch")
except ConnectionError as e:
    print(f"Connection error: {e}")


Connected to Elasticsearch


In [37]:
import json
import os

# Assuming final_results is defined as a list of dictionaries
# Example: final_results = [{'recommendationid': 1, 'question': 'What is the best game?'}, ...]
data_dir = os.path.abspath('../reviews-assistant/data/ground_truth')

# Define the output file path
output_file = os.path.join(data_dir, "ground_truth_retrieval.json")

# Load the JSON file content
with open(output_file, 'r', encoding='utf-8') as json_file:
    ground_truth = json.load(json_file)

In [38]:
ground_truth [-1]

{'document_id': 8862,
 'appid': '2208920',
 'review': {'appid': '2208920',
  'timestamp_query': 1727643597,
  'title': "Assassin's Creed Valhalla",
  'recommendationid': '174722275',
  'author.steamid': '76561199291372999',
  'author.playtimeforever': None,
  'author.playtime_last_two_weeks': 0,
  'author.playtime_at_review': 8996,
  'author.last_played': 1726115935,
  'language': 'english',
  'review': 'The Graphics are amazing. I also enjoyed the story line. I never expected Eivor to be the leader eventually. I thought he was just the caretaker.',
  'voted_up': True,
  'votes_up': 0,
  'timestamp_created': 1726050285,
  'timestamp_updated': 1726050285},
 'question': 'Does the character development meet expectations throughout the game?',
 'answer': "The character development, especially Eivor's role, is positively noted.",
 'section': 'characters'}

In [9]:
from elasticsearch import Elasticsearch, NotFoundError, ConnectionError
from tqdm import tqdm
import json
import os

class ReviewIndexer:
    def __init__(self, es_host='http://elasticsearch:9200', index_name='steam-reviews', model=None):
        self.es = Elasticsearch([es_host])
        self.index_name = index_name
        self.model = model  # Expecting a SentenceTransformer model to encode text
        
        # Check the connection upon initialization
        self.check_connection()
        
        self.index_settings = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            },
            "mappings": {
                "properties": {
                    "appid": {"type": "keyword"},
                    "timestamp_query": {"type": "integer"},
                    "title": {"type": "keyword"},
                    "author.steamid": {"type": "keyword"},
                    "author.playtimeforever": {"type": "integer"},
                    "author.playtime_last_two_weeks": {"type": "integer"},
                    "author.playtime_at_review": {"type": "integer"},
                    "author.last_played": {"type": "integer"},
                    "language": {"type": "keyword"},
                    "review": {"type": "text"},
                    "voted_up": {"type": "boolean"},
                    "votes_up": {"type": "integer"},
                    "timestamp_created": {"type": "integer"},
                    "timestamp_updated": {"type": "integer"},
                    "question": {"type": "text"},
                    "answer": {"type": "text"},
                    "section": {"type": "keyword"},
                    "question_vector": {
                        "type": "dense_vector",
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine"
                    },
                    "answer_vector": {
                        "type": "dense_vector",
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine"
                    },
                    "question_answer_vector": {
                        "type": "dense_vector",
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine"
                    },
                }
            }
        }

        # Drop the index if it exists and create a new one
        self.drop_and_create_index()

    def check_connection(self):
        """Check if Elasticsearch connection is established."""
        try:
            self.es.ping()
            print("Connected to Elasticsearch!")
        except ConnectionError:
            print("Failed to connect to Elasticsearch.")

    def drop_and_create_index(self):
        """Delete the existing index if it exists and create a new one."""
        try:
            if self.es.indices.exists(index=self.index_name):
                self.es.indices.delete(index=self.index_name)
                print(f"Index '{self.index_name}' deleted.")
            self.es.indices.create(index=self.index_name, body=self.index_settings)
            print(f"Index '{self.index_name}' created.")
        except Exception as e:
            print(f"Error creating index: {e}")

    def encode_vectors(self, question, answer):
        """Generate vectors for question, answer, and a combined question + answer."""
        question_vector = self.model.encode(question) if question else [0]*384
        answer_vector = self.model.encode(answer) if answer else [0]*384
        combined_text = f"{question} {answer}" if question and answer else ""
        question_answer_vector = self.model.encode(combined_text) if combined_text else [0]*384
        return question_vector, answer_vector, question_answer_vector

    def prepare_document(self, review):
        """Prepare the document to be indexed."""
        question_vector, answer_vector, question_answer_vector = self.encode_vectors(review["question"], review["answer"])
        
        return {
            "appid": review["appid"],
            "timestamp_query": review["review"]["timestamp_query"],
            "title": review["review"]["title"],
            "author.steamid": review["review"]["author.steamid"],
            "author.playtimeforever": review["review"]["author.playtimeforever"],
            "author.playtime_last_two_weeks": review["review"]["author.playtime_last_two_weeks"],
            "author.playtime_at_review": review["review"]["author.playtime_at_review"],
            "author.last_played": review["review"]["author.last_played"],
            "language": review["review"]["language"],
            "review": review["review"]["review"],
            "voted_up": review["review"]["voted_up"],
            "votes_up": review["review"]["votes_up"],
            "timestamp_created": review["review"]["timestamp_created"],
            "timestamp_updated": review["review"]["timestamp_updated"],
            "question": review["question"],
            "answer": review["answer"],
            "section": review["section"],
            "question_vector": question_vector,
            "answer_vector": answer_vector,
            "question_answer_vector": question_answer_vector
        }

    def index_reviews(self, reviews):
        """Index the provided reviews into Elasticsearch."""
        for review in tqdm(reviews):
            # Prepare the document
            doc = self.prepare_document(review)

            # Index the document
            try:
                self.es.index(index=self.index_name, body=doc)
            except Exception as e:
                print(f"Error indexing document with appid {review['appid']}: {e}")

    def load_reviews_from_file(self, file_path):
        """Load reviews from a JSON file."""
        try:
            with open(file_path, 'r') as file:
                reviews = json.load(file)
                return reviews
        except Exception as e:
            print(f"Error loading reviews from file: {e}")
            return []

# Example usage
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer
    import os

    # Define the data directory and output file path
    data_dir = os.path.abspath('../reviews-assistant/data/ground_truth')
    output_file = os.path.join(data_dir, "ground_truth_retrieval.json")

    # Initialize the model
    model_name = 'multi-qa-MiniLM-L6-cos-v1'
    model = SentenceTransformer(model_name)

    # Load reviews from the specified JSON file
    indexer = ReviewIndexer(model=model)
    reviews = indexer.load_reviews_from_file(output_file)
    
    # Index the loaded reviews
    if reviews:
        indexer.index_reviews(reviews)
    else:
        print("No reviews found to index.")




Connected to Elasticsearch!
Index 'steam-reviews' deleted.
Index 'steam-reviews' created.


100%|██████████| 8863/8863 [09:10<00:00, 16.10it/s]


In [10]:
# Code to check mappings in Elasticsearch
from elasticsearch import Elasticsearch

def check_index_mapping(es_host='http://elasticsearch:9200', index_name='steam-reviews'):
    es = Elasticsearch([es_host])

    try:
        # Get the index mappings
        response = es.indices.get_mapping(index=index_name)
        mapping = response[index_name]['mappings']
        
        # Check for vector fields and their dimensions
        vector_fields = ["question_vector", "answer_vector", "question_answer_vector"]
        
        for field in vector_fields:
            if field in mapping['properties']:
                dims = mapping['properties'][field]['dims']
                print(f"{field}: {dims} dimensions")
            else:
                print(f"{field} not found in index.")
    except Exception as e:
        print(f"Error retrieving index mapping: {e}")

# Run the check
check_index_mapping()


question_vector: 384 dimensions
answer_vector: 384 dimensions
question_answer_vector: 384 dimensions


In [11]:
from sentence_transformers import SentenceTransformer

def check_vector_dimensions(query, model_name='multi-qa-MiniLM-L6-cos-v1'):
    # Load the model
    model = SentenceTransformer(model_name)

    # Encode the query and get the vector
    vector = model.encode(query)

    # Check the dimensions of the vector
    vector_dimensions = len(vector)
    print(f"Generated vector has {vector_dimensions} dimensions.")

# Example Usage
query = 'Is God of War: Ragnarok a game for kids?'
check_vector_dimensions(query)

Generated vector has 384 dimensions.


In [12]:
from elasticsearch import Elasticsearch

def verify_appid_exists(es_host='http://elasticsearch:9200', index_name='steam-reviews', appid='2322010'):
    es = Elasticsearch([es_host])

    try:
        # Define a query to check if the appid exists
        search_query = {
            "query": {
                "term": {
                    "appid": appid  # Filter by appid
                }
            }
        }

        # Perform the search
        es_results = es.search(index=index_name, body=search_query)

        if es_results['hits']['total']['value'] > 0:
            print(f"AppID '{appid}' exists in the index.")
            for hit in es_results['hits']['hits']:
                print(hit['_id'])
        else:
            print(f"AppID '{appid}' not found in the index.")

    except Exception as e:
        print(f"Error checking for AppID: {e}")

# Run the check for appid 2322010 (God of War: Ragnarok)
# verify_appid_exists(appid="552520")
verify_appid_exists(appid="2322010")

AppID '2322010' exists in the index.
R45wXZIBH8YBboX2c5vT
SI5wXZIBH8YBboX2dJsX
SY5wXZIBH8YBboX2dJtR
So5wXZIBH8YBboX2dJuH
S45wXZIBH8YBboX2dJu_
7I5wXZIBH8YBboX2nJth
7Y5wXZIBH8YBboX2nJu-
7o5wXZIBH8YBboX2nJv-
745wXZIBH8YBboX2nZtE
8I5wXZIBH8YBboX2nZuE


In [13]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

class ReviewReader:
    def __init__(self, es_host='http://elasticsearch:9200', index_name='steam-reviews', model=None):
        self.es = Elasticsearch([es_host])
        self.index_name = index_name
        self.model = model  # Ensure the model is passed for generating embeddings

    def read_all_reviews(self):
        """Retrieve all reviews from the index."""
        try:
            response = self.es.search(index=self.index_name, body={"query": {"match_all": {}}})
            return response['hits']['hits']  # returns the list of documents
        except Exception as e:
            print(f"Error retrieving documents: {e}")
            return []

    def read_review_by_appid(self, appid):
        """Retrieve a specific review by appid."""
        try:
            response = self.es.search(index=self.index_name, body={
                "query": {
                    "term": {
                        "appid": appid  # Search by appid
                    }
                }
            })
            return response['hits']['hits']  # returns the list of documents matching the appid
        except Exception as e:
            print(f"Error retrieving document with appid {appid}: {e}")
            return []

    def read_reviews_knn(self, query, title, vector_field="answer_vector", num_results=5):
    # def read_reviews_knn(self, query, vector_field="answer_vector", num_results=5):
        """Retrieve reviews using KNN search only to test vector querying."""
        try:
            # Ensure the model is set
            if self.model is None:
                raise ValueError("Model for embedding generation is not initialized.")

            # Generate the embedding vector from the query
            # knn_vector = self.model.encode(query).tolist()

            # Define the KNN query
            knn_query = {
                "field": vector_field,  # Use the specified vector field (e.g., "answer_vector")
                "query_vector": knn_vector,
                "k": num_results,
                "num_candidates": 10000,  # Adjust as necessary
                "filter": {
                    "term": {
                        "title": title  # Filter by title
                    }
                }
            }

            # Perform the search
            es_results = self.es.search(index=self.index_name, body={"knn": knn_query})

            result_docs = []

            # Collect results
            for hit in es_results['hits']['hits']:
                result_docs.append(hit['_source'])  # Append only the source of each document

            if not result_docs:
                print("No results found.")
            return result_docs

        except Exception as e:
            print(f"Error executing KNN search: {e}")
            return []
    

    def read_reviews_knn_and_keyword(self, field, query, vector, title, num_results=5):
    # def read_reviews_knn_and_keyword(self, field, query, vector, num_results=5):
        """Retrieve reviews using both KNN and keyword search in a single query."""
        try:
            # Ensure the model is set
            if self.model is None:
                raise ValueError("Model for embedding generation is not initialized.")

            # Define the KNN query
            knn_query = {
                "field": field,  # Use the specified vector field (e.g., "answer_vector")
                "query_vector": vector,
                "k": num_results,
                "num_candidates": 10000,  # Adjust as necessary
                "boost": 0.5,
                "filter": {
                    "term": {
                        "title": title  # Filter by title
                    }
                }
            }

            # Define the keyword search query
            keyword_query = {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            "fields": ["question^3", "answer", "section"],  # Relevant fields for keyword search
                            "type": "best_fields",
                            "boost": 0.5
                        }
                    },
                    "filter": {
                        "term": {
                            "title": title  # Filter by title
                        }
                    }
                }
            }

            # Combine the KNN and keyword search queries
            search_query = {
                "knn": knn_query,
                "query": keyword_query,
                "size": num_results,  # Limit the number of results
                "_source": ["answer", "section", "question", "title"]
            }

            # Perform the search
            es_results = self.es.search(index=self.index_name, body=search_query)
            result_docs = []

            # Collect results
            for hit in es_results['hits']['hits']:
                result_docs.append(hit['_source'])  # Append only the source of each document

            return result_docs

        except Exception as e:
            print(f"Error executing KNN and keyword search: {e}")
            return []


In [14]:
# Example Usage

# Initialize the SentenceTransformer model for embedding generation
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

# Create a reader instance with the model
reader = ReviewReader(model=model)

In [15]:
# Example of reading all indexed reviews
all_reviews = reader.read_all_reviews()
print("All Indexed Reviews:")
for review in all_reviews:
    print(review['_id'])  # Print the id of each review
    # print(review['_source'])  # Print the source of each review

All Indexed Reviews:
a45wXZIBH8YBboX2fJtf
bI5wXZIBH8YBboX2fJuX
bY5wXZIBH8YBboX2fJvQ
bo5wXZIBH8YBboX2fZsI
b45wXZIBH8YBboX2fZtF
cI5wXZIBH8YBboX2fZt9
cY5wXZIBH8YBboX2fZuw
co5wXZIBH8YBboX2fZvi
c45wXZIBH8YBboX2fpsZ
dI5wXZIBH8YBboX2fptN


In [16]:
# Example of reading specific review by appid
specific_reviews = reader.read_review_by_appid("2322010")
print("\nSpecific Reviews for appid 2322010  :")
for review in specific_reviews:
    print("-" * 79)
    print(review['_id'])  # Print the source of the specific review
    print(review['_source'])  # Print the source of the specific review


Specific Reviews for appid 2322010  :
-------------------------------------------------------------------------------
R45wXZIBH8YBboX2c5vT
{'appid': '2322010', 'timestamp_query': 1727643597, 'title': 'God of War: Ragnarok', 'author.steamid': '76561198061949742', 'author.playtimeforever': None, 'author.playtime_last_two_weeks': 2323, 'author.playtime_at_review': 15, 'author.last_played': 1727607569, 'language': 'english', 'review': "Was able to connect my PSN account in about 5 seconds. Now I never have to worry about it again.\n\nCan't wait to to yell at Atreus again, it's been a while.\n\n10/10", 'voted_up': True, 'votes_up': 5, 'timestamp_created': 1726772258, 'timestamp_updated': 1726772258, 'question': 'How quickly can I connect my PSN account to the game?', 'answer': 'In about 5 seconds.', 'section': 'connection', 'question_vector': [0.010507360100746155, -0.06713754683732986, -0.024917500093579292, -0.006777346134185791, -0.02196761965751648, 0.016613567247986794, -0.01877200603

In [17]:
ground_truth = [
    {
        'title': 'God of War: Ragnarok',
        'question': "Is this game for kids?"
    },
    {
        'title': 'Far Cry 5',
        'question': "Is this game open world with outpost capture mechanics?"
    },
    # Add more ground truth examples as needed
]

In [18]:
# def hit_rate(relevance_total):
#     cnt = 0
#     print("cnt:", cnt)

#     for line in relevance_total:
#         if True in line:
#             cnt = cnt + 1

#     print("cnt:", cnt)

#     return cnt / len(relevance_total)

In [19]:
def hit_rate(relevance_total):
    # Count the total number of True values
    total_true_count = sum(sum(line) for line in relevance_total)

    # Calculate the hit rate (percentage of True values over total values)
    total_values = sum(len(line) for line in relevance_total)  # total number of elements
    hit_rate_value = total_true_count / total_values if total_values > 0 else 0.0

    print("Total count of True values:", total_true_count)
    print("Hit Rate:", hit_rate_value)

    return hit_rate_value


In [20]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [21]:
def mrr(relevance_total):
    total_score = 0.0
    total_documents = 0  # To count all documents
    num_lines = len(relevance_total)

    # Calculate the total score by summing the reciprocal ranks of True values
    for line_index, line in enumerate(relevance_total):
        line_score = 0.0  # Score for the current line
        for rank, is_relevant in enumerate(line):
            total_documents += 1  # Count each document
            if is_relevant:
                line_score += 1 / (rank + 1)  # Add the reciprocal rank
                print(f"Line {line_index + 1}: Found relevant at rank {rank + 1}, score added: {1 / (rank + 1)}")
            else:
                print(f"Line {line_index + 1}: Found irrelevant at rank {rank + 1}")

        # If there's a relevant item in the line, accumulate the score
        if line_score > 0:
            total_score += line_score
            print(f"Line {line_index + 1}: Total line score: {line_score}")

    # Calculate the mean reciprocal rank using total_documents
    mean_reciprocal_rank = total_score / total_documents if total_documents > 0 else 0.0
    print(f"Total Score: {total_score}, Total Documents: {total_documents}, Mean Reciprocal Rank: {mean_reciprocal_rank}")

    return mean_reciprocal_rank


In [22]:
field='question_answer_vector'
question="Is God of War launching with controller?"
title="God of War: Ragnarok"
# title="Far Cry 5"

In [23]:
v_q = model.encode(question)

In [24]:
response = reader.read_reviews_knn_and_keyword(field=field, query=question, vector=v_q, title=title, num_results=5)
# response = reader.read_reviews_knn_and_keyword(field=field, query=question, vector=v_q, num_results=5)

In [25]:
response

[{'title': 'God of War: Ragnarok',
  'question': 'Is the gameplay experience in God of War: Ragnarok similar to that of God of War 2018?',
  'answer': 'Yes, it invokes strong nostalgia like GOW 2018.',
  'section': 'game mechanics'},
 {'title': 'God of War: Ragnarok',
  'question': 'Is God of War: Ragnarok worth playing upon release?',
  'answer': 'Yes, the reviewer considers it a must-play game.',
  'section': 'value'},
 {'title': 'God of War: Ragnarok',
  'question': 'Is God of War: Ragnarok well-optimized for PC users?',
  'answer': 'Yes, the game runs very well and is well optimized.',
  'section': 'optimization'},
 {'title': 'God of War: Ragnarok',
  'question': 'Is God of War: Ragnarok optimized for performance on PC?',
  'answer': 'Reportedly, it has performance problems.',
  'section': 'functionality'},
 {'title': 'God of War: Ragnarok',
  'question': 'Is there any multiplayer feature in God of War: Ragnarok?',
  'answer': 'No, it is a single-player experience.',
  'section': '

In [30]:
def question_text_hybrid(q):
    field='question_answer_vector'
    # question="Is God of War launching with proper controller support?"
    # title="God of War: Ragnarok"
    question = q['question']
    title = q['title']
    
    v_q = model.encode(question)

    return reader.read_reviews_knn_and_keyword(field=field, query=question, title=title, vector=v_q, num_results=5)
    # return reader.read_reviews_knn_and_keyword(field=field, query=question, vector=v_q, num_results=5)

In [31]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        print("q:\n", q)
        
        doc_id = q['title']
        print("doc_id:\n", doc_id)
        
        results = search_function(q)
        # print("results:\n", results)
        
        relevance = [d['title'] == doc_id for d in results]
        print("relevance:\n", relevance)
        
        relevance_total.append(relevance)
        print("relevance_total:\n", relevance_total)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [35]:
ground_truth_sample = [
    {
        "title": "God of War: Ragnarok",
        "question": "Is God of War launching with controller?"
    },
    # Add more ground truth examples as needed
]

In [36]:
evaluate(ground_truth=ground_truth_sample, search_function=question_text_hybrid)

100%|██████████| 1/1 [00:00<00:00, 35.38it/s]

q:
 {'title': 'God of War: Ragnarok', 'question': 'Is God of War launching with controller?'}
doc_id:
 God of War: Ragnarok
relevance:
 [True, True, True, True, True]
relevance_total:
 [[True, True, True, True, True]]
Total count of True values: 5
Hit Rate: 1.0
Line 1: Found relevant at rank 1, score added: 1.0
Line 1: Found relevant at rank 2, score added: 0.5
Line 1: Found relevant at rank 3, score added: 0.3333333333333333
Line 1: Found relevant at rank 4, score added: 0.25
Line 1: Found relevant at rank 5, score added: 0.2
Line 1: Total line score: 2.283333333333333
Total Score: 2.283333333333333, Total Documents: 5, Mean Reciprocal Rank: 0.45666666666666667





{'hit_rate': 1.0, 'mrr': 0.45666666666666667}

In [39]:
from openai import OpenAI

client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY'],  # this is also the default, it can be omitted
)

In [40]:
def build_prompt(query, search_results):
    answer_llm = ""

    if 'entry_template' in globals():  # Check if entry_template exists in global scope
        for doc in search_results:
            answer_llm += entry_template.format(**doc) + "\n\n"
    else:
        for doc in search_results:
            # Fallback formatting if entry_template is missing
            answer_llm += str(doc) + "\n\n"

    prompt = prompt_template.format(question=query, answer_llm=answer_llm).strip()
    return prompt

In [41]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [47]:
def rag(query, model='gpt-4o-mini'):
    # search_results = search(query)
    search_results = reader.read_reviews_knn_and_keyword(field=field, query=question, title=title, vector=v_q, num_results=5)
    print(search_results)
    prompt = build_prompt(query, search_results)
    #print(prompt)
    answer = llm(prompt, model=model)
    # print(answer)
    return answer

In [43]:
prompt_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [46]:
ground_truth[0]

{'document_id': 0,
 'appid': '2208920',
 'review': {'appid': '2208920',
  'timestamp_query': 1727643597,
  'title': "Assassin's Creed Valhalla",
  'recommendationid': '174819428',
  'author.steamid': '76561198072094217',
  'author.playtimeforever': None,
  'author.playtime_last_two_weeks': 0,
  'author.playtime_at_review': 9348,
  'author.last_played': 1726169109,
  'language': 'english',
  'review': "After putting in the same amount of hours that i needed to 100% finish Odyssey here I am finishing 100% of Valhalla. It was a nice adventure with a lot of fun and cool quests and locations.\n\n[h1] The good [/h1]\n\nThey really nailed the viking experience I felt watching the Vikings tv show from 2013. Even tho it felt like a mini game the whole raiding settlements and monasteries I did enjoy it a lot, but kinda wished they did more with it.\n\nThe story was ok and I really liked what they did with the whole reincarnation of the 'old gods'. The story could have been better presented or at

In [74]:
def build_prompt(query, search_results):
    answer_llm = ""

    if 'entry_template' in globals():  # Check if entry_template exists in global scope
        for doc in search_results:
            answer_llm += entry_template.format(**doc) + "\n\n"
    else:
        for doc in search_results:
            # Fallback formatting if entry_template is missing
            answer_llm += str(doc) + "\n\n"

    prompt = prompt_template.format(question=query, answer_llm=answer_llm).strip()
    return prompt

def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query, model='gpt-4o-mini'):
    # search_results = search(query)
    search_results = reader.read_reviews_knn_and_keyword(field=field, query=query, title=title, vector=v_q, num_results=5)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    
    # Return both the query and the answer
    return query, answer

import json
from tqdm import tqdm

evaluations = []

for record in tqdm(ground_truth):
    question = record['question']
    
    # Get LLM answer and the question
    query, answer_llm = rag(question)

    # Format the prompt with the question and LLM answer
    prompt = prompt_template.format(
        question=query,
        answer_llm=answer_llm
    )

    # Get evaluation from LLM
    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    # Append a dictionary with record and evaluation details
    evaluations.append({
        "record": record,
        "evaluation": {
            "query": query,
            "answer_llm": answer_llm,
            "evaluation": evaluation
        }
    })

 67%|██████▋   | 5982/8863 [7:51:57<3:47:18,  4.73s/it] 


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-r5ITp2BvEN054aJmzCxp3udu on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [75]:
evaluations[-1]

{'record': {'document_id': 5981,
  'appid': '2369390',
  'review': {'appid': '2369390',
   'timestamp_query': 1727643597,
   'title': 'Far Cry 6',
   'recommendationid': '174694160',
   'author.steamid': '76561198006612773',
   'author.playtimeforever': None,
   'author.playtime_last_two_weeks': 0,
   'author.playtime_at_review': 3315,
   'author.last_played': 1726043173,
   'language': 'english',
   'review': "Far cry 6 is... interesting, not from a story or game-play perspective. From a game design perspective, the more you play you start to notice how the game was originally supposed to be due to how things are laid out and designed. You can tell the game had a lot of workplace intrigue and politics interfere with the development process. \n\nThe world looks beautiful, its filled with military bases, camps, government buildings, outposts, fortresses, naval yards, air bases, checkpoints all over, yet it deceives the player into thinking you can take them over like in other far cry ga

In [76]:
evaluations[0]['record']['document_id']

0

In [77]:
# Directory to save the JSON file
data_dir = os.path.abspath('../reviews-assistant/data/ground_truth')

# Define the output file path
output_file = os.path.join(data_dir, "ground_truth_evaluation.json")

# Save evaluations as JSON
with open(output_file, 'w') as file:
    json.dump(evaluations, file, indent=4)

print(f"Data has been saved to {output_file}")

Data has been saved to /home/jovyan/reviews-assistant/data/ground_truth/ground_truth_evaluation.json


In [79]:
import pandas as pd
import json

# Assume `evaluations` is a list of dictionaries containing the evaluation data
# Example: evaluations = [ {...}, {...}, ..., {...} ]
# Load your evaluations JSON from file or it might already be defined in your script.
# If it's in a JSON file, you can load it as follows:
# with open('path_to_your_evaluations.json', 'r') as file:
#     evaluations = json.load(file)

# Assuming evaluations is already available as a list of dictionaries:
# Initialize an empty list to hold the records for the DataFrame
records = []

# Iterate through each evaluation entry
for evaluation in evaluations:
    record = evaluation['record']  # Extract the record part
    eval_info = evaluation['evaluation']  # Extract the evaluation part

    # Append a dictionary with the necessary data
    records.append({
        "document_id": record['document_id'],
        "appid": record['appid'],
        "title": record['review']['title'],
        "review": record['review']['review'],
        "voted_up": record['review']['voted_up'],
        "votes_up": record['review']['votes_up'],
        "timestamp_query": record['review']['timestamp_query'],
        "question": record['question'],
        "answer": record['answer'],
        "section": record['section'],
        "evaluation_query": eval_info['query'],
        # "evaluation_answer_llm": json.loads(eval_info['answer_llm']),  # Parse the answer_llm JSON string
        "evaluation_relevance": eval_info['evaluation']['relevance'],
        "evaluation_explanation": eval_info['evaluation']['explanation'],
    })

# Create the DataFrame
df = pd.DataFrame(records)

# Display the DataFrame
# print(df.head())  # Print the first few rows of the DataFrame for verification


In [80]:
df

Unnamed: 0,document_id,appid,title,review,voted_up,votes_up,timestamp_query,question,answer,section,evaluation_query,evaluation_relevance,evaluation_explanation
0,0,2208920,Assassin's Creed Valhalla,After putting in the same amount of hours that...,True,14,1727643597,Is the game combat engaging or repetitive?,"Combat was boring and repetitive, mostly spamm...",game mechanics,Is the game combat engaging or repetitive?,PARTLY_RELEVANT,The generated answer discusses various aspects...
1,1,2208920,Assassin's Creed Valhalla,After putting in the same amount of hours that...,True,14,1727643597,Does the storyline feel cohesive and well-craf...,The story could have been better presented and...,plot,Does the storyline feel cohesive and well-craf...,PARTLY_RELEVANT,The generated answer acknowledges certain aspe...
2,2,2208920,Assassin's Creed Valhalla,After putting in the same amount of hours that...,True,14,1727643597,Are the characters and alliances meaningful th...,Alliances felt underutilized with missed poten...,characters,Are the characters and alliances meaningful th...,PARTLY_RELEVANT,The answer addresses character development and...
3,3,2208920,Assassin's Creed Valhalla,After putting in the same amount of hours that...,True,14,1727643597,Does the game provide a sense of completion up...,Game is fun but has issues that might make pla...,game mechanics,Does the game provide a sense of completion up...,RELEVANT,The generated answer directly responds to the ...
4,4,2208920,Assassin's Creed Valhalla,After putting in the same amount of hours that...,True,14,1727643597,Is it necessary to rely on DLCs for a complete...,DLCs didn't add value and were poorly implemen...,plot,Is it necessary to rely on DLCs for a complete...,NON_RELEVANT,The generated answer does not directly address...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5977,5977,2443720,Concord,Great game in my opinion. I never liked hero s...,True,1,1727643597,Does Concord function well on release day with...,"Yes, players find it well-functioning upon lau...",functionality,Does Concord function well on release day with...,RELEVANT,The generated answer directly responds to the ...
5978,5978,2443720,Concord,Great game in my opinion. I never liked hero s...,True,1,1727643597,Are there any major gameplay mechanics that st...,The gameplay is noted for its engaging mechanics.,game mechanics,Are there any major gameplay mechanics that st...,NON_RELEVANT,The generated answer discusses gameplay mechan...
5979,5979,2443720,Concord,Great game in my opinion. I never liked hero s...,True,1,1727643597,Is Concord worth the purchase price for its co...,Many believe it offers a good value for its pr...,worth,Is Concord worth the purchase price for its co...,NON_RELEVANT,The generated answer discusses 'God of War: Ra...
5980,5980,2369390,Far Cry 6,"Far cry 6 is... interesting, not from a story ...",False,11,1727643597,Is Far Cry 6 a fully completed game at launch?,The game appears to have unfinished elements a...,completion,Is Far Cry 6 a fully completed game at launch?,NON_RELEVANT,The generated answer discusses 'God of War: Ra...


In [81]:
# df_eval = pd.DataFrame(evaluations, columns=['question', 'record', 'answer', 'evaluation'])

# df_eval['appid'] = df_eval.record.apply(lambda d: d['appid'])
# df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

# df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['relevance'])
# df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['explanation'])

# del df_eval['record']
# del df_eval['evaluation']

In [82]:
df.evaluation_relevance.value_counts()

evaluation_relevance
NON_RELEVANT       2329
PARTLY_RELEVANT    2083
RELEVANT           1570
Name: count, dtype: int64

In [83]:
df.evaluation_relevance.value_counts(normalize=True)

evaluation_relevance
NON_RELEVANT       0.389335
PARTLY_RELEVANT    0.348211
RELEVANT           0.262454
Name: proportion, dtype: float64

In [84]:
df[df["evaluation_relevance"]=="RELEVANT"]

Unnamed: 0,document_id,appid,title,review,voted_up,votes_up,timestamp_query,question,answer,section,evaluation_query,evaluation_relevance,evaluation_explanation
3,3,2208920,Assassin's Creed Valhalla,After putting in the same amount of hours that...,True,14,1727643597,Does the game provide a sense of completion up...,Game is fun but has issues that might make pla...,game mechanics,Does the game provide a sense of completion up...,RELEVANT,The generated answer directly responds to the ...
11,11,552520,Far Cry 5,The game does a good job of breaking somewhat ...,True,0,1727643597,How engaging are the main antagonists and thei...,The antagonists are generally one-dimensional ...,plot,How engaging are the main antagonists and thei...,RELEVANT,The generated answer explicitly states that it...
12,12,552520,Far Cry 5,The game does a good job of breaking somewhat ...,True,0,1727643597,Are there any significant technical issues tha...,"Yes, NPC interaction can be disrupted, impacti...",game-breaking bugs,Are there any significant technical issues tha...,RELEVANT,The generated answer directly addresses the qu...
28,28,315210,Suicide Squad: Kill the Justice League,Look I know this game isn't what it should be ...,True,0,1727643597,Are there any major bugs that ruin the experie...,"The review does not mention specific bugs, imp...",game-breaking bugs,Are there any major bugs that ruin the experie...,RELEVANT,The generated answer directly addresses the qu...
29,29,315210,Suicide Squad: Kill the Justice League,Look I know this game isn't what it should be ...,True,0,1727643597,Is the game's price reasonable considering its...,The reviewer feels it's worth it only if disco...,worth,Is the game's price reasonable considering its...,RELEVANT,The generated answer directly addresses the qu...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5958,5958,1817070,Marvel’s Spider-Man Remastered,---{ Graphics }---\r\n ☐ You forget what reali...,True,12,1727643597,What's the story quality like in this game?,The story is lovely and engaging.,plot,What's the story quality like in this game?,RELEVANT,The generated answer effectively assesses the ...
5960,5960,2698940,The Crew Motorfest,In my opinion a very fun game. its like fh5 bu...,True,0,1727643597,Is this game fully functional without crashes ...,I have gotten none of the lag or crashes that ...,functional,Is this game fully functional without crashes ...,RELEVANT,The generated answer directly addresses the qu...
5968,5968,304390,FOR HONOR,the game is really fun the only problem is if ...,True,0,1727643597,Are there any significant bugs affecting gamep...,No major bugs mentioned in the review.,game-breaking bugs,Are there any significant bugs affecting gamep...,RELEVANT,The generated answer explicitly states that th...
5974,5974,552520,Far Cry 5,"★★★★☆\n\n[h2] ""You were the start. You'll be t...",True,0,1727643597,Do I need to rely on external launchers or app...,"Yes, it requires a separate launcher.",hardware,Do I need to rely on external launchers or app...,RELEVANT,The generated answer directly addresses the us...


In [85]:
df[df["evaluation_relevance"]=="NON_RELEVANT"]

Unnamed: 0,document_id,appid,title,review,voted_up,votes_up,timestamp_query,question,answer,section,evaluation_query,evaluation_relevance,evaluation_explanation
4,4,2208920,Assassin's Creed Valhalla,After putting in the same amount of hours that...,True,14,1727643597,Is it necessary to rely on DLCs for a complete...,DLCs didn't add value and were poorly implemen...,plot,Is it necessary to rely on DLCs for a complete...,NON_RELEVANT,The generated answer does not directly address...
5,5,552520,Far Cry 5,This Far Cry offers nothing new. The gameplay ...,False,0,1727643597,What are the gameplay mechanics like in Far Cr...,"The gameplay is stale and repetitive, similar ...",game mechanics,What are the gameplay mechanics like in Far Cr...,NON_RELEVANT,The generated answer does not address the game...
6,6,552520,Far Cry 5,This Far Cry offers nothing new. The gameplay ...,False,0,1727643597,Is the story engaging in Far Cry 5?,The story is bland and unengaging.,plot,Is the story engaging in Far Cry 5?,NON_RELEVANT,The generated answer completely discusses anot...
7,7,552520,Far Cry 5,This Far Cry offers nothing new. The gameplay ...,False,0,1727643597,How would you describe the characters in Far C...,The characters are forgettable.,characters,How would you describe the characters in Far C...,NON_RELEVANT,The generated answer discusses 'God of War: Ra...
8,8,552520,Far Cry 5,This Far Cry offers nothing new. The gameplay ...,False,0,1727643597,Is the open world in Far Cry 5 worth exploring?,"The open world feels lifeless and empty, with ...",location variety,Is the open world in Far Cry 5 worth exploring?,NON_RELEVANT,The generated answer does not address the ques...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5976,5976,2443720,Concord,Great game in my opinion. I never liked hero s...,True,1,1727643597,How does Concord compare to other hero shooter...,It is considered more exciting than Overwatch.,game mechanics,How does Concord compare to other hero shooter...,NON_RELEVANT,The generated answer completely misses the top...
5978,5978,2443720,Concord,Great game in my opinion. I never liked hero s...,True,1,1727643597,Are there any major gameplay mechanics that st...,The gameplay is noted for its engaging mechanics.,game mechanics,Are there any major gameplay mechanics that st...,NON_RELEVANT,The generated answer discusses gameplay mechan...
5979,5979,2443720,Concord,Great game in my opinion. I never liked hero s...,True,1,1727643597,Is Concord worth the purchase price for its co...,Many believe it offers a good value for its pr...,worth,Is Concord worth the purchase price for its co...,NON_RELEVANT,The generated answer discusses 'God of War: Ra...
5980,5980,2369390,Far Cry 6,"Far cry 6 is... interesting, not from a story ...",False,11,1727643597,Is Far Cry 6 a fully completed game at launch?,The game appears to have unfinished elements a...,completion,Is Far Cry 6 a fully completed game at launch?,NON_RELEVANT,The generated answer discusses 'God of War: Ra...


In [86]:
df[df["appid"]=="2322010"]

Unnamed: 0,document_id,appid,title,review,voted_up,votes_up,timestamp_query,question,answer,section,evaluation_query,evaluation_relevance,evaluation_explanation
190,190,2322010,God of War: Ragnarok,Was able to connect my PSN account in about 5 ...,True,5,1727643597,How quickly can I connect my PSN account to th...,In about 5 seconds.,connection,How quickly can I connect my PSN account to th...,RELEVANT,The generated answer provides a specific timef...
191,191,2322010,God of War: Ragnarok,Was able to connect my PSN account in about 5 ...,True,5,1727643597,Does the game offer any major frustrations wit...,No major frustrations mentioned.,functionality,Does the game offer any major frustrations wit...,RELEVANT,The generated answer directly responds to the ...
192,192,2322010,God of War: Ragnarok,Was able to connect my PSN account in about 5 ...,True,5,1727643597,Can I expect a smooth gameplay experience upon...,"Yes, rated 10/10.",gameplay,Can I expect a smooth gameplay experience upon...,RELEVANT,The generated answer directly responds to the ...
193,193,2322010,God of War: Ragnarok,Was able to connect my PSN account in about 5 ...,True,5,1727643597,Is the storyline engaging and well-developed?,"Yes, engaging and well-developed.",plot,Is the storyline engaging and well-developed?,RELEVANT,The generated answer explicitly confirms that ...
194,194,2322010,God of War: Ragnarok,Was able to connect my PSN account in about 5 ...,True,5,1727643597,Is the game worth its price tag considering it...,"Yes, highly recommended.",worth,Is the game worth its price tag considering it...,RELEVANT,The generated answer clearly evaluates the wor...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5875,5875,2322010,God of War: Ragnarok,The game keeps crashing..... I am stuck where ...,False,12,1727643597,Is the game stable enough to play without cras...,"No, the game crashes frequently.",functionality,Is the game stable enough to play without cras...,RELEVANT,The generated answer directly responds to the ...
5876,5876,2322010,God of War: Ragnarok,The game keeps crashing..... I am stuck where ...,False,12,1727643597,How does the game's performance compare on hig...,"It should run fine, but it doesn't for some us...",hardware,How does the game's performance compare on hig...,PARTLY_RELEVANT,The generated answer partially addresses the q...
5877,5877,2322010,God of War: Ragnarok,The game keeps crashing..... I am stuck where ...,False,12,1727643597,What kind of bugs have players experienced dur...,"Players have experienced game crashes, especia...",game-breaking bugs,What kind of bugs have players experienced dur...,PARTLY_RELEVANT,The generated answer mentions a specific type ...
5878,5878,2322010,God of War: Ragnarok,The game keeps crashing..... I am stuck where ...,False,12,1727643597,Are there any significant optimizations for th...,There have been complaints about performance i...,optimizations,Are there any significant optimizations for th...,PARTLY_RELEVANT,The response addresses a related topic of perf...
