In [1]:
2+2

4

In [2]:
# https://partner.steamgames.com/doc/store/getreviews

In [3]:
import sys
import os
import json

In [4]:
# Construct the path to the scripts directory
script_path = os.path.abspath('../reviews-assistant/scripts')

# Add the path to sys.path
if script_path not in sys.path:
    sys.path.append(script_path)

import minsearch

In [5]:
# Directory containing the data files
data_dir = os.path.abspath('../reviews-assistant/data/reviews')

# Initialize an empty list to hold all reviews
reviews = []

# List objects in the directory
objects_in_directory = os.listdir(data_dir)

# Iterate over the files in the directory
for obj in objects_in_directory:
    if obj.endswith('.json'):  # Check if the file is a JSON file
        file_path = os.path.join(data_dir, obj)
        with open(file_path, 'r', encoding='utf-8') as jsonfile:
            # Load the reviews from the JSON file
            file_reviews = json.load(jsonfile)
            reviews.extend(file_reviews)  # Append reviews to the main list
# Print the first i reviews
i = 2  # Change this to print more reviews if needed
for review in reviews[:i]:
    print(f"Author ID: {review['author.steamid']}")
    print(f"Review: {review.get('review', 'No text')}")
    print(f"Timestamp Created: {review['timestamp_created']}")
    print("-" * 79)

Author ID: 76561199211892974
Review: ⠀⠀⠀⠀⠀⠀⠀⠀⠀
[h2] Pros & Cons [/h2]
[hr]
[/hr]
✅ Pros:
[list]
  [*]Beautifully crafted world with great visuals.
  [*]Entertaining quests.
  [*]Good amount of personalization.
  [*]Variety of things to do.
  [*]Great setting.
[/list]

❌ Cons:
[list]
  [*]Yet another Ubisoft game with fillers above fillers. Quantity > Quality.
  [*]The game overall is too longs. With such an amount of things to do, it is easy to get distracted. It is a personal view as someone might find this as a positive, but after so many hours I just lost interest to move forward.
  [*]The combats is getting repetitive after a while. The skill tree even if it is big, lacks skills that will impact the combat.
  [*]The base building although interesting in the beginning, does not bring much value to your characters. It is an add-on that after a while you might forget it even exist.
[/list]

[h2] Verdict[/h2]
[hr]
[/hr]
Even though I recommend this game, it is easy to notice the amount

In [6]:
len(reviews)

773

In [7]:
index = minsearch.Index(
    # text_fields=["author.steamid", "author.playtimeforever", "author.playtime_last_two_weeks", "author.playtime_at_review", "author.last_played", "language", "review", "timestamp_created", "timestamp_updated"],
    text_fields=["title", "language", "review"],
    keyword_fields=["appid", "recommendationid"]
)

index.fit(reviews)

<minsearch.Index at 0x7fc8fc435420>

# Retrieval evaluation

In [8]:


# Directory containing the ground truth file
data_dir = os.path.abspath('../reviews-assistant/data/ground_truth')

# Path to the ground_truth_retrieval.json file
file_path = os.path.join(data_dir, 'ground_truth_retrieval.json')

# Check if the file exists
if os.path.exists(file_path):
    try:
        # Open and load the JSON file
        with open(file_path, 'r', encoding='utf-8') as jsonfile:
            ground_truth_data = json.load(jsonfile)  # Load the JSON data into a Python object

        # Ensure each item is a dictionary and contains 'id' and 'question'
        if all(isinstance(item, dict) and 'appid' in item and 'question' in item for item in ground_truth_data):
            print("Data successfully loaded and is in the correct format.")

            # Example: Print a sample of the data
            for item in ground_truth_data[:5]:  # Print the first 5 questions
                print(f"ID: {item['appid']}, Question: {item['question']}")
        else:
            print("Error: The data format is incorrect or missing required fields ('id', 'question').")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
else:
    print(f"File {file_path} does not exist.")


Data successfully loaded and is in the correct format.
ID: 315210, Question: Is 'Suicide Squad: Kill the Justice League' suitable for my child given its negative controls and gameplay issues?
ID: 315210, Question: Can I trust the reviews about this game given the mention of needing a refund?
ID: 315210, Question: What age rating does this game have, and does it align with what I want my child to play?
ID: 315210, Question: Are there any themes in this game that promote agenda-driven narratives that I should be aware of?
ID: 315210, Question: How does this game compare to other titles like 'Gotham Knights' and 'Marvel's Avengers' in terms of gameplay and suitability for children?


In [9]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [10]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [11]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['appid']
        results = search_function(q)
        relevance = [d['appid'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [12]:
from tqdm.auto import tqdm

In [13]:
evaluate(ground_truth_data, lambda q: minsearch_search(q['question']))

  0%|          | 0/45 [00:00<?, ?it/s]

{'hit_rate': 0.7111111111111111, 'mrr': 2.024611992945328}

# Finding the best parameters

In [25]:
len(ground_truth_data)

45

In [26]:
df_validation = ground_truth_data[:30]
df_test = ground_truth_data[15:]

In [29]:
df_validation[-1]

{'appid': '1086940',
 'question': "Can you provide insights into the community's perception of Baldur's Gate 3 in terms of its adherence to traditional gaming values?"}

In [30]:
df_test[-1]

{'appid': '1680880',
 'question': "How might the game's writing and character representation impact its suitability for my child?"}

In [31]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [32]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [33]:
param_ranges = {
    'exercise_name': (0.0, 3.0),
    'type_of_activity': (0.0, 3.0),
    'type_of_equipment': (0.0, 3.0),
    'body_part': (0.0, 3.0),
    'type': (0.0, 3.0),
    'muscle_groups_activated': (0.0, 3.0),
    'instructions': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(df_validation, search_function)
    return results['mrr']

In [34]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

({'exercise_name': 0.6736240252805771,
  'type_of_activity': 1.7006633621565135,
  'type_of_equipment': 0.6327765863786196,
  'body_part': 1.692151876047895,
  'type': 2.1767103834520816,
  'muscle_groups_activated': 1.9495515830509804,
  'instructions': 1.0392632375680444},
 1.962962962962966)

In [36]:
def minsearch_improved(query):
    boost = {'exercise_name': 0.6736240252805771,
  'type_of_activity': 1.7006633621565135,
  'type_of_equipment': 0.6327765863786196,
  'body_part': 1.692151876047895,
  'type': 2.1767103834520816,
  'muscle_groups_activated': 1.9495515830509804,
  'instructions': 1.0392632375680444}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth_data, lambda q: minsearch_improved(q['question']))

  0%|          | 0/45 [00:00<?, ?it/s]

{'hit_rate': 0.7111111111111111, 'mrr': 2.024611992945328}

# RAG evaluation