In [4]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install --quiet transformers bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
%%writefile test.py

import logging
import os
import random
from typing import List, Dict

import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
logging.getLogger("transformers").setLevel(logging.ERROR)

# Pool of candidate secret words for the validator. ValidatorModel copies this
# list once and then draws (and removes) one word per game from that copy.
GAME_WORDS = [
    "cat", "dog", "cow", "horse", "rabbit", "lion", "bear", "shark", "eagle", "ant",
    "apple", "banana", "orange", "carrot", "bread", "cheese", "pizza", "cookie", "egg", "ice-cream",
    "chair", "table", "sofa", "bed", "lamp", "clock", "mirror", "door", "window", "carpet",
    "car", "bicycle", "bus", "train", "airplane", "boat", "rocket", "helmet", "engine", "wheel",
    "pencil", "pen", "book", "paper", "scissors", "ruler", "eraser", "backpack", "laptop", "phone",
    "ball", "doll", "puzzle", "kite", "yo-yo", "drum", "guitar", "camera", "radio", "television",
    "shirt", "pants", "jacket", "hat", "shoes", "gloves", "umbrella", "watch", "glasses",
    "moon", "sun", "star", "cloud", "rain", "snow", "mountain", "river", "ocean", "island",
    "doctor", "teacher", "chef", "farmer", "artist", "pilot", "police", "firefighter", "singer", "dancer",
    "gold", "silver", "iron", "sand", "water", "oil", "soap", "sugar", "salt", "honey"
]

# System message sent with every validator query (see ValidatorModel.validate_question).
SYSTEM_PROMPT = "You are a precise answering engine. Based on the keyword and question, provide a 'Yes' or 'No' answer."
# Few-shot demonstrations prepended to the user message; the real keyword and
# question are appended after them in the same "Keyword / Question / Answer" layout.
FEW_SHOT_EXAMPLES = """
[EXAMPLE 1]
Keyword: car
Question: Is it a living thing?
Answer: No
[EXAMPLE 2]
Keyword: water
Question: Is it used for cleaning?
Answer: Yes
[EXAMPLE 3]
Keyword: tree
Question: Is it man-made?
Answer: No
"""

class ValidatorModel:
    """Answering side ("validator") of the 20-questions game.

    Holds a secret keyword and answers yes/no questions about it with a
    4-bit quantized Mistral-7B-Instruct model. The model, the tokenizer and
    the pool of not-yet-used secret words are cached on the class, so only
    the first instantiation in a process loads the model; every instantiation
    draws a new secret keyword from the shared pool.
    """

    # Process-wide cache shared by all instances.
    _model = None
    _tokenizer = None
    _words_dataset = None

    def __init__(self):
        """Load (or reuse) the cached model and pick a fresh secret keyword."""
        if ValidatorModel._model is None:  # first instantiation in this process
            print("--- Loading model and tokenizer first time ---")
            # SECURITY: never commit an access token. The token is read from the
            # HF_TOKEN environment variable; any token that was previously
            # hardcoded in this file must be considered leaked and be revoked.
            hf_token = os.environ.get("HF_TOKEN")
            if hf_token:
                login(hf_token, add_to_git_credential=False)
            model_id = "mistralai/Mistral-7B-Instruct-v0.3"

            # Load and assign to the CLASS attributes
            ValidatorModel._tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

            # 4-bit NF4 quantization with double quantization, float16 compute.
            bnb_cfg = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16,
                                         bnb_4bit_use_double_quant=True,
                                         bnb_4bit_quant_type="nf4")

            ValidatorModel._model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto",
                quantization_config=bnb_cfg,
                trust_remote_code=True,
            ).eval()
            ValidatorModel._words_dataset = GAME_WORDS.copy()
            print("--- Model and tokenizer loaded and cached. ---")

        # Assign the cached model/tokenizer to this specific instance
        self.model = ValidatorModel._model
        self.tokenizer = ValidatorModel._tokenizer
        self.words_dataset = ValidatorModel._words_dataset

        # Once every word has been played, refill the shared pool in place so
        # that long evaluations do not crash on an empty list.
        if not self.words_dataset:
            self.words_dataset.extend(GAME_WORDS)

        self.keyword = random.choice(self.words_dataset)
        print(f"\n--- Validator secret word: {self.keyword} ---")
        self.words_dataset.remove(self.keyword)  # to not ask same word twice

    def _ask_ai(self, messages: List[Dict], max_new=8, temp=0.01) -> str:
        """Run the chat model on ``messages`` and return only the newly generated text.

        Args:
            messages: chat messages (dicts with "role" and "content") passed to
                the tokenizer's chat template.
            max_new: maximum number of tokens to generate.
            temp: value forwarded as ``temperature`` to ``generate``.
                NOTE(review): temperature normally only matters when sampling
                is enabled -- confirm against the model's generation config.

        Returns:
            The decoded continuation with the prompt tokens sliced off,
            special tokens skipped and surrounding whitespace stripped.
        """
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        out = self.model.generate(
            **inputs,
            max_new_tokens=max_new,
            temperature=temp,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id
        )
        # out[0] holds prompt + continuation; keep the continuation only.
        return self.tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()

    @staticmethod
    def _parse_yes_no(raw: str) -> str:
        """Return the first standalone "yes"/"no" word in ``raw``; default "no".

        Matching whole words avoids treating text such as "eyes" as a "yes".
        """
        for token in raw.lower().split():
            word = token.strip(".,!?:;\"'()[]*")
            if word in ("yes", "no"):
                return word
        return "no"

    def validate_question(self, question: str) -> str:
        """Ask the model whether ``question`` holds for the secret keyword.

        Returns:
            "yes" or "no" (lowercase). Anything that cannot be parsed as a
            clear "yes" is reported as "no".
        """
        user_content = f"{FEW_SHOT_EXAMPLES}\n[FINAL TASK]\nKeyword: {self.keyword}\nQuestion: {question}\nAnswer:"
        messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_content}]
        model_ans = self._ask_ai(messages)
        return self._parse_yes_no(model_ans)

    def validate_guess(self, guess: str) -> str:
        """Return 'Yes' if ``guess`` equals the secret keyword (case-insensitive), else 'No'."""
        return 'Yes' if guess and self.keyword and guess.lower() == self.keyword.lower() else 'No'

Overwriting test.py


In [6]:
%%writefile evaluate_20Q.py

from datasets import load_dataset
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import logging
import os
import argparse
logging.getLogger("transformers").setLevel(logging.ERROR)
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# Minimum marginal probability a question's "yes" and "no" outcomes must both
# reach; Guesser.choose_best_question skips questions below this value.
Threshold = 1e-3

class Guesser:
    """Bayesian 20-questions guesser that selects questions by minimum expected entropy.

    Keeps a table of P(answer = yes | keyword) for every (keyword, question)
    pair plus a prior over keywords, and offers helpers to pick the next
    question, update the posterior after an answer and list the top guesses.
    """

    def __init__(self, p_yes_table: pd.DataFrame, final_nouns: pd.DataFrame):
        """Build the guesser from the probability table and the noun list.

        Args:
            p_yes_table: a 'word' column plus one column per question; each
                question column holds P(yes | keyword). Rows are assumed to be
                in the same order as ``final_nouns`` -- no join on 'word' is
                performed here.
            final_nouns: must contain 'word' and 'frequency_nouns' columns;
                the frequencies are normalized into the prior P(k).

        Raises:
            ValueError: if the frequencies do not have a positive sum or the
                two tables have a different number of rows.
        """
        self.keywords = final_nouns['word'].values
        self.questions = p_yes_table.columns.drop('word').values
        self.question_to_idx = {q: i for i, q in enumerate(self.questions)}

        # Priors P(k). Work on a float copy: dividing the column's own array in
        # place fails for integer frequencies and would otherwise silently
        # overwrite the caller's DataFrame.
        frequencies = np.array(final_nouns['frequency_nouns'].values, dtype=np.float64)
        total_frequency = frequencies.sum()
        if not total_frequency > 0:
            raise ValueError("'frequency_nouns' must have a positive sum")
        self.priors = frequencies / total_frequency

        # Probability table P(q=yes | k), shape (num_keywords, num_questions)
        self.prob_table = p_yes_table[self.questions].values.astype(np.float32)

        self.num_keywords = len(self.keywords)
        self.num_questions = len(self.questions)
        if self.prob_table.shape[0] != self.num_keywords:
            raise ValueError("p_yes_table and final_nouns must have the same number of rows")

    @staticmethod
    def _entropy(distribution) -> float:
        """Shannon entropy in bits; zero entries are ignored (0 * log 0 := 0)."""
        positive = distribution[distribution > 0]
        return -np.sum(positive * np.log2(positive))

    def choose_best_question(self, current_probs, asked_questions, Monitor_Question=False, question_list=None):
        """Return the unasked question with the lowest expected posterior entropy.

        Args:
            current_probs: current distribution over keywords, P(k | history).
            asked_questions: questions already asked; these are skipped.
            Monitor_Question: when True, print diagnostics.
            question_list: questions whose expected entropy is printed when
                ``Monitor_Question`` is True; None means none are printed.

        Returns:
            The best question text, or None when every remaining question is
            filtered out by ``Threshold``.
        """
        min_expected_entropy = float('inf')
        best_question = None
        monitored = question_list if question_list is not None else ()

        for q_text in self.questions:
            if q_text in asked_questions:
                continue
            q_idx = self.question_to_idx[q_text]

            # P(q=yes|k) for the current question
            p_yes_k = self.prob_table[:, q_idx]

            # P(q=z|history) = sum_k P(q=z|k) * P(k|history)
            p_yes_q = np.sum(p_yes_k * current_probs)  # P(q=yes|history)
            p_no_q = 1.0 - p_yes_q  # P(q=no|history)

            # Important for limited resources: skip questions for which one of
            # the two answers is (almost) certain given the history. Such
            # questions barely split the candidates. The threshold cannot be
            # pushed towards 0.5 because the set of questions is limited.
            if p_yes_q < Threshold or p_no_q < Threshold:
                continue

            # Posterior after each possible answer:
            # P(k|history, q=z) = P(q=z|k) * P(k|history) / P(q=z|history)
            posterior_yes = (p_yes_k * current_probs) / p_yes_q
            posterior_no = ((1 - p_yes_k) * current_probs) / p_no_q

            # Expected entropy = sum_z P(q=z|history) * H(k | history, q=z)
            expected_entropy = (p_yes_q * self._entropy(posterior_yes)
                                + p_no_q * self._entropy(posterior_no))

            if Monitor_Question and q_text in monitored:
                print(f"\n=Expected Entropy for '{q_text}' = {expected_entropy}\n")

            if expected_entropy < min_expected_entropy:
                min_expected_entropy = expected_entropy
                best_question = q_text

        if Monitor_Question:
            print(f"best min_expected_entropy so far: {min_expected_entropy}")
        return best_question

    def update_probabilities(self, current_probs, question, answer: str):
        """Bayes update: P(k|new_history) = P(answer|k) * P(k|old_history) / P(answer).

        Args:
            current_probs: distribution over keywords before the answer.
            question: the question that was asked (must be a known question).
            answer: 'yes' or 'no' (case-insensitive).

        Returns:
            A new array with the updated distribution. If the answer has zero
            probability under every keyword the unnormalized (all-zero)
            array is returned.

        Raises:
            ValueError: if ``answer`` is neither 'yes' nor 'no'.
        """
        q_idx = self.question_to_idx[question]
        p_yes_k = self.prob_table[:, q_idx]
        normalized_answer = answer.strip().lower()
        if normalized_answer == 'yes':
            likelihood = p_yes_k
        elif normalized_answer == 'no':
            likelihood = 1 - p_yes_k
        else:
            raise ValueError("Answer must be 'yes' or 'no'")

        new_probs = current_probs * likelihood  # P(answer|k) * P(k|old_history)

        normalization_factor = np.sum(new_probs)  # P(answer)
        if normalization_factor > 0:
            new_probs /= normalization_factor

        return new_probs

    def get_top_guesses(self, current_probs, top_n=5, Monitor_Word=False, word_idx=None):
        """Return the ``top_n`` most likely keywords as ``{keyword: probability}``.

        The dict is ordered from most to least likely. When ``Monitor_Word``
        is True and ``word_idx`` is given, the current probability of that
        keyword is printed as well.
        """
        # Indices of the top N probabilities in descending order
        top_indices = np.argsort(current_probs)[::-1][:top_n]
        if Monitor_Word and word_idx is not None:
            print(f"\nMonitoring word: {self.keywords[word_idx]}, current probability: {current_probs[word_idx]}")
        return {self.keywords[i]: current_probs[i] for i in top_indices}



def play_entropy_game(p_table, nouns, manual=True, num_games=10, secret_word_manual=None):
    """Play 20-questions with the entropy-based guesser.

    Args:
        p_table: P(yes | keyword) table passed to ``Guesser``.
        nouns: keyword / frequency table passed to ``Guesser``.
        manual: if True a human answers on stdin and exactly one game is
            played; otherwise ``ValidatorModel`` (from ``test.py``) answers
            and ``num_games`` games are played.
        num_games: number of automatic games (ignored in manual mode).
        secret_word_manual: optional keyword whose probability is printed
            after every turn in manual mode.

    Each game lasts at most 20 turns; every turn asks one question and then
    makes one guess. A wrong guess is removed from the candidates. In
    automatic mode the final win count is printed.
    """
    guesser = Guesser(p_table, nouns)
    affirmative = {"yes", "true", "correct"}
    total_games = 1 if manual else num_games
    wins = 0

    # Resolve the monitored word to a single POSITION in guesser.keywords
    # (DataFrame index labels need not match array positions).
    word_idx = None
    if manual and secret_word_manual:
        matches = np.flatnonzero(guesser.keywords == secret_word_manual)
        if matches.size:
            word_idx = int(matches[0])
        else:
            print(f"'{secret_word_manual}' is not a known keyword; probability monitoring disabled.")

    for game_num in range(total_games):
        current_probabilities = guesser.priors.copy()
        asked_questions = set()
        game_won = False
        validator = None

        if not manual:
            try:
                from test import ValidatorModel
                validator = ValidatorModel()  # a new instance draws a new secret word
            except Exception as exc:
                # Automatic play is impossible without the validator; report
                # the real cause instead of failing later on a None validator.
                print(f"Can't get ValidatorModel: {exc}")
                raise
            print(f"--- Game {game_num + 1}/{num_games} ---")

        for turn in range(1, 21):
            question_to_ask = guesser.choose_best_question(current_probabilities, asked_questions)
            if not question_to_ask:
                print("No more informative questions.")
                break

            if manual:
                answer = input(f"{question_to_ask} (yes/no): ")
            else:
                answer = validator.validate_question(question_to_ask)

            # Anything that is not clearly affirmative counts as "no".
            answer = 'yes' if answer.strip().lower() in affirmative else 'no'
            print(f"Turn {turn}: Agent: {question_to_ask} Validator: {answer}")
            current_probabilities = guesser.update_probabilities(current_probabilities, question_to_ask, answer)
            asked_questions.add(question_to_ask)

            top_guesses = guesser.get_top_guesses(
                current_probabilities,
                Monitor_Word=word_idx is not None,
                word_idx=word_idx
            )
            top_guess_word = list(top_guesses.keys())[0]
            print(f"Agent's guess is: '{top_guess_word}'")

            if manual:
                print("Top 5 Probabilities:", {word: f"{prob:.2%}" for word, prob in top_guesses.items()})
                is_correct = input("Was this guess correct? (yes/no): ")
            else:
                is_correct = validator.validate_guess(top_guess_word)

            if is_correct.strip().lower() in affirmative:
                game_won = True
                if not manual:
                    wins += 1
                print(f"Result: Agent WON in {turn} turns.")
                break

            # Wrong guess: eliminate that word and re-normalize the rest.
            idx_to_remove = np.where(guesser.keywords == top_guess_word)[0]
            if len(idx_to_remove) > 0:
                current_probabilities[idx_to_remove[0]] = 0
                remaining_mass = current_probabilities.sum()
                if remaining_mass > 0:
                    current_probabilities /= remaining_mass

        if not game_won:
            print(f"Result: Agent LOST.")

    if not manual:
        print("\n--- Evaluation Finished ---")
        # Guard against num_games <= 0, for which no game was played.
        win_rate = wins / num_games if num_games > 0 else 0.0
        print(f"Final Score: {wins} / {num_games} wins ({win_rate:.2%})")


if __name__ == "__main__":
    # Command line: the number of evaluation games is mandatory.
    cli = argparse.ArgumentParser(description="Evaluate a 20 Questions agent.")
    cli.add_argument(
        "-N", "--num_games",
        type=int,
        required=True,
        help="The number of games to play for the evaluation.",
    )
    cli_args = cli.parse_args()

    # Both tables are configurations of the same Hugging Face dataset repo.
    hub_repo = "shahriar7/20_questions_game_QA"
    p_yes_frame = load_dataset(hub_repo, name='semantic_questions')['train'].to_pandas()
    nouns_frame = load_dataset(hub_repo, name='final_nouns')['train'].to_pandas()

    # Automatic evaluation: the validator model answers instead of a human.
    play_entropy_game(p_yes_frame, nouns_frame, manual=False, num_games=cli_args.num_games)

Overwriting evaluate_20Q.py


In [8]:
!python evaluate_20Q.py -N 50

--- Loading model and tokenizer first time ---
2025-08-27 00:30:36.620808: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756254636.648255     204 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756254636.656606     204 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:51<00:00, 17.15s/it]
--- Model and tokenizer loaded and cached. ---

--- Validator secret word: oil ---
--- Game 1/50 ---
Turn 1: Agent: Is it a man-made object? Validator: yes
Agent's guess is: 'home'
Turn 2: Agent: Is it portable? Validator: no
Agent's guess is: 'page'
Turn 3: Agent: Is it an abstract concept? Validator: no
Ag