# Installing modules

In [None]:
# 🔧 1) install (همون خط قبلی)
!pip install --quiet transformers accelerate bitsandbytes sentence-transformers #faiss-cpu
!pip install faiss-gpu-cu12

# Section 1 : Chat bot core

In [None]:
# hugging face token
# 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

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch, gc, os
from huggingface_hub import login

login("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")

model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

bnb_cfg = BitsAndBytesConfig(load_in_4bit=True,
                             bnb_4bit_compute_dtype=torch.float16,
                             bnb_4bit_use_double_quant=True,
                             bnb_4bit_quant_type="nf4")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_cfg,
    trust_remote_code=True,
).eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
import json, gc, os, time
from collections import deque
from typing import List, Dict

# ------------ tweakables -----------------------------------------------------
SYSTEM_PROMPT = "You are a helpful AI assistant."
MAX_CTX_TOKENS   = 8000 - 512        # keep 512 tokens headroom
SUMMARISE_AT_TOK = 6000              # start summarising above this
CHUNK_SIZE       = 12                # summarise 12 oldest turns each time
LOG_FILE         = "chatlog.jsonl"   # optional disk log
# -----------------------------------------------------------------------------

def num_tokens(text: str) -> int:
    # helper for quick token counting
    return len(tokenizer.encode(text))

def chat_completion(messages: List[Dict],  # messages[-1] must be user
                    max_new=256, temp=0.7, top_p=0.9):
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new,
        temperature=temp,
        top_p=top_p,
        eos_token_id=tokenizer.eos_token_id
    )
    reply = tokenizer.decode(out[0][inputs.input_ids.shape[1]:],
                             skip_special_tokens=True).strip()
    return reply


class MemoryChatbot:
    """Keeps the last N turns verbatim and auto-summarises earlier ones."""

    def __init__(self,
                 system_prompt: str = SYSTEM_PROMPT,
                 max_ctx_tokens: int = MAX_CTX_TOKENS,
                 summarise_at: int = SUMMARISE_AT_TOK,
                 chunk_size: int = CHUNK_SIZE):
        self.system_prompt = system_prompt
        self.max_ctx_tokens = max_ctx_tokens
        self.summarise_at   = summarise_at
        self.chunk_size     = chunk_size

        self.history = deque()    # list of {"role":..., "content":...}
        self.memo    = ""         # running summary of trimmed turns

    # ------------- public API -------------------------------------------------
    def ask(self, user_msg: str) -> str:
        """Main entry: add user message → maybe summarise → get reply."""
        self._append("user", user_msg)
        self._maybe_summarise()
        reply = self._generate_reply(user_msg)
        self._append("assistant", reply)
        return reply
    # -------------------------------------------------------------------------

    # ------------- internal helpers ------------------------------------------
    def _append(self, role, content):
        self.history.append({"role": role, "content": content})
        self._disk_log(role, content)

    def _current_messages(self) -> List[Dict]:
        msgs = [{"role": "system", "content": self.system_prompt}]
        if self.memo:
            msgs.append({"role": "assistant",
                         "content": f"[CONTEXT SUMMARY]\n{self.memo}"})
        msgs.extend(self.history)
        return msgs

    def _prompt_tokens(self) -> int:
        txt = tokenizer.apply_chat_template(self._current_messages(),
                                            tokenize=False)
        return num_tokens(txt)

    def _maybe_summarise(self):
        """If conversation is getting heavy, summarise oldest chunk."""
        while self._prompt_tokens() > self.summarise_at and len(self.history) > self.chunk_size:
            chunk = list(self.history)[:self.chunk_size]
            chunk_txt = "\n".join(f"{m['role']}: {m['content']}" for m in chunk)

            summary_prompt = [
                {"role": "system",
                 "content": "You are a summarisation assistant."},
                {"role": "user",
                 "content":
                 ("Summarise the following conversation in ≤8 bullet points, "
                  "preserve all factual details:\n\n" + chunk_txt)}
            ]
            summary = chat_completion(summary_prompt, max_new=160, temp=0.3)

            # remove chunk & prepend summary
            for _ in range(self.chunk_size):
                self.history.popleft()
            self.memo = (self.memo + "\n" + summary).strip()

            # free GPU RAM
            gc.collect(); torch.cuda.empty_cache()

            if self._prompt_tokens() < self.max_ctx_tokens:
                break

    def _generate_reply(self, user_msg):
        msgs = self._current_messages()
        reply = chat_completion(msgs)
        gc.collect(); torch.cuda.empty_cache()
        return reply

    def _disk_log(self, role, content):
        if LOG_FILE:
            with open(LOG_FILE, "a", encoding="utf-8") as f:
                json.dump({"ts": time.time(), "role": role,
                           "content": content}, f, ensure_ascii=False)
                f.write("\n")
    # -------------------------------------------------------------------------


In [20]:
# # ----------------------- quick demo ------------------------------------------
# bot = MemoryChatbot()

# qs = ["Hey there! How are you?",
#       "Can you suggest two contemporary architecture books?",
#       "What chapters do those books include?"]

# for q in qs:
#     print("👤", q)
#     print("🤖", bot.ask(q), "\n")

# # keep chatting … the bot will start summarising automatically

# Section 2: RAG

In [22]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import re   # ← add near the other imports
# ─────────────────────────────────────────────────────────────────────────────
# 1.  Install (once per session)
# ─────────────────────────────────────────────────────────────────────────────
!pip install -q "pymupdf>=1.22" faiss-cpu sentence-transformers

# ─────────────────────────────────────────────────────────────────────────────
# 2.  Build / load the FAISS index
#     • Scans every *.pdf in /kaggle/input/pdf-folder
#     • Extracts text with PyMuPDF
#     • Splits it into ≈700‑character chunks
#     • Embeds chunks with sentence‑transformers/all‑MiniLM‑L6‑v2
#     • Saves index + metadata to /kaggle/working for reuse
# ─────────────────────────────────────────────────────────────────────────────
import os, glob, json, itertools, math, pathlib
import fitz                           # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

DATA_DIR   = "/kaggle/input/pdf-folder"
INDEX_F    = "/kaggle/working/rag.index"
META_F     = "/kaggle/working/chunks.json"
CHUNK_SIZE = 700          # characters, ≈100 words

def extract_text(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

def chunk_text(text: str, size: int = 700):
    for start in range(0, len(text), size):
        yield text[start : start + size]

def clean(text: str) -> str:
    """
    Normalises the text so embeddings are not polluted by layout artefacts.

    • replaces hard line‑breaks (\n, \r) with a single space
    • keeps only ASCII letters, digits, and whitespace
    • collapses 2+ whitespace chars into one space
    """
    text = text.replace("\n", " ").replace("\r", " ")
    return text    
    # text = re.sub(r"[^A-Za-z0-9\s]+", " ", text)   # strip punctuation + accents
    # return re.sub(r"\s{2,}", " ", text).strip()


def build_index():
    pdf_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.pdf")))
    if not pdf_files:
        raise FileNotFoundError(f"No PDFs detected in {DATA_DIR}")

    chunks, meta = [], []
    for path in pdf_files:
        raw_text = extract_text(path)               # <-- original extractor
        raw_text = clean(raw_text)                  # <-- NEW: sanitise once
        for i, chunk in enumerate(chunk_text(raw_text, CHUNK_SIZE)):
            meta.append({"source": os.path.basename(path), "chunk_id": i, "text": chunk})
            chunks.append(chunk)

    print(f"✓ Extracted {len(chunks)} chunks from {len(pdf_files)} file(s).")

    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    vecs  = embed_model.encode(chunks, batch_size=32, show_progress_bar=True).astype("float32")

    index = faiss.IndexFlatL2(vecs.shape[1])
    index.add(vecs)

    faiss.write_index(index, INDEX_F)
    with open(META_F, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False)

    print(f"✓ Index saved to {INDEX_F}; metadata to {META_F}")

# Build only if we have not done so already
if not (pathlib.Path(INDEX_F).exists() and pathlib.Path(META_F).exists()):
    build_index()
else:
    print("Index already present – skipping rebuild.")
# build_index()
# ─────────────────────────────────────────────────────────────────────────────
# 3.  Lightweight retriever class (Section 2)
# ─────────────────────────────────────────────────────────────────────────────
class Retriever:
    """
    • Filters out FAISS ‘empty‑slot’ returns (id == ‑1, distance == FLT_MAX)
    • Converts L2 distance to cosine‑like similarity in [0, 1]
    """
    def __init__(self, idx_path="/kaggle/working/rag.index",
                       meta_path="/kaggle/working/chunks.json"):
        self.model  = SentenceTransformer("all-MiniLM-L6-v2")
        self.index  = faiss.read_index(idx_path)
        with open(meta_path, encoding="utf-8") as f:
            self.meta = json.load(f)
        # pre‑compute norms once for the conversion formula
        self._vec_norm = np.linalg.norm(
            self.index.reconstruct(0) ).astype("float32")  # all vectors same length

    def _l2_to_similarity(self, l2: float) -> float:
        # cosine_sim = 1 - (L2_dist²) / (2 * |a|²)   for unit‑length queries ≈ 1
        return max(0.0, 1.0 - l2 / (2 * self._vec_norm**2))

    def top_k(self, query: str, k: int = 3):
        q_vec  = self.model.encode([query]).astype("float32")
        D, I   = self.index.search(q_vec, k)
        hits   = []
        for rank, (idx, dist) in enumerate(zip(I[0], D[0])):
            if idx == -1 or np.isinf(dist) or dist > 1e8:
                continue                          # FAISS padding → skip
            hits.append({
                "rank"     : rank + 1,
                "source"   : self.meta[idx]["source"],
                "chunk_id" : self.meta[idx]["chunk_id"],
                "similarity": round(self._l2_to_similarity(dist), 3),
                "text"     : self.meta[idx]["text"].strip()
            })
        return hits


✓ Extracted 3 chunks from 1 file(s).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✓ Index saved to /kaggle/working/rag.index; metadata to /kaggle/working/chunks.json


In [23]:
# retriever = Retriever()
# for h in retriever.top_k("What is A + B + C?", k=3):
#     print(f"[{h['rank']}] sim={h['similarity']:.3f} • {h['source']} chunk {h['chunk_id']}\n{h['text']}\n")


In [24]:
from pathlib import Path

# make sure the index was built
assert Path("/kaggle/working/rag.index").exists(), "Run the build‑index cell first."

retriever = Retriever(                       # ← class from Section 2
    idx_path = "/kaggle/working/rag.index",
    meta_path = "/kaggle/working/chunks.json"
)



In [25]:
class RAGMemoryChatbot(MemoryChatbot):
    """
    Adds Retrieval‑Augmented Generation (RAG) on top of the Section 1 bot.
    History handling, summarisation, and disk logging remain untouched.
    """
    def __init__(self, retriever: Retriever, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.retriever = retriever
        
    def ask(self, user_msg: str, k: int = 3) -> str:
        # 1) keep the normal bookkeeping
        self._append("user", user_msg)
        self._maybe_summarise()

        # 2) fetch top‑k supporting passages (if any)
        rag_block = ""
        if self.retriever is not None:
            hits = self.retriever.top_k(user_msg, k=k)
            if hits:
                rag_block = ("Relevant background:" .join(f"[doc] {h['text']}" for h in hits))
                # rag_block = ("\n\n---\nRelevant background:\n" +
                #              "\n\n".join(f"[Doc] {h['text']}" for h in hits) +
                #              "\n---")

        # 3) build the prompt stack with the extra context
        messages = self._current_messages()             # system + history
        messages[0]["content"] += rag_block             # add to system prompt
        
        # 4) call exactly the same LLM wrapper you already use
        reply = chat_completion(messages,temp = 0.2)
        
        # 5) store assistant reply and continue as before
        self._append("assistant", reply)
        return reply
        
    # def ask(self, user_msg: str, k: int = 3) -> str:
    #     """Main entry: add user message → maybe summarise → get reply."""
    #     rag_block = ""
    #     if self.retriever is not None:
    #         hits = self.retriever.top_k(user_msg, k=k)
    #         if hits:
    #             rag_block = ("Relevant background:" .join(f"{h['text']}" for h in hits))
    #             # rag_block = ("\n\n---\nRelevant background:\n" +
    #             #              "\n\n".join(f"[Doc] {h['text']}" for h in hits) +
    #             #              "\n---")
    #     # print(rag_block)
    #     # print(self._current_messages())
        
    #     # messages = self._current_messages()             # system + history
    #     # messages[0]["content"] += rag_block             # add to system prompt
        
    #     self._append("user", rag_block)
    #     self._append("user", user_msg)
    #     # print(self._current_messages())
    #     self._maybe_summarise()
        
    #     reply = self._generate_reply(user_msg)
    #     self._append("assistant", reply)
    #     return reply
    # ──────────────────────────────────────────────────────────────────

# bot = RAGMemoryChatbot(retriever = retriever)

In [26]:
# question = "There are some information in doc about amount of A,B and C and it is not related to cat. According to this can you give me the result of (A+B)"
# question = "what is information about persian cat in doc?"
# answer   = bot.ask(question)            # same public method as before
# print(answer)

# Section 3 : Function call

Part 3.1

In [27]:
!pip install exa-py
!pip install openai
!pip install python-dotenv
%env EXA_API_KEY=af19a97b-45de-4ab4-8344-7029c5b7e7d6

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


env: EXA_API_KEY=af19a97b-45de-4ab4-8344-7029c5b7e7d6


In [28]:
# ─────────────────────────────────────────────────────────────────────────────
# Section 3 – Web-augmented RAG + Memory
# ─────────────────────────────────────────────────────────────────────────────
import requests, os, textwrap, gc, torch
from exa_py import Exa

EXA_ENDPOINT = "https://api.exa.ai/search"  

class WebRAGMemoryChatbot(RAGMemoryChatbot):
    """
    RAG + Memory + web trigger (Part 3.1)
    --------------------------------------------------------------------------
    • inherits history, summarisation, and local-PDF RAG from Section 2
    • adds:
        – _needs_web_search   → yes/no (with LLM)
        – _exa_search         → 3 hits  (Exa API)
        – _summarise_hits     → ≤120-word digest
    """
    # ---------- init ---------------------------------------------------------
    def __init__(self, exa_api_key: str, *args, **kwargs):
        super().__init__(*args, **kwargs)         # from RAGMemoryChatbot
        self.exa_api_key = exa_api_key
        self.exa = Exa(exa_api_key)   
    # ---------- helpers ------------------------------------------------------        
    def _needs_web_search(self, user_msg: str) -> bool:
        """
        Upgraded classifier to determine if a web search is necessary.
        """
        
        # The new, more descriptive prompt with criteria and examples
        prompt = f'''
        You are an expert classifier. Your task is to determine if a user's question requires a real-time web search.
        Reply with a single word: YES or NO.
        
        ## CRITERIA
        You must answer YES if the question asks for:
        - Current events or news (e.g., "what happened in France today?")
        - Real-time information (e.g., "what's the price of gold?", "what's the weather in London?")
        - Information about a very recent topic or public figure.
        
        You must answer NO if the question is about:
        - General knowledge (e.g., "what is the capital of Japan?")
        - Math, logic, or creative writing.
        - Information contained within provided documents.
        - A greeting or a question about your identity.
        
        ## EXAMPLES
        User: What is the weather like in Tehran today?
        Assistant: YES
        
        User: Can you tell me a story about a dragon?
        Assistant: NO
        
        User: what news are about Syria today?
        Assistant: YES
        
        User: what is information about persian cat in doc?
        Assistant: NO
        
        ## TASK
        Now, classify the following user question. Remember to only reply with YES or NO.
        
        User: {user_msg}
        Assistant:
        '''
    
        messages = [
            # The detailed instructions are now in the user message for better focus
            {"role": "user", "content": prompt}
        ]
        # Increase max_new_tokens to safely generate "yes" or "no"
        # Lower temperature for more stable, deterministic classification
        out = chat_completion(messages, max_new=4, temp=0.1).strip().lower()
        # Check for a clean "yes"
        return out.startswith("yes")

    def _exa_search(self, query: str, k: int = 3):
        """
        Uses exa_py.search_and_contents → one call does both search + content
        Returns a list[{title,url,summary}] for downstream summarisation.
        """
        if not self.exa_api_key:
            raise ValueError("❌ EXA_API_KEY is empty or not set.")
    
        # call the SDK – we want full page text, not just metadata
        response = self.exa.search_and_contents(
            query,
            text=True,                # full text of each result
            num_results=k,             # ← snake_case in SDK
            # contex = True,
            summary = True
        )                              # :contentReference[oaicite:0]{index=0}
        hits = []
        for r in response.results:     # ResultWithText objects
            hits.append({
                "title":   r.title,
                "url":     r.url,
                "text": (r.text or "")#[:600]      # first 400 chars
                ,"summary": r.summary
            })
        return hits


    def _summarise_hits(self, hits) -> str:
        joined = "\n".join(f"• {h['title']}: {h['summary']}" for h in hits)
        prompt = [
            {"role": "system",
             "content": "Summarise the following web snippets in ≤120 words."},
            {"role": "user", "content": joined}
        ]
        return chat_completion(prompt, max_new=160, temp=0.3).strip()


    # ---------- public API ---------------------------------------------------
    def ask(self, user_msg: str, k: int = 3) -> str:
        # 1) bookkeeping + maybe summarise
        self._append("user", user_msg)
        self._maybe_summarise()
        # 2) RAG (PDF)
        rag_block = ""
        if self.retriever is not None:
            hits = self.retriever.top_k(user_msg, k=k)
            if hits:
                rag_block = ("\n\n---\n[Local Docs]\n" +
                             "\n\n".join(f"{h['text']}" for h in hits) +
                             "\n---")

        # 3) Web-Trigger
        web_block = ""
        summaries = ""
        if self._needs_web_search(user_msg):
            try:
                web_hits = self._exa_search(user_msg, k=3)
                summary  = self._summarise_hits(web_hits)
                web_block = f"\n\n---\n[Web Info]\n{summary}\n---"
                print("Web search done!\n")

                for h in web_hits:
                    summaries+= h["summary"] 
            except Exception as e:
                print("⚠️  Web search failed:", e)
        

        # 4) build full prompt & get answer
        messages = self._current_messages()        # system + history (+memo)
        messages[0]["content"] += " Just Use Local Docs information when user asks about it.\n " + rag_block + web_block #+ "\nGive user the latest and newst information."
        
        reply = chat_completion(messages,max_new=512, temp=0.5)

        # 5) save reply
        self._append("assistant", reply)
        return reply


In [29]:
bot = WebRAGMemoryChatbot(
        exa_api_key=os.getenv("EXA_API_KEY"),
        retriever=retriever,
        system_prompt=SYSTEM_PROMPT
)
# q = "What is the weather like in Tehran today?"
# q = "What date is it today in tehran?"
# q = "what news are about Syria today?"
q = "What is the price of gold in dollars today?"
# q = "Who is the president of Iran right now?"
# q = "What new news is about Thailand?"
# q = "What is (a*b)+d - s if a = 1,b = 2,d = 4, s = 10"
answer = bot.ask(q)
print(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Web search done!

Based on the Local Docs information, there is no specific price for gold in dollars provided in the given data. However, according to the Web Info, the spot price of gold is currently $2,329.30 USD per ounce.


Part 3.2

In [30]:
# ─────────────────────────────────────────────────────────────────────────────
# Game-aware Web-RAG + Memory Chatbot  (20-Questions, EN-only)
# ─────────────────────────────────────────────────────────────────────────────
import re, json, textwrap, random
from exa_py import Exa        # already used in WebRAGMemoryChatbot
import re

QUESTION_FALLBACKS = [
    "Is it man-made?", "Is it bigger than a loaf of bread?",
    "Is it commonly found indoors?", "Is it electronic?"
]

GUESS_FALLBACKS = ["cat", "dog", "tree", "car", "phone", "apple", "lion"]

BAD_GUESSES = {"", "thing", "object", "based"}

class GameWebRAGMemoryChatbot(WebRAGMemoryChatbot):
    # --------------------------------------------------------------------- #
    # 0. constructor & single-point game-state init
    # --------------------------------------------------------------------- #
    def __init__(self, exa_api_key: str, *args, **kwargs):
        super().__init__(exa_api_key, *args, **kwargs)
        self._init_game_state()

    def _init_game_state(self):
        self.game_active     = False
        self.phase           = "ask"     # "ask" | "guess"
        self.questions_asked = 0
        self.max_questions   = 20
        self.history_qna     = []        # [(question, answer), …]
        self._asked_set      = set()     # lower-cased questions
        self._guess_set      = set()     # lower-cased guesses
    # --------------------------------------------------------------------- #
    # 1. intent detection helpers
    # --------------------------------------------------------------------- #
    def _detect_game_start(self, user_msg: str) -> bool:
        prompt = [
            {"role": "system",
             "content": ("You are a classifier. Respond with *exactly* YES or NO "
                         "(uppercase, no punctuation) to the following question:\n"
                         "Does the user explicitly ask to PLAY a 20-Questions game?\n"
                         "If they only greet, ask who you are, or anything else, answer NO.")},
            {"role": "user", "content": user_msg.strip()}
        ]
        reply = chat_completion(prompt,
                            max_new=1,
                            temp=0.0,
                            top_p=0.0)
        
        return reply.strip().lower().startswith("y")

    def _detect_quit(self, user_msg: str) -> bool:
        return bool(re.search(r"\b(quit|exit|cancel|stop|end|end game)\b",
                              user_msg, flags=re.I))

    def _generate_question(self) -> str:
       # ------------------------------------------------------------------ #
        # 1) dynamic LLM-generated question
        # ------------------------------------------------------------------ #        
        history_summary = "\\n".join([f"Q: {q} | A: {a}" for q, a in self.history_qna])
        asked_questions = "\\n".join(sorted(self._asked_set))
        
        system_prompt = (
            "You are playing 20 Questions. Your goal is to narrow down the possibilities with a strategic yes/no question. "
            "The yes/no question must be new and not a repeat of one asked before."
            "Ask general yes/no questions note very specific."
            "DO NOT ask for guess or between 2 word."
        )
        user_prompt = (
            f"Here is the conversation history:\\n{history_summary}\\n\\n"
            f"Here are the yes/no questions already asked:\\n{asked_questions}\\n\\n"
            "Based on the history, generate the next logical yes/no question to ask. The yes/no question should be concise and clear (under 12 words)."
        )
        
        prompt = [{"role": "system", "content": system_prompt},
                  {"role": "user",   "content": user_prompt}]
    

        candidate = chat_completion(prompt, max_new=20, temp=0.4).strip()
        candidate = candidate.split("?")[0].strip().capitalize() + "?"
        if (2 <= len(candidate.split()) <= 20
                and candidate.lower() not in self._asked_set):
            self._asked_set.add(candidate.lower())
            return candidate
    
        # ------------------------------------------------------------------ #
        # 2) ultimate static fall-back
        # ------------------------------------------------------------------ #
        FALLBACKS = [
            "Is it man-made?",
            "Is it bigger than a loaf of bread?",
            "Is it commonly found indoors?",
            "Is it electronic?",
        ]
        for fb in FALLBACKS:
            fb_l = fb.lower()
            if fb_l not in self._asked_set:
                self._asked_set.add(fb_l)
                return fb
    
        # If absolutely everything else fails
        return "Is it tangible?"
    
    def _generate_guess(self) -> str:

        history_summary = "\\n".join([f"Q: {q} | A: {a}" for q, a in self.history_qna])
        forbidden_guesses = ", ".join(self._guess_set)
        
        system_prompt = (
            "You are playing 20 Questions. Your task is to guess the secret word. "
            "Based on the history of questions and answers, provide a single-word guess with no explanation."
        )
        user_prompt = (
            f"Here is the history so far:\\n{history_summary}\\n\\n"
            f"Previously guessed words were: {forbidden_guesses}\\n\\n"
            "According to this, Guess new ONE-WORD? (one word only)"
        )
        prompt = [{"role": "system", "content": system_prompt},
                  {"role": "user",   "content": user_prompt or "[start]"}]

        g = chat_completion(prompt, max_new=4, temp=0.2).strip().split()[0]
        g = re.sub(r"[^a-zA-Z]", "", g).lower()
        if (g not in self._guess_set):
            return g
        for fb in GUESS_FALLBACKS:
            if fb not in self._guess_set:
                return fb
        return "idea"
    # --------------------------------------------------------------------- #
    # 3. public ask()  (state machine)
    # --------------------------------------------------------------------- #
    def ask(self, user_msg: str, k: int = 3) -> str:
        # -------- escape hatch --------
        if self.game_active and self._detect_quit(user_msg):
            self._init_game_state()
            return "Game stopped. Say 'play 20 questions' to start again."
        # -------- active game ---------
        if self.game_active:
            # user answered a QUESTION  → now make a GUESS
            if self.phase == "ask":
                self.history_qna[-1] = (self.history_qna[-1][0], user_msg.strip())
                guess = self._generate_guess()
                self._guess_set.add(guess.lower())
                self.phase = "guess"
                # return f"Is it **{guess}**? (Yes/No)"
                return guess.lower()
            # user judged our GUESS
            if self.phase == "guess":
                if user_msg.lower().startswith("yes"):
                    self._init_game_state()
                    return "🎉 I guessed it! Thanks for playing."
                # wrong guess
                self.questions_asked += 1
                if self.questions_asked >= self.max_questions:
                    self._init_game_state()
                    return "😔 I couldn't get it in 20 tries. You win!"
                # next question
                q = self._generate_question()
                self.history_qna.append((q, None))
                self._asked_set.add(q.lower())
                self.phase = "ask"
                return q + " (Yes/No)"
        # -------- start trigger -------
        if self._detect_game_start(user_msg):
            self._init_game_state()
            self.game_active = True
            first_q = self._generate_question()
            self.history_qna.append((first_q, None))
            self._asked_set.add(first_q.lower())
            intro = ("Let's play 20 Questions! Think of a word; "
                     "I'll guess in ≤20 yes/no questions.")
            return intro + "\n\n" + first_q + " (Yes/No)"
        # -------- normal chat ---------
        return super().ask(user_msg, k=k)


In [31]:
# bot = GameWebRAGMemoryChatbot(
#         exa_api_key=os.getenv("EXA_API_KEY"),
#         retriever=retriever,
#         system_prompt=SYSTEM_PROMPT
# )

# # user_ans = "Let's play 20 questions"
# # user_ans = "What is your name?"
# # user_ans = "Who are you?"
# # user_ans = "I don't want play 20 questions."
# user_ans = "I wanna play 20 question."
# querries = []
# i = 0
# while user_ans != "q":
#     model_ques = bot.ask(user_ans)
#     print(model_ques)
#     user_ans = input("what is your answer?")
#     # user_ans = "no"
    
#     querries.append({"Model Question":model_ques,"User answer":user_ans})
#     # print("Model Question: ",model_ques,", your answer is No")
#     # user_ans = "no"
#     # if i > 45:
#     #     break
#     # i += 1

In [32]:
# bot = GameWebRAGMemoryChatbot(
#         exa_api_key=os.getenv("EXA_API_KEY"),
#         retriever=retriever,
#         system_prompt=SYSTEM_PROMPT
# )

# user_ans = input("How can I help you?\n")
# querries = []

# while user_ans != "q":
#     model_ans = bot.ask(user_ans)
#     print(model_ans)
#     user_ans = input()

#     querries.append({"Model Question":model_ques,"User answer":user_ans})


In [33]:
# for q in querries:
#     print("Model question: ", q['Model Question'])
#     print("User answer: ", q['User answer'])

Validator

In [34]:
%%writefile validator_model.py

# ==== part0 ====
# !pip install --quiet transformers accelerate bitsandbytes sentence-transformers #faiss-cpu
# !pip install faiss-gpu-cu12

#===== part1 =====
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch, gc, os
from huggingface_hub import login
import json, gc, os, time
from collections import deque
from typing import List, Dict

login("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")

model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

bnb_cfg = BitsAndBytesConfig(load_in_4bit=True,
                             bnb_4bit_compute_dtype=torch.float16,
                             bnb_4bit_use_double_quant=True,
                             bnb_4bit_quant_type="nf4")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_cfg,
    trust_remote_code=True,
).eval()

# ==== part2 ====

def Ask_AI(messages: List[Dict],  # messages[-1] must be user
                    max_new=256, temp=0.7, top_p=0.9):
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new,
        temperature=temp,
        top_p=top_p,
        eos_token_id=tokenizer.eos_token_id
    )
    reply = tokenizer.decode(out[0][inputs.input_ids.shape[1]:],
                             skip_special_tokens=True).strip()
    return reply

# ===== part3 =====

game_words = [
    "cat", "dog", "cow", "horse", "rabbit", "lion", "bear", "shark", "eagle", "ant",
    "apple", "banana", "orange", "carrot", "bread", "cheese", "pizza", "cookie", "egg", "ice-cream",
    "chair", "table", "sofa", "bed", "lamp", "clock", "mirror", "door", "window", "carpet",
    "car", "bicycle", "bus", "train", "airplane", "boat", "rocket", "helmet", "engine", "wheel",
    "pencil", "pen", "book", "paper", "scissors", "ruler", "eraser", "backpack", "laptop", "phone",
    "ball", "doll", "puzzle", "kite", "yo-yo", "drum", "guitar", "camera", "radio", "television",
    "shirt", "pants", "jacket", "hat", "shoes", "gloves", "umbrella", "wallet", "watch", "glasses",
    "moon", "sun", "star", "cloud", "rain", "snow", "mountain", "river", "ocean", "island",
    "doctor", "teacher", "chef", "farmer", "artist", "pilot", "police", "firefighter", "singer", "dancer",
    "gold", "silver", "iron", "sand", "water", "oil", "soap", "sugar", "salt", "honey"
]

class ValidatorModel:
    def __init__(self,words = game_words):
        self.word_list = words
        self.keyword = words[0]
        self.turn = 0
        self.guess_number = 0
        
    def validate_question(self,question):        
        system_prompt = (f"Let's play 20 Questions. You are playing the role of the Answerer.The keyword is **{self.keyword}**.")
        
        user_prompt = (
            f'''The question is about the keyword **{self.keyword}** .
            Give yes-or-no answer about the keyword and surround your answer with just one word, like yes or no.
            The Question is **{question}**'''
        )
        
        prompt = [{"role": "system", "content": system_prompt},
                  {"role": "user",   "content": user_prompt}]
        model_ans = Ask_AI(prompt, max_new=4, temp=0).strip().split()[0]
        return model_ans

    def validate_guess(self,guess):
        # system_prompt = (f"Let's play 20 Questions. You are playing the role of guess validator.The keyword is {self.keyword}.")
        
        # user_prompt = (
        #     f"The keyword is {self.keyword} and the guess is {guess}.Give yes answer if the guess is exaclty same with keyword. Surround your answer with just one word, like yes or no."
        # )
        # prompt = [{"role": "system", "content": system_prompt},
        #           {"role": "user",   "content": user_prompt}]

        # model_ans = chat_completion(prompt, max_new=4, temp=0).strip().split()[0]
        
        # return model_ans
        self.guess_number += 1
        out = 'Yes' if guess.lower() == self.keyword.lower() else 'No'
        if out.lower() == 'yes' or self.guess_number == 20:
            self.turn += 1
            self.keyword = (self.word_list)[self.turn]
            self.guess_number = 0
            
        return(out)

Overwriting validator_model.py


In [35]:
# %load validator_model.py

# Creating List of One-syllable Common Nouns for Evaluation

In [41]:
import requests
import torch
import torch.nn.functional as F
import time
import math
import os
import nltk
from nltk.corpus import wordnet, stopwords, cmudict

# --- Step 1: Download and Prepare Word Lists ---

# Download Google's 10k common words list
try:
    url = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt"
    response = requests.get(url)
    response.raise_for_status()
    google_common_words = response.text.splitlines()
    print(f"Successfully loaded {len(google_common_words)} common words from Google's list.")
except requests.exceptions.RequestException as e:
    print(f"Error downloading Google's word list: {e}")
    google_common_words = []

# Download NLTK's WordNet for noun checking
try:
    wordnet.ensure_loaded()
except LookupError:
    print("Downloading 'wordnet' corpus...")
    nltk.download('wordnet')

# Download NLTK's stopwords list for filtering
try:
    stopwords.ensure_loaded()
except LookupError:
    print("Downloading 'stopwords' corpus...")
    nltk.download('stopwords')

try:
    # The CMU Pronouncing Dictionary for syllable counting
    arpabet = cmudict.dict()
except LookupError:
    print("Downloading cmudict...")
    nltk.download('cmudict')
    arpabet = cmudict.dict()


def count_syllables(word):
    """Counts syllables by checking vowel sounds in the CMU Pronouncing Dictionary."""
    word = word.lower()
    if word not in arpabet:
        return 0  # Cannot determine syllable count for words not in the dictionary

    # Count the number of phonemes that are vowels (indicated by a digit)
    pronunciation = arpabet[word][0]
    syllable_count = 0
    for phoneme in pronunciation:
        if phoneme[-1].isdigit():
            syllable_count += 1
    return syllable_count

# Create a set of all nouns from WordNet for very fast lookups
wordnet_nouns = {word for synset in wordnet.all_synsets('n') for word in synset.lemma_names()}
print(f"Loaded {len(wordnet_nouns)} unique nouns from WordNet.")

# Create a set of English stop words
stop_words = set(stopwords.words('english'))
print(f"Loaded {len(stop_words)} English stop words.")

# --- Step 2: Filter the Common Words to Get a Noun-Only, Stopword-Free List ---
common_nouns = []
if google_common_words and wordnet_nouns:
    for word in google_common_words:
        # Check if the word is a noun AND not a stop word
        if word in wordnet_nouns and word not in stop_words and count_syllables(word) == 1:
            common_nouns.append(word)
print(f"Filtered down to {len(common_nouns)} common nouns (after removing stopwords).")

# --- Step 3: Filter Tokenizer Vocabulary to Match the Noun List ---
# This gives us the exact token IDs we want to analyze.
print("Matching common nouns to the tokenizer's vocabulary...")
vocab = tokenizer.get_vocab()
filtered_token_ids = []
final_nouns = []
for noun in common_nouns:
    # A token can exist with or without a leading space ('Ġ' or ' '). We check for both.
    encoded = tokenizer.encode(f" {noun}", add_special_tokens=False)
    if len(encoded) == 1:
        final_nouns.append(noun)
        filtered_token_ids.append(encoded[0])
        # break # Move to the next noun once we find a match

final_nouns = list(set(final_nouns))
# Remove duplicates that might arise from the loop
filtered_token_ids = sorted(list(set(filtered_token_ids)))
print(f"Found {len(filtered_token_ids)} corresponding tokens to process.")

# --- Step 3: Save the Final, High-Quality Filtered List to a File ---
output_filename = "words.txt"
with open(output_filename, 'w', encoding='utf-8') as f:
    for noun in final_nouns:
        f.write(noun + '\n')
print(f"Saved the final filtered list of nouns to '{output_filename}'.")

Successfully loaded 9894 common words from Google's list.
Loaded 119034 unique nouns from WordNet.
Loaded 198 English stop words.
Filtered down to 1411 common nouns (after removing stopwords).
Matching common nouns to the tokenizer's vocabulary...
Found 1242 corresponding tokens to process.
Saved the final filtered list of nouns to 'words.txt'.


In [42]:
from validator_model  import ValidatorModel
# validator = ValidatorModel() # Using game_words list (default)
validator = ValidatorModel(final_nouns) # Using final_nouns list

In [43]:
bot = GameWebRAGMemoryChatbot(
    exa_api_key=os.getenv("EXA_API_KEY"),
    retriever=retriever,
    system_prompt=SYSTEM_PROMPT
)
q_all = []
win = 0
print_flag = False
# for keyword in game_words:
Number_of_games = input("How many games do you want?(<100)")
Number_of_games = int(Number_of_games)
Number_of_games = Number_of_games if Number_of_games <= 100 else 100
print(f"Playing {Number_of_games} games: ")
for _ in range(Number_of_games):
    # validator.keyword = keyword
    print("New keyword is: ",validator.keyword)
    validator_guess_ans = "I wanna play 20 question."
    querries = []
    while True:
        model_ques = bot.ask(validator_guess_ans)
        if model_ques == "🎉 I guessed it! Thanks for playing." or model_ques == "😔 I couldn't get it in 20 tries. You win!":
            if model_ques == "🎉 I guessed it! Thanks for playing.":
                win += 1
            print(model_ques)
            if print_flag:
                print(model_ques)
            break
        if print_flag:
            print(model_ques)
        validator_question_ans = validator.validate_question(model_ques)
        if print_flag:
            print(validator_question_ans)
        
        model_guess = bot.ask(validator_question_ans)
        if print_flag:
            print(model_guess)
        validator_guess_ans =  validator.validate_guess(model_guess)
        if print_flag:
            print(validator_guess_ans)
        querries.append({"Model Question":model_ques,"validator answer":validator_question_ans
                        ,"Model guess":model_guess ,"Validator answer_g":validator_guess_ans })
    q_all.append(querries)

How many games do you want?(<100) 10


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Playing 10 games: 
New keyword is:  six


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!
New keyword is:  town


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!
New keyword is:  shirt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!
New keyword is:  time


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!
New keyword is:  sub


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!
New keyword is:  text


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!
New keyword is:  ant


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

🎉 I guessed it! Thanks for playing.
New keyword is:  rough


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!
New keyword is:  whole


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!
New keyword is:  ten


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_

😔 I couldn't get it in 20 tries. You win!


In [51]:
print(f"{win} games won out of {Number_of_games} games")
for i,qu in enumerate(q_all):
    print("\nWord:",validator.word_list[i])
    for q in qu:
        print("Model question: ", q['Model Question'])
        print("Validator answer: ", q['validator answer'])
        print("Model guess: ",q['Model guess'])
        print("Validator answer: ",q['Validator answer_g'])

1 games won out of 10 games

Word: six
Model question:  Let's play 20 Questions! Think of a word; I'll guess in ≤20 yes/no questions.

Is it animate (living)? (Yes/No)
Validator answer:  No.
Model guess:  machine
Validator answer:  No
Model question:  Is it a solid object? (Yes/No)
Validator answer:  No
Model guess:  network
Validator answer:  No
Model question:  Q: is it a liquid or a gas? (Yes/No)
Validator answer:  No
Model guess:  mineral
Validator answer:  No
Model question:  Q: is it an electrical or magnetic entity? (Yes/No)
Validator answer:  No
Model guess:  space
Validator answer:  No
Model question:  Q: is it a substance that can be found in nature? (Yes/No)
Validator answer:  No.
Model guess:  data
Validator answer:  No
Model question:  Q: is it a man-made or artificial entity? (Yes/No)
Validator answer:  No
Model guess:  code
Validator answer:  No
Model question:  Q: is it a mathematical or logical concept? (Yes/No)
Validator answer:  Yes
Model guess:  algorithm
Validator 

# Part 4: UI interface 

In [None]:
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

bot = GameWebRAGMemoryChatbot(
    exa_api_key=os.getenv("EXA_API_KEY"),
    retriever=retriever,
    system_prompt=SYSTEM_PROMPT
)
# 2) ── Define the reply function Gradio will call ───────────────────────────────
def respond(message: str, history: list[tuple[str, str]]):
    """
    Parameters
    ----------
    message : latest user message
    history : list of (user, bot) tuples so far

    Returns
    -------
    assistant_response : str  (Gradio will append this to the chat)
    """
    assistant_text = bot.ask(message)
    return assistant_text

# 3) ── Spin up the Chat UI - it’s one line! ─────────────────────────────────────
demo = gr.ChatInterface(
    fn=respond,
    title="🔮  My Local LLM Chat",
    description="Ask me anything – I’m running on your GPU/CPU.",
    examples=["Hello!", "Explain reinforcement learning in 2 lines.", "چطور می‌توانم پرامپت بهتری بسازم؟"]
)

if __name__ == "__main__":
    demo.launch()


# My version

In [None]:
CHUNK_SIZE   = 700                                    # characters per chunk
TOP_K        = 3                                      # retrieve this many chunks
# retriever  = None            # will hold an instance once a PDF is uploaded
current_pdf_name = None      # just for the UI title/description
embedder   = SentenceTransformer("all-MiniLM-L6-v2")

bot = GameWebRAGMemoryChatbot(
    exa_api_key=os.getenv("EXA_API_KEY"),
    retriever=None,
    system_prompt=SYSTEM_PROMPT
)

class SimpleRetriever:
    """Holds the FAISS index + metadata for one document."""
    def __init__(self, vecs: np.ndarray, chunks: list[str]):
        self.index = faiss.IndexFlatL2(vecs.shape[1])
        self.index.add(vecs)
        self.chunks = chunks
        # all vectors encoded by MiniLM have the same length; save norm once
        self._norm = np.linalg.norm(vecs[0])

    def _sim(self, l2: float) -> float:          # convert L2 → cosine-like score
        return max(0.0, 1.0 - l2 / (2 * self._norm ** 2))

    def top_k(self, query: str, k: int = TOP_K):
        q = embedder.encode([query]).astype("float32")
        D, I = self.index.search(q, k)
        hits = []
        for rank, (idx, dist) in enumerate(zip(I[0], D[0])):
            if idx == -1:                 # should not happen here
                continue
            hits.append({
                "rank": rank + 1,
                "sim" : round(self._sim(dist), 3),
                "text": self.chunks[idx]
            })
        print(hits)
        return hits

def load_pdf(fileobj) -> str:
    """Extract all text from an uploaded PDF file-like object."""
    with fitz.open(fileobj.name) as doc:
        return "\n".join(page.get_text() for page in doc)
        
def clean(text: str) -> str:
    """Minimal cleanup – collapse whitespace, keep punctuation."""
    text = re.sub(r"\s+", " ", text).strip()
    return text

def chunk_text(text: str, size: int = CHUNK_SIZE):
    for start in range(0, len(text), size):
        yield text[start : start + size]

# 2) ── Define the reply function Gradio will call ───────────────────────────────
def respond(message: str, history: list[tuple[str, str]]):
    # print(history)
    assistant_text = bot.ask(message)
    return assistant_text

def build_retriever(pdf_file):
    """Create retriever from the uploaded PDF, store it in global state."""
    global current_pdf_name
    text = clean(load_pdf(pdf_file))
    
    chunks = list(chunk_text(text))
    vecs  = embedder.encode(chunks, batch_size=32, show_progress_bar=False).astype("float32")
    
    bot.retriever = SimpleRetriever(vecs, chunks)
    current_pdf_name = os.path.basename(pdf_file.name)
    
    return f"✅ Loaded **{current_pdf_name}** with {len(chunks)} chunks." 


# 3) ── Spin up the Chat UI - it’s one line! ─────────────────────────────────────

with gr.Blocks(title="LLM Chat") as demo:
    gr.Markdown("## Chat features\n"
                "1. You can upload **one** PDF via the file box and ask about it.\n"
                "2. Ask questions – the chat will automatically consult the file.\n"
                "3. You can also ask about web info.\n"
                 "4. You can play 20-question game.")
    upload_box = gr.File(label="📎  Upload a PDF", file_types=[".pdf"], file_count="single")
    status_box = gr.Markdown()
    chat = gr.ChatInterface(
                        fn=respond,
                        title="🔮  My Local LLM Chat",
                        description="Ask me anything – I’m running on your GPU/CPU.",
                        examples=["Hello!", "Explain reinforcement learning in 2 lines.", "چطور می‌توانم پرامپت بهتری بسازم؟"]
                        )
    upload_box.upload(build_retriever, upload_box, status_box)

if __name__ == "__main__":
    demo.launch()