In [None]:
import os
import json
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
from sklearn.metrics.pairwise import cosine_similarity



[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which provides google.colab).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install -q kaggle


In [None]:
# Copy kaggle.json from the mounted Drive into ~/.kaggle and restrict it to owner read/write.
!mkdir -p ~/.kaggle
!cp "/content/drive/My Drive/Colab Notebooks/nlp project/kaggle.json" ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the WordNet dictionary/thesaurus CSV archive from Kaggle into the working directory.
!kaggle datasets download -d dfydata/wordnet-dictionary-thesaurus-files-in-csv-format


Dataset URL: https://www.kaggle.com/datasets/dfydata/wordnet-dictionary-thesaurus-files-in-csv-format
License(s): CC0-1.0
Downloading wordnet-dictionary-thesaurus-files-in-csv-format.zip to /content
  0% 0.00/11.1M [00:00<?, ?B/s]
100% 11.1M/11.1M [00:00<00:00, 782MB/s]


In [None]:
!unzip -q wordnet-dictionary-thesaurus-files-in-csv-format.zip


In [None]:
# List the working directory, one entry per line, to confirm the archive was extracted.
!ls -1


drive
sample_data
WordNet-DictionaryThesaurus
wordnet-dictionary-thesaurus-files-in-csv-format.zip


In [None]:
import pandas as pd

# Load the WordNet relation tables extracted from the Kaggle archive.
# keep_default_na=False stops pandas from converting genuine dictionary entries
# such as "null" or "nan" into missing values; empty cells are read as "".
syn_df = pd.read_csv('WordNet-DictionaryThesaurus/WordnetSynonyms.csv', keep_default_na=False)
ant_df = pd.read_csv('WordNet-DictionaryThesaurus/WordnetAntonyms.csv', keep_default_na=False)
hyp_df = pd.read_csv('WordNet-DictionaryThesaurus/WordnetHypernyms.csv', keep_default_na=False)

# hypo_df = pd.read_csv('WordNet-DictionaryThesaurus/WordnetHyponyms.csv')

# Preview the first rows of each table.
print("Synonyms sample:\n", syn_df.head(), "\n")
print("Antonyms sample:\n", ant_df.head(), "\n")
print("Hypernyms sample:\n", hyp_df.head(), "\n")


Synonyms sample:
      Word  Count        POS                                           Synonyms
0    a-ok      4  satellite                                             a-okay
1  a-okay      6  satellite                                               a-ok
2   a-one      5  satellite  ace;A-one;crack;first-rate;super;tiptop;topnot...
3    a.m.      4  satellite                                      ante meridiem
4    a.m.      4     adverb                                 ante meridiem;A.M. 

Antonyms sample:
            Word  Count        POS      Antonyms
0    a la carte     10  adjective  table d'hote
1  a posteriori     12  adjective      a priori
2  a posteriori     12     adverb      a priori
3      a priori      8  adjective  a posteriori
4      a priori      8     adverb  a posteriori 

Hypernyms sample:
   lemma  Count part_of_speech hypernyms
0     0      1           noun     digit
1  0.22      4           noun   firearm
2     1      1           noun     digit
3     2      1     

In [None]:
# Relation lookup tables; the loops further down fill each one with word -> set-of-words entries.
synonyms_dict, antonyms_dict, hypernyms_dict = {}, {}, {}

def clean_and_split(entry):
    """Split a ';' or '|' separated relation string into clean single words.

    Args:
        entry: Raw cell value, e.g. ``"ace;A-one;crack"``. Anything that is
            not a string (``None``, a float NaN from pandas, ...) is treated
            as missing.

    Returns:
        A set of lower-cased tokens. Only purely alphabetic tokens are kept,
        so multi-word, hyphenated and numeric entries are dropped. Missing,
        blank or ``'nan'`` input yields an empty set.
    """
    # Raw DataFrame cells may be float NaN or None; they have no .replace().
    if not isinstance(entry, str):
        return set()
    text = entry.strip()
    # 'nan' is what str() produces for a missing pandas cell.
    if not text or text.lower() == 'nan':
        return set()
    tokens = (piece.strip() for piece in text.replace(';', '|').split('|'))
    return {token.lower() for token in tokens if token.isalpha()}

def _relation_rows(df, word_col, related_col):
    """Yield ``(word, related_words)`` for every usable row of a relation table.

    Rows whose word or relation cell is missing are skipped *before* the
    values are stringified; otherwise ``str(NaN)`` would create a bogus
    ``'nan'`` entry that collects the relations of every missing word.
    The word itself is removed from its own relation set, and rows left with
    no related words are not yielded.
    """
    # zip over the two columns is much cheaper than DataFrame.iterrows().
    for raw_word, raw_related in zip(df[word_col], df[related_col]):
        if pd.isna(raw_word) or pd.isna(raw_related):
            continue
        word = str(raw_word).strip().lower()
        if not word:
            continue
        related = clean_and_split(str(raw_related).strip().lower())
        related.discard(word)
        if related:
            yield word, related


# --- Building synonyms_dict from WordnetSynonyms.csv ---
for word, synonyms in _relation_rows(syn_df, "Word", "Synonyms"):
    for syn in synonyms:
        synonyms_dict.setdefault(word, set()).add(syn)
        synonyms_dict.setdefault(syn, set()).add(word)  # symmetric

# --- Building antonyms_dict from WordnetAntonyms.csv ---
for word, antonyms in _relation_rows(ant_df, "Word", "Antonyms"):
    for ant in antonyms:
        antonyms_dict.setdefault(word, set()).add(ant)
        antonyms_dict.setdefault(ant, set()).add(word)  # symmetric

# --- Building hypernyms_dict from WordnetHypernyms.csv ---
# Hypernymy is directional, so only word -> hypernym is stored; lemmas that are
# not purely alphabetic (numbers, phrases) are ignored.
for word, hypernyms in _relation_rows(hyp_df, "lemma", "hypernyms"):
    if not word.isalpha():
        continue
    for hyper in hypernyms:
        hypernyms_dict.setdefault(word, set()).add(hyper)

print(f"✅ Cleaned and loaded {len(synonyms_dict)} words with synonyms")
print(f"✅ Cleaned and loaded {len(antonyms_dict)} words with antonyms")
print(f"✅ Cleaned and loaded {len(hypernyms_dict)} words with hypernyms")


✅ Cleaned and loaded 89157 words with synonyms
✅ Cleaned and loaded 9816 words with antonyms
✅ Cleaned and loaded 45256 words with hypernyms


In [None]:
# Spot-check one lookup per relation table; .get falls back to [] when the word is absent.
print("📌 Clean synonyms for 'smart':", synonyms_dict.get("smart", []))
print("📌 Clean antonyms for 'hot':", antonyms_dict.get("hot", []))
print("📌 Clean hypernyms for 'dog':", hypernyms_dict.get("dog", []))


📌 Clean synonyms for 'smart': {'hurt', 'smartness', 'fresh', 'chic', 'impertinent', 'ache', 'fresher', 'sassy', 'impudent', 'wise', 'voguish', 'aching', 'saucy', 'hurting', 'bright', 'smarting', 'overbold'}
📌 Clean antonyms for 'hot': {'cold'}
📌 Clean hypernyms for 'dog': {'villain', 'canine', 'pursue', 'chap', 'support', 'catch', 'sausage'}


In [None]:
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove/


In [None]:
# Building set of words used in our relations
vocab_of_interest = set()

# Including all base words (keys)
vocab_of_interest.update(synonyms_dict.keys())
vocab_of_interest.update(antonyms_dict.keys())
vocab_of_interest.update(hypernyms_dict.keys())

for rel_dict in [synonyms_dict, antonyms_dict, hypernyms_dict]:
    for related_words in rel_dict.values():
        vocab_of_interest.update(related_words)

# Cleaning non-alphabetic entries
vocab_of_interest = {w for w in vocab_of_interest if w.isalpha()}
print(f"🧠 Vocab of interest has {len(vocab_of_interest)} clean single-word entries.")


🧠 Vocab of interest has 70232 clean single-word entries.


In [None]:
import numpy as np


# Load only the GloVe vectors for words that occur in vocab_of_interest.
glove_path = "glove/glove.6B.100d.txt"
embedding_dim = 100
word_vectors = {}

with open(glove_path, 'r', encoding='utf8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.strip().split()
        # A well-formed line is the token followed by embedding_dim numbers.
        if len(fields) == embedding_dim + 1 and fields[0] in vocab_of_interest:
            word_vectors[fields[0]] = np.array(fields[1:], dtype='float32')

print(f"✅ Loaded {len(word_vectors)} filtered word vectors.")


✅ Loaded 48623 filtered word vectors.


In [None]:
# Sanity-check the loaded vectors using 'apple' as a probe word.
print("📌 Vector for 'apple':", word_vectors.get("apple", "Not found"))
print("📌 Vector shape:", word_vectors["apple"].shape if "apple" in word_vectors else "N/A")


📌 Vector for 'apple': [-0.5985    -0.46321    0.13001   -0.019576   0.4603    -0.3018
  0.8977    -0.65634    0.66858   -0.49164    0.037557  -0.050889
  0.6451    -0.53882   -0.3765    -0.04312    0.51384    0.17783
  0.28596    0.92063   -0.49349   -0.48583    0.61321    0.78211
  0.19254    0.91228   -0.055596  -0.12512   -0.65688    0.068557
  0.55629    1.611     -0.0073642 -0.48879    0.45493    0.96105
 -0.063369   0.17432    0.9814    -1.3125    -0.15801   -0.54301
 -0.13888   -0.26146   -0.3691     0.26844   -0.24375   -0.19484
  0.62583   -0.7377     0.38351   -0.75004   -0.39053    0.091498
 -0.36591   -1.4715    -0.45228    0.2256     1.1412    -0.38526
 -0.06716    0.57288   -0.39191    0.31302   -0.29235   -0.96157
  0.15154   -0.21659    0.25103    0.096967   0.2843     1.4296
 -0.50565   -0.51374   -0.47218    0.32036    0.023149   0.22623
 -0.09725    0.82126    0.92599   -1.0086    -0.38639    0.86408
 -1.206     -0.28528    0.2265    -0.38773    0.40879    0.59303
  

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(vec1, vec2):
    """Compute cosine similarity between two vectors.

    The previous body called ``cosine_sim`` itself and therefore recursed
    until ``RecursionError``; the similarity is now computed directly.

    Args:
        vec1, vec2: Array-likes holding the same number of elements. Any
            shape is accepted (e.g. ``(d,)`` or ``(1, d)``); inputs are
            flattened first.

    Returns:
        The cosine similarity as a Python float. ``0.0`` is returned when
        either input is ``None`` or has zero norm.
    """
    if vec1 is None or vec2 is None:
        return 0.0
    a = np.asarray(vec1, dtype=float).ravel()
    b = np.asarray(vec2, dtype=float).ravel()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        # Similarity with a zero vector is undefined; report "unrelated".
        return 0.0
    return float(np.dot(a, b) / denom)

def get_candidates_for_targets(target_words):
    """
    Collect candidate clue words for a list of target words.

    Candidates are the synonyms and hypernyms of each (lower-cased) target,
    minus the targets themselves. Only candidates without spaces that have
    an entry in ``word_vectors`` are returned, as a set.
    """
    lowered_targets = [target.lower() for target in target_words]

    related = set()
    for target in lowered_targets:
        for table in (synonyms_dict, hypernyms_dict):
            if target in table:
                related.update(table[target])

    # A target may never be offered as its own clue.
    related.difference_update(lowered_targets)

    return {clue for clue in related if ' ' not in clue and clue in word_vectors}

# Quick demo: print the candidate clues gathered for two sample targets.
sample_targets = ["dog", "cat"]
print("🎯 Candidate clues for [dog, cat]:", get_candidates_for_targets(sample_targets))


🎯 Candidate clues for [dog, cat]: {'tailing', 'spew', 'purging', 'bounder', 'click', 'chased', 'blackguard', 'chuck', 'detent', 'trailing', 'purge', 'tail', 'cad', 'vomit', 'stimulant', 'trail', 'barf', 'track', 'chase', 'hound', 'catch', 'kat', 'guy', 'khat', 'gossip', 'canine', 'ct', 'heel', 'sick', 'quat', 'weenie', 'hotdog', 'bozo', 'vomiting', 'frankfurter', 'regurgitate', 'honk', 'woman', 'tagged', 'dogging', 'villain', 'feline', 'caterpillar', 'wiener', 'frank', 'puke', 'casting', 'chap', 'retch', 'excrete', 'tails', 'dogged', 'puking', 'cast', 'flog', 'frump', 'man', 'disgorge', 'hombre', 'tracked', 'tracking', 'tag', 'qat', 'pursue', 'support', 'tailed', 'pawl', 'whip', 'sausage'}


In [None]:
import random

class Board:
    """A 25-word Codenames board with secret roles and reveal tracking.

    Attributes:
        word_roles: dict mapping each board word to 'blue', 'red',
            'neutral' or 'assassin'.
        revealed: dict mapping each board word to True once it was guessed.
        first_team: the team that received the extra (9th) word.
    """

    def __init__(self, words, first_team='blue', rng=None):
        """
        Initialize the board with a given collection of words and assign roles.

        Args:
            words: candidate words; 25 distinct ones are sampled for the board.
                Duplicates are ignored.
            first_team: 'blue' or 'red' - the team that gets 9 words instead of 8.
            rng: optional ``random.Random`` instance for reproducible boards;
                the module-level ``random`` generator is used when omitted.

        Raises:
            ValueError: if ``first_team`` is neither 'blue' nor 'red'.
            AssertionError: if fewer than 25 distinct words are supplied.
        """
        # Any other value would yield only 24 roles and fail later with an IndexError.
        if first_team not in ('blue', 'red'):
            raise ValueError(f"first_team must be 'blue' or 'red', got {first_team!r}.")

        # De-duplicate (order-preserving) so the board really holds 25 distinct
        # words; this also turns sets into a sequence, which random.sample requires.
        unique_words = list(dict.fromkeys(words))
        assert len(unique_words) >= 25, "Need at least 25 words to populate the board."

        rng = random if rng is None else rng
        board_words = rng.sample(unique_words, 25)

        roles = ['blue'] * (9 if first_team == 'blue' else 8) \
              + ['red'] * (9 if first_team == 'red' else 8) \
              + ['neutral'] * 7 + ['assassin']
        rng.shuffle(roles)

        self.word_roles = dict(zip(board_words, roles))
        self.revealed = dict.fromkeys(board_words, False)
        self.first_team = first_team

    def reveal_word(self, word):
        """Mark a word as revealed (guessed); words not on the board are ignored."""
        if word in self.revealed:
            self.revealed[word] = True

    def get_role(self, word):
        """Get the secret role of a word ('blue', 'red', 'neutral', 'assassin'), or None if absent."""
        return self.word_roles.get(word, None)

    def remaining_words(self, team=None):
        """Return list of unrevealed words, optionally filtered by team."""
        words = [w for w, rev in self.revealed.items() if not rev]
        if team:
            return [w for w in words if self.word_roles[w] == team]
        return words

    def get_unrevealed_words(self):
        """Return all unrevealed words (alias of ``remaining_words()`` with no team filter)."""
        return self.remaining_words()

    def all_words(self):
        """Return all board words (revealed or not)."""
        return list(self.word_roles.keys())

    def __str__(self):
        """For debugging: show all word roles (revealed or not)."""
        return "\n".join([f"{w}: {self.word_roles[w]}{' [REVEALED]' if self.revealed[w] else ''}" for w in self.word_roles])


In [None]:
# Build a demo board by sampling from every word that has a loaded vector,
# then print the secret role assignment.
board_words = list(word_vectors.keys())
test_board = Board(board_words, first_team='blue')

print("🔹 Words assigned to roles:")
print(test_board)


🔹 Words assigned to roles:
oversupply: blue
birdcage: neutral
agave: blue
concentric: red
beefeater: neutral
cosmographer: red
claudius: red
obscenely: blue
sheltered: neutral
deviationist: neutral
elmwood: red
garcinia: blue
katabatic: blue
tolu: blue
band: blue
bodacious: blue
prowl: red
fbi: blue
lodgepole: red
bobwhite: assassin
fireball: red
misunderstood: neutral
refilling: neutral
conduction: neutral
brainpower: red


In [None]:
from nltk.corpus import wordnet as wn
import random

class ClueGenerator:
    """Embedding-based spymaster that proposes a ``(clue, number)`` pair.

    Similarities are computed inside the class so it does not depend on which
    ``cosine_sim`` definition is currently bound in the notebook.
    """

    def __init__(self, embedding_dict, sim_threshold=0.3):
        """
        Args:
            embedding_dict: mapping word -> 1-D vector.
            sim_threshold: minimum cosine similarity for a team word to be
                counted towards the clue number.
        """
        self.embedding = embedding_dict
        self.sim_threshold = sim_threshold

    @staticmethod
    def _similarity(vec_a, vec_b):
        """Cosine similarity of two vectors as a float (0.0 for zero-norm input)."""
        a = np.asarray(vec_a, dtype=float).ravel()
        b = np.asarray(vec_b, dtype=float).ravel()
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(np.dot(a, b) / denom) if denom else 0.0

    def score_clue(self, clue, target_words, other_words):
        """Score clue: high for closeness to targets, low for others.

        Returns the sum of similarities to ``target_words`` minus the sum of
        similarities to ``other_words``; words without a vector are skipped.
        A clue without a vector scores ``-inf``.
        """
        vec_c = self.embedding.get(clue)
        if vec_c is None:
            return -float('inf')
        pos_score = sum(
            self._similarity(vec_c, self.embedding[t])
            for t in target_words if t in self.embedding
        )
        neg_score = sum(
            self._similarity(vec_c, self.embedding[o])
            for o in other_words if o in self.embedding
        )
        return pos_score - neg_score

    def generate_clue(self, board, team):
        """Generate a (clue, number) pair based on current board state.

        Returns ``(None, 0)`` when ``team`` has no unrevealed words left.
        Otherwise the number is at least 1, so the guesser always gets to act.
        """
        team_words = board.remaining_words(team=team)
        other_words = [w.lower() for w in board.remaining_words() if w not in team_words]
        team_words = [w.lower() for w in team_words]
        if not team_words:
            return None, 0

        # Built once; a clue that equals a board word is illegal.
        board_vocab = {w.lower() for w in board.word_roles}

        # Step 1: gather candidates from the relation tables.
        candidates = set(get_candidates_for_targets(team_words))
        if not candidates:
            # Fallback: using WordNet hypernyms
            for w in team_words:
                for syn in wn.synsets(w):
                    for hyper in syn.hypernyms():
                        candidates.add(hyper.lemmas()[0].name().lower())
            if not candidates:
                candidates = vocab_of_interest - board_vocab

        # Step 2: filtering clues that are on board (illegal).
        # Sorted so that ties are broken deterministically rather than by set order.
        legal_candidates = sorted(c for c in candidates if c not in board_vocab)

        # Step 3: scoring and select the best clue
        best_clue = None
        best_score = -float('inf')
        for clue in legal_candidates:
            score = self.score_clue(clue, team_words, other_words)
            if score > best_score:
                best_score = score
                best_clue = clue

        if best_clue is None:
            # Last resort kept from the original behaviour: no scorable candidate
            # exists, so name one of the team's own words.
            return random.choice(team_words), 1

        # Step 4: counting how many targets the clue likely points to
        vec_c = self.embedding[best_clue]
        number = sum(
            1 for t in team_words
            if t in self.embedding
            and self._similarity(vec_c, self.embedding[t]) >= self.sim_threshold
        )
        # A count of 0 would make the guesser return no guesses at all.
        return best_clue, max(1, number)


In [None]:
# Ask the clue generator for a blue-team clue on the demo board.
cg = ClueGenerator(word_vectors)
clue, num = cg.generate_clue(test_board, team='blue')
print(f"🧠 Suggested clue: '{clue}' for {num} word(s).")


🧠 Suggested clue: 'flood' for 0 word(s).


In [None]:
class Guesser:
    """Embedding-based field operative: ranks unrevealed board words by similarity to a clue."""

    def __init__(self, embedding_dict):
        """Store the word -> vector mapping used for similarity lookups."""
        self.embedding = embedding_dict

    @staticmethod
    def _similarity(vec_a, vec_b):
        """Cosine similarity of two vectors as a float (0.0 for zero-norm input)."""
        a = np.asarray(vec_a, dtype=float).ravel()
        b = np.asarray(vec_b, dtype=float).ravel()
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(np.dot(a, b) / denom) if denom else 0.0

    def guess_words(self, clue_word, number, board):
        """
        Given a clue and a number, return a list of up to 'number' guesses for the board.

        Guesses are unrevealed board words ordered from most to least similar
        to the clue. An empty list is returned when the clue is empty, has no
        vector, or ``number`` is not positive.
        """
        if not clue_word:
            return []
        # Board words are looked up lower-cased, so do the same for the clue
        # (falling back from the exact spelling).
        clue_vec = self.embedding.get(clue_word)
        if clue_vec is None:
            clue_vec = self.embedding.get(clue_word.lower())
        if clue_vec is None:
            return []

        scores = []
        for word, revealed in board.revealed.items():
            if revealed:
                continue
            vec_w = self.embedding.get(word.lower())
            if vec_w is None:
                continue
            scores.append((self._similarity(clue_vec, vec_w), word))

        scores.sort(reverse=True)
        # max(0, ...) so a negative number cannot slice "all but the last k".
        return [w for _, w in scores[:max(0, number)]]


In [None]:
# End-to-end check on the demo board: generate a blue clue, then let the guesser answer it.
clue_gen = ClueGenerator(word_vectors)
guesser = Guesser(word_vectors)

clue, number = clue_gen.generate_clue(test_board, team='blue')
print(f"🤖 Clue: '{clue}' ({number})")

guesses = guesser.guess_words(clue, number, test_board)
print("🎯 Guesser's guesses:", guesses)


🤖 Clue: 'flood' (0)
🎯 Guesser's guesses: []


In [None]:
class RLClueAgent(ClueGenerator):
    """Placeholder for a reinforcement-learning spymaster.

    For now it behaves exactly like ``ClueGenerator``; the hooks below mark
    where the learned policy is meant to be added.
    """

    def __init__(self, embedding_dict):
        """Initialise the underlying ``ClueGenerator`` with ``embedding_dict``."""
        # TODO: add the policy network and memory here later.
        super().__init__(embedding_dict)

    def generate_clue(self, board, team):
        """Return the parent's ``generate_clue(board, team)`` result unchanged."""
        # TODO: put the RL policy here as well.
        clue_and_number = super().generate_clue(board, team)
        return clue_and_number


In [None]:
# Candidate board words: single tokens that have both a synonym entry and a vector.
all_words_list = [
    w for w in synonyms_dict
    if w in word_vectors and ' ' not in w and '_' not in w
]

game_board = Board(words=all_words_list, first_team='blue')
print("🧩 Game Board (Secret Roles Shown Below - Normally Hidden):")
print(game_board)

rl_agent = RLClueAgent(word_vectors)
guesser = Guesser(word_vectors)

clue_word, clue_number = rl_agent.generate_clue(game_board, team='blue')
print(f"\n🧠 Clue for BLUE team: \"{clue_word}\" with number {clue_number}")

guesses = guesser.guess_words(clue_word, clue_number, game_board)
print(f"\n🎯 Guesser's guesses (up to {clue_number}): {guesses}")

for guess in guesses:
    game_board.reveal_word(guess)
    role = game_board.get_role(guess)
    result = "✅ CORRECT" if role == 'blue' else ("❌ OPPONENT" if role == 'red' else "⚠️ " + role.upper())
    print(f"🔍 Guess \"{guess}\": {role.upper()} → {result}")
    # A guess that is not one of the team's own words ends the turn, so the
    # remaining guesses must not be revealed.
    if role != 'blue':
        break


🧩 Game Board (Secret Roles Shown Below - Normally Hidden):
gobbler: assassin
profusion: neutral
jimmy: red
releasing: blue
pericardial: red
tea: blue
ascendent: red
investiture: neutral
colic: blue
decapoda: red
valorous: red
biography: blue
larch: blue
roble: neutral
mimic: red
nonconformist: blue
algeria: blue
bikini: red
rapture: red
paspalum: neutral
slickly: neutral
bosomy: neutral
caldron: blue
grapple: blue
willebrand: neutral

🧠 Clue for BLUE team: "issue" with number 2

🎯 Guesser's guesses (up to 2): ['releasing', 'biography']
🔍 Guess "releasing": BLUE → ✅ CORRECT
🔍 Guess "biography": BLUE → ✅ CORRECT


Preprocessing: Loading GloVe Embeddings


In [None]:
import numpy as np

# Load the complete GloVe table (every word, not only the relation vocabulary).
embeddings = {}
glove_path = "glove/glove.6B.100d.txt"
expected_fields = None  # token + vector components, taken from the first valid line
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split()
        # Skip blank or truncated lines instead of failing on parts[0] or
        # storing vectors of inconsistent length.
        if len(parts) < 2:
            continue
        if expected_fields is None:
            expected_fields = len(parts)
        elif len(parts) != expected_fields:
            continue
        word = parts[0]
        vector = np.array(parts[1:], dtype=float)
        embeddings[word] = vector

print(f"Loaded {len(embeddings)} word vectors.")



Loaded 400000 word vectors.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def cosine_sim(vec1, vec2):
    """
    Cosine similarity of two vectors.

    Both inputs are converted to arrays and flattened to 1-D, then handed to
    scikit-learn's ``cosine_similarity`` as single-row matrices. The single
    entry of the resulting 1x1 matrix is returned.
    """
    row_a = np.asarray(vec1).flatten()[np.newaxis, :]
    row_b = np.asarray(vec2).flatten()[np.newaxis, :]
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]


In [None]:
class Guesser:
    """Ranks unrevealed board words by cosine similarity to a clue word."""

    def __init__(self, embeddings):
        self.embeddings = embeddings  # dictionary of word -> vector

    def predict_guesses(self, clue_word, clue_number, board):
        """Return up to ``clue_number`` unrevealed board words closest to the clue.

        Args:
            clue_word: the clue; looked up lower-cased in the embeddings.
            clue_number: maximum number of guesses; values below 1 give ``[]``.
            board: object exposing ``remaining_words()`` (as ``Board`` in this
                notebook does) or ``get_unrevealed_words()``.

        Raises:
            ValueError: if the clue word has no embedding.
        """
        key = clue_word.lower()
        if key not in self.embeddings:
            raise ValueError(f"Clue word '{clue_word}' not found in embeddings.")
        clue_vec = self.embeddings[key]

        # Board defines remaining_words(), not get_unrevealed_words(); accept either.
        list_unrevealed = getattr(board, "get_unrevealed_words", None) or board.remaining_words
        unrevealed_words = list_unrevealed()

        similarities = []
        for word in unrevealed_words:
            vec = self.embeddings.get(word.lower())
            if vec is None:
                continue
            #  cosine similarity between clue and this board word
            score = cosine_sim(clue_vec, vec)
            similarities.append((word, score))
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Clamp to [0, len] so a negative number cannot slice from the end.
        top_n = max(0, min(clue_number, len(similarities)))
        top_guesses = [word for word, score in similarities[:top_n]]
        return top_guesses


In [None]:
def show_board(board):
    """
    Print a numbered list of the board's unrevealed words, followed by a
    30-character dashed rule. Roles are not shown.
    """
    print("\nCurrent Board Words:")
    i = 0
    for word, already_revealed in board.revealed.items():
        if already_revealed:
            continue
        i += 1
        print(f"{i:2d}. {word}")
    print("-" * 30)

board = game_board  # or board = test_board
show_board(board)

# Read a clue of the form "<word> <number>" from the human spymaster.
clue_input = input("Enter your clue (format: word number): ").strip()
if not clue_input:
    raise RuntimeError("No clue provided.")
parts = clue_input.split()
# The guesser looks the clue up as typed, so normalise the case here;
# otherwise "Drink" would silently produce no guesses.
clue_word = parts[0].lower()
try:
    clue_number = int(parts[1])
    if clue_number < 0:
        raise ValueError("clue number must not be negative")
except (IndexError, ValueError) as exc:
    raise ValueError("Clue format invalid. Please provide a word and a number (e.g., 'tree 2').") from exc

guessed_words = guesser.guess_words(clue_word, clue_number, board)
print("AI guesses:", guessed_words)

for guess in guessed_words:
    board.reveal_word(guess)
    role = board.get_role(guess)
    print(f"Guessed '{guess}' -> {role.upper()}")
    # NOTE(review): assumes the human is giving clues for the blue team, as in
    # the earlier demo cell. Any non-blue word (red, neutral or assassin)
    # ends the turn.
    if role != "blue":
        print("❌ End of turn!")
        break



Current Board Words:
 1. gobbler
 2. profusion
 3. pericardial
 4. tea
 5. ascendent
 6. investiture
 7. colic
 8. decapoda
 9. valorous
10. larch
11. roble
12. mimic
13. nonconformist
14. algeria
15. bikini
16. rapture
17. paspalum
18. slickly
19. bosomy
20. caldron
21. grapple
22. willebrand
------------------------------
Enter your clue (format: word number): drink 1
AI guesses: ['tea']
Guessed 'tea' -> BLUE
