# Cosine similarity testing for pre-trained text to word embedding model
Checks similarity scores after training the GloVe model. Enter 1 for English or 2 for Hindi, then enter a word. By default, the 30k datasets are used for both English and Hindi.


In [4]:
import numpy as np
import pickle

def load_glove_model(model_dir):
    """Load a trained GloVe model stored under ``model_dir``.

    Reads ``embeddings.npy`` with NumPy, then the pickle files
    ``word_to_id.pkl``, ``id_to_word.pkl`` and ``metadata.pkl``, in that order.

    Args:
        model_dir: Directory that contains the four model files.

    Returns:
        A ``(embeddings, word_to_id, id_to_word, metadata)`` tuple.
    """
    def _unpickle(path):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    vectors = np.load(f"{model_dir}/embeddings.npy")
    vocab = _unpickle(f"{model_dir}/word_to_id.pkl")
    reverse_vocab = _unpickle(f"{model_dir}/id_to_word.pkl")
    info = _unpickle(f"{model_dir}/metadata.pkl")
    return vectors, vocab, reverse_vocab, info

# Ask which pre-trained GloVe model to inspect.
print("Select GloVe model:")
print("1. English (30k dataset)")
print("2. Hindi   (30k dataset)")

# Empty input falls back to "1"; any answer other than "2" selects English.
choice = input("Enter choice (1 or 2, default=1): ").strip() or "1"
if choice == "2":
    model_dir = "embeddings/glove_model_hindi_30k"
    test_words = ["प्यार", "स्कूल", "फिल्म", "अच्छा", "खराब", "नायक"]
else:
    model_dir = "wordembeddings/glove_model_30k"
    test_words = ["action", "love", "comedy", "school", "romance",
                  "movie", "film", "good", "bad", "great", "fight"]

try:
    embeddings, word_to_id, id_to_word, metadata = load_glove_model(model_dir)
    print(f"\nModel loaded: {model_dir}")
    print(f"Vocabulary size: {len(word_to_id)}")
    print(f"Embedding dimension: {embeddings.shape[1]}")
except FileNotFoundError:
    print(f"Error: Model directory '{model_dir}' not found.")
    # `exit` is a helper injected by the `site` module for interactive use;
    # raising SystemExit ends the program with status 1 without relying on it.
    raise SystemExit(1)

def most_similar_cpu(word, embeddings, word_to_id, id_to_word, top_n=10):
    """Print the ``top_n`` vocabulary words closest to ``word`` by cosine similarity.

    Args:
        word: Query word; it must be a key of ``word_to_id``.
        embeddings: 2-D array with one embedding row per vocabulary id.
        word_to_id: Mapping from word to its row index in ``embeddings``.
        id_to_word: Mapping from row index back to the word.
        top_n: Maximum number of neighbours to print.

    Returns:
        None. The ranking (or a "not in vocabulary" notice) is printed.
    """
    if word not in word_to_id:
        print(f"'{word}' not in vocabulary")
        return
    word_idx = word_to_id[word]
    target_vec = embeddings[word_idx]

    norms = np.sqrt(np.sum(embeddings * embeddings, axis=1))
    target_norm = np.sqrt(np.dot(target_vec, target_vec))
    # Substitute a tiny norm for all-zero vectors so the division stays finite.
    norms = np.where(norms == 0, 1e-8, norms)
    target_norm = target_norm if target_norm != 0 else 1e-8
    similarities = np.dot(embeddings, target_vec) / (norms * target_norm)

    # Drop the query word from the ranking instead of overwriting its score,
    # so it can never be listed as its own neighbour (even when top_n is at
    # least as large as the vocabulary).
    order = np.argsort(similarities)[::-1]
    top_indices = order[order != word_idx][:top_n]

    print(f"\nMost similar words to '{word}':")
    for i, idx in enumerate(top_indices, 1):
        print(f"{i}. {id_to_word[idx]}: {similarities[idx]:.4f}")

def test_word_similarity(word, embeddings, word_to_id, id_to_word, top_n=5):
    """Announce ``word`` and print its nearest neighbours via ``most_similar_cpu``."""
    banner = f"\nTesting similarity for: {word}"
    print(banner)
    most_similar_cpu(word, embeddings, word_to_id, id_to_word, top_n=top_n)

def show_vocabulary_sample(vocab=None, sample_size=20):
    """Print the first ``sample_size`` vocabulary words and how many remain.

    Args:
        vocab: Mapping whose keys are the vocabulary words. When omitted, the
            module-level ``word_to_id`` is used.
        sample_size: Number of leading words to show.
    """
    from itertools import islice

    if vocab is None:
        vocab = word_to_id
    print(f"\nVocabulary sample (first {sample_size} words):")
    # islice takes the leading keys without copying the whole key list.
    sample_words = list(islice(vocab, sample_size))
    for word in sample_words:
        print(word, end=" ")
    # Subtract what was actually shown so a vocabulary smaller than the
    # sample never reports a negative remainder.
    remaining = len(vocab) - len(sample_words)
    print(f"\n... and {remaining} more words")

show_vocabulary_sample()

# Run the predefined probe words for the selected language first.
print("\nTesting predefined words:")
for word in test_words:
    test_word_similarity(word, embeddings, word_to_id, id_to_word, top_n=5)

# Then let the user query words until they submit an empty line.
print("\nInteractive mode (press Enter to exit):")
while True:
    try:
        user_word = input("Enter a word: ").strip()
        if not user_word:
            break
        test_word_similarity(user_word, embeddings, word_to_id, id_to_word, top_n=5)
    except (KeyboardInterrupt, EOFError):
        # input() raises EOFError once stdin is closed or exhausted; treating
        # it like Ctrl-C stops the loop instead of reporting the same error
        # forever through the generic handler below.
        print("\nExiting...")
        break
    except Exception as e:
        print(f"Error: {e}")


Select GloVe model:
1. English (30k dataset)
2. Hindi   (30k dataset)

Model loaded: wordembeddings/glove_model_30k
Vocabulary size: 120821
Embedding dimension: 300

Vocabulary sample (first 20 words):
finis instill hyatt sneezed jari naki dragoons maharani embellishing noodleman batasi retracing enhancing stillo cobain kinema muska inept xing technodrome 
... and 120801 more words

Testing predefined words:

Testing similarity for: action

Most similar words to 'action':
1. devta: 0.7613
2. galvanised: 0.6599
3. hiraasat: 0.6404
4. taqdeer: 0.6271
5. rangbaaz: 0.6222

Testing similarity for: love

Most similar words to 'love':
1. falls: 0.6686
2. fall: 0.6582
3. married: 0.6247
4. marry: 0.6217
5. loves: 0.6194

Testing similarity for: comedy

Most similar words to 'comedy':
1. errors: 0.8015
2. slapstick: 0.7223
3. screwball: 0.6545
4. frothy: 0.6169
5. drama: 0.5875

Testing similarity for: school

Most similar words to 'school':
1. sohoku: 0.7582
2. high: 0.6997
3. teacher: 0.6747


# Classifier testing with pre-trained model for classification 
Custom genre prediction with the trained models. Enter 1 for English or 2 for Hindi, then enter the text. By default, the 30k datasets are used for both Hindi and English.



In [None]:
#!/usr/bin/env python3
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

# Pick the compute device: MPS when available, otherwise CUDA, otherwise CPU.
# getattr keeps the lookup from failing on torch builds without an MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

class TextCNN(nn.Module):
    """Convolutional text classifier.

    Token ids are embedded, passed through one ``Conv1d`` per filter size,
    max-pooled over the sequence axis, concatenated, and mapped to class
    logits by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        num_classes: Number of output logits.
        pretrained_embeddings: Optional NumPy array copied into the embedding
            weights.
        filter_sizes: Kernel sizes, one convolution per entry.
        num_filters: Output channels of each convolution.
        dropout: Dropout probability applied before the final linear layer.
        max_seq_len: Accepted for interface compatibility; not used by any
            layer in this class.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, pretrained_embeddings=None, filter_sizes=[3,4,5], num_filters=128, dropout=0.5, max_seq_len=600):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        # The default list is only iterated here, never mutated.
        self.convs = nn.ModuleList([nn.Conv1d(embedding_dim, num_filters, kernel_size=fs) for fs in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Return class logits for ``x``, a batch of token-id sequences.

        ``x`` is expected as ``(batch, seq_len)``; the sequence must be at
        least as long as the largest filter size for the convolutions to apply.
        """
        # Conv1d wants channels first: (batch, embedding_dim, seq_len).
        embedded = self.embedding(x).transpose(1, 2)
        pooled_features = []
        for conv in self.convs:
            # Run each convolution once and reuse the result for both the
            # activation and the pooling window length.
            activated = F.relu(conv(embedded))
            pooled = F.max_pool1d(activated, kernel_size=activated.size(2))
            pooled_features.append(pooled.squeeze(2))
        features = torch.cat(pooled_features, dim=1)
        features = self.dropout(features)
        return self.fc(features)

class CNNGenreClassifier:
    """Genre predictor built from a pickled ``TextCNN`` checkpoint and a GloVe model.

    The checkpoint is a pickled dict; this class reads its ``vocab_size``,
    ``embedding_dim``, ``num_classes``, ``filter_sizes``, ``num_filters``,
    ``dropout``, ``max_seq_len``, ``model_state`` and ``idx_to_label`` entries.
    """

    def __init__(self, model_path, glove_model_dir):
        """Load the GloVe files and the checkpoint, then build the model in eval mode."""
        self.device = device
        self.embeddings, self.word_to_id, self.id_to_word, self.metadata = self.load_glove_model(glove_model_dir)
        self.model_data = self.load_cnn_model(model_path)
        self.model = self.build_model()
        self.model.eval()

    @staticmethod
    def _read_pickle(path):
        """Return the object stored in the pickle file at ``path``."""
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    def load_glove_model(self, model_dir):
        """Return ``(embeddings, word_to_id, id_to_word, metadata)`` read from ``model_dir``."""
        embeddings = np.load(f"{model_dir}/embeddings.npy")
        word_to_id = self._read_pickle(f"{model_dir}/word_to_id.pkl")
        id_to_word = self._read_pickle(f"{model_dir}/id_to_word.pkl")
        metadata = self._read_pickle(f"{model_dir}/metadata.pkl")
        return embeddings, word_to_id, id_to_word, metadata

    def load_cnn_model(self, model_path):
        """Return the pickled checkpoint dict stored at ``model_path``."""
        return self._read_pickle(model_path)

    def build_model(self):
        """Instantiate ``TextCNN`` from the checkpoint settings and load its weights."""
        model = TextCNN(
            vocab_size=self.model_data['vocab_size'],
            embedding_dim=self.model_data['embedding_dim'],
            num_classes=self.model_data['num_classes'],
            pretrained_embeddings=self.embeddings,
            filter_sizes=self.model_data['filter_sizes'],
            num_filters=self.model_data['num_filters'],
            dropout=self.model_data['dropout'],
            max_seq_len=self.model_data['max_seq_len']
        ).to(self.device)
        model.load_state_dict(self.model_data['model_state'])
        return model

    def text_to_indices(self, text):
        """Lower-case and whitespace-split ``text`` into vocabulary ids.

        Unknown words map to 0 and the result is truncated to ``max_seq_len``.
        """
        tokens = text.lower().split()
        indices = [self.word_to_id.get(w, 0) for w in tokens]
        return indices[:self.model_data['max_seq_len']]

    def predict_genre(self, text, top_k=3):
        """Classify ``text``, print a ranked report, and return the predictions.

        Args:
            text: Plot description to classify.
            top_k: Number of highest-probability genres to report.

        Returns:
            A dict with ``predicted_genre``, ``confidence`` and
            ``top_predictions`` (a list of dicts with ``rank``, ``genre``,
            ``probability`` and ``confidence_percent``).
        """
        max_len = self.model_data['max_seq_len']
        indices = self.text_to_indices(text)
        # Right-pad with 0 so the model always receives max_seq_len tokens.
        indices = indices + [0] * (max_len - len(indices))
        input_tensor = torch.tensor([indices], dtype=torch.long).to(self.device)

        with torch.no_grad():
            output = self.model(input_tensor)
            probabilities = F.softmax(output, dim=1)
        probs_np = probabilities.cpu().numpy()[0]
        top_indices = np.argsort(probs_np)[::-1][:top_k]

        idx_to_label = self.model_data['idx_to_label']
        results = {
            'predicted_genre': idx_to_label[top_indices[0]],
            'confidence': float(probs_np[top_indices[0]]),
            'top_predictions': [],
        }
        for rank, idx in enumerate(top_indices, start=1):
            prob = float(probs_np[idx])
            results['top_predictions'].append({
                'rank': rank,
                'genre': idx_to_label[idx],
                'probability': prob,
                'confidence_percent': prob * 100,
            })

        print(f"\nInput text length: {len(text.split())} words")
        print(f"Predicted genre: {results['predicted_genre']} (Confidence: {results['confidence']*100:.2f}%)")
        print(f"\nTop {top_k} predictions:")
        for pred in results['top_predictions']:
            # 30-character bar, filled in proportion to the probability.
            bar_length = int(pred['probability'] * 30)
            bar = '█' * bar_length + '░' * (30 - bar_length)
            print(f"  {pred['rank']}. {pred['genre']:<20} {bar} {pred['confidence_percent']:6.2f}%")
        return results

    def interactive_mode(self):
        """Prompt for plots and print predictions until the user quits.

        The loop ends on ``quit``/``exit``/``q``, Ctrl-C, or end of input.
        Empty lines are ignored and plots under five words are rejected.
        """
        print("\nINTERACTIVE GENRE PREDICTION MODE")
        while True:
            try:
                text = input("Enter movie plot: ").strip()
                if text.lower() in ('quit', 'exit', 'q'):
                    break
                if not text:
                    continue
                if len(text.split()) < 5:
                    # Explain the skip instead of silently re-prompting.
                    print("Please enter at least 5 words.")
                    continue
                self.predict_genre(text, top_k=5)
                print()
            except (KeyboardInterrupt, EOFError):
                # input() raises EOFError once stdin is closed; stop here
                # rather than letting the generic handler loop forever.
                break
            except Exception as e:
                print(f"Error: {e}")

def select_language_and_run():
    """Ask for a language, verify its model files exist, then start the interactive classifier.

    Prints a notice and returns early for an unrecognised choice or when
    either the checkpoint file or the GloVe directory is missing.
    """
    print("Select language:\n1. English\n2. Hindi")
    choice = input("Enter 1 or 2: ").strip()

    # menu choice -> (pickled CNN checkpoint, GloVe model directory)
    paths_by_choice = {
        '1': ("models/500itrenglish30k_cnn.pkl", "wordembeddings/glove_model_30k"),
        '2': ("models/hindi30k_cnn.pkl", "embeddings/glove_model_hindi_30k"),
    }
    selected = paths_by_choice.get(choice)
    if selected is None:
        print("Invalid choice")
        return

    model_path, glove_dir = selected
    if not (os.path.exists(model_path) and os.path.exists(glove_dir)):
        print("Model or embedding path not found")
        return

    CNNGenreClassifier(model_path, glove_dir).interactive_mode()


if __name__ == "__main__":
    select_language_and_run()


Select language:
1. English
2. Hindi

INTERACTIVE GENRE PREDICTION MODE

Input text length: 7 words
Predicted genre: Drama (Confidence: 12.88%)

Top 5 predictions:
  1. Drama                ███░░░░░░░░░░░░░░░░░░░░░░░░░░░  12.88%
  2. Mystery              ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░   7.94%
  3. Thriller             ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░   7.75%
  4. Western              ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░   7.61%
  5. Crime                ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░   7.15%


Input text length: 36 words
Predicted genre: Crime (Confidence: 67.86%)

Top 5 predictions:
  1. Crime                ████████████████████░░░░░░░░░░  67.86%
  2. Western              ████░░░░░░░░░░░░░░░░░░░░░░░░░░  14.07%
  3. Action               ████░░░░░░░░░░░░░░░░░░░░░░░░░░  13.63%
  4. Thriller             ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░   1.66%
  5. Horror               ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░   0.83%


Input text length: 29 words
Predicted genre: Comedy (Confidence: 88.28%)

Top 5 prediction

In [2]:
#!/usr/bin/env python3
import numpy as np
import pickle
import os

class WordAnalogyTester:
    """Answer ``a - b + c`` word analogies over unit-normalised GloVe embeddings."""

    def __init__(self, glove_model_dir):
        """Load the GloVe files from ``glove_model_dir`` and normalise every row."""
        self.embeddings, self.word_to_id, self.id_to_word, self.metadata = self.load_glove_model(glove_model_dir)
        norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
        # An all-zero row has norm 0; dividing by it yields NaN, and NaN sorts
        # to the front of the descending ranking in analogy(). Dividing such
        # rows by 1 leaves them as zeros with similarity 0.
        norms[norms == 0] = 1.0
        self.embeddings = self.embeddings / norms

    def load_glove_model(self, model_dir):
        """Return ``(embeddings, word_to_id, id_to_word, metadata)`` read from ``model_dir``."""
        embeddings = np.load(f"{model_dir}/embeddings.npy")
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(f"{model_dir}/word_to_id.pkl", "rb") as f:
            word_to_id = pickle.load(f)
        with open(f"{model_dir}/id_to_word.pkl", "rb") as f:
            id_to_word = pickle.load(f)
        with open(f"{model_dir}/metadata.pkl", "rb") as f:
            metadata = pickle.load(f)
        return embeddings, word_to_id, id_to_word, metadata

    def analogy(self, a, b, c, top_k=5):
        """Rank vocabulary words by cosine similarity to ``a - b + c``.

        Args:
            a, b, c: Words combined as ``vec(a) - vec(b) + vec(c)``.
            top_k: Number of results to return.

        Returns:
            ``None`` if any of the three words is missing from the vocabulary,
            otherwise a list of ``(word, similarity)`` tuples in descending
            similarity order, with ``a``, ``b`` and ``c`` themselves excluded.
        """
        query_words = (a, b, c)
        if any(w not in self.word_to_id for w in query_words):
            return None
        vec_a, vec_b, vec_c = (self.embeddings[self.word_to_id[w]] for w in query_words)
        vec = vec_a - vec_b + vec_c
        norm = np.linalg.norm(vec)
        # Leave a zero query vector unscaled instead of dividing by zero.
        if norm > 0:
            vec = vec / norm
        sims = np.dot(self.embeddings, vec)
        results = []
        for idx in sims.argsort()[::-1]:
            word = self.id_to_word[idx]
            if word in query_words:
                continue
            results.append((word, float(sims[idx])))
            if len(results) >= top_k:
                break
        return results

def run_tests(lang):
    """Run the built-in analogy examples for ``lang`` and print the results.

    ``lang`` is ``"english"`` or ``"hindi"``; any other value returns
    immediately without output. For each ``(a, b, c)`` example the top five
    answers to ``a - b + c`` are printed, or a notice when nothing came back.
    """
    if lang == "english":
        glove_dir = "wordembeddings/glove_model_30k"
        examples = [("king","man","woman"),("paris","france","italy"),("walking","walked","swam")]
    elif lang == "hindi":
        glove_dir = "embeddings/glove_model_hindi_30k"
        examples = [("राजा","पुरुष","महिला"),("दिल्ली","भारत","पाकिस्तान"),("लड़का","आदमी","महिला")]
    else:
        return

    tester = WordAnalogyTester(glove_dir)
    print(f"\n--- {lang.upper()} ANALOGY TESTS ---")
    for a, b, c in examples:
        neighbours = tester.analogy(a, b, c, top_k=5)
        if not neighbours:
            print(f"Words not found for {a},{b},{c}")
            continue
        print(f"{a} - {b} + {c} = ?")
        for candidate, score in neighbours:
            print(f"  {candidate} ({score:.4f})")


if __name__ == "__main__":
    run_tests("english")
    run_tests("hindi")



--- ENGLISH ANALOGY TESTS ---
king - man + woman = ?
  silvergon (0.6332)
  mudbeard (0.6270)
  ahasuerus (0.6134)
  gurumes (0.6111)
  queen (0.5938)
paris - france + italy = ?
  siena (0.4658)
  travel (0.4456)
  travels (0.4335)
  venice (0.4293)
  europe (0.3896)
walking - walked + swam = ?
  street (0.4335)
  across (0.4271)
  tramline (0.4113)
  walk (0.3881)
  spots (0.3701)

--- HINDI ANALOGY TESTS ---
राजा - पुरुष + महिला = ?
  किस (0.4186)
  महाराजा (0.3901)
  साझा (0.3728)
  थूका (0.3726)
  होमपेज (0.3674)
दिल्ली - भारत + पाकिस्तान = ?
  भजनपुरा (0.5469)
  एनसीआर (0.5383)
  भोगल (0.4794)
  डीयू (0.4322)
  रंगला (0.4301)
लड़का - आदमी + महिला = ?
  मुखियाओं (0.5530)
  प्रशिक्षु (0.4798)
  आया (0.4351)
  कॉर्निया (0.4234)
  बॉयकॉट (0.4205)
