In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

In [None]:
import csv
import re
import math
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# -----------------------
# Step 1: Dataset Loader
# -----------------------
def load_dataset(filepath):
    data = []
    current_label = None
    current_text = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith("'") and line.count("','") >= 1:
                if current_label is not None:
                    data.append((current_label, " ".join(current_text)))
                parts = line.split("','", 1)
                current_label = parts[0].strip("'")
                current_text = [parts[1].rstrip("'")] if len(parts) > 1 else []
            else:
                current_text.append(line)
        if current_label is not None:
            data.append((current_label, " ".join(current_text)))
    return zip(*data)  # returns (labels, texts)

def tokenize_text(text):
    return re.findall(r"\b\w+\b", str(text).lower())

# -----------------------
# Step 2: N-gram Language Model (Better Version)
# -----------------------
class NGramLanguageModel:
    def __init__(self, n=2):
        self.n = n
        self.ngram_counts = defaultdict(int)
        self.context_counts = defaultdict(int)
        self.vocab = set()

    def train(self, texts):
        for s in texts:
            tokens = ["<s>"] + tokenize_text(s) + ["</s>"]
            self.vocab.update(tokens)
            for i in range(len(tokens) - self.n + 1):
                ngram = tuple(tokens[i:i+self.n])
                context = ngram[:-1]
                self.ngram_counts[ngram] += 1
                self.context_counts[context] += 1

    def prob(self, ngram):
        context = ngram[:-1]
        V = len(self.vocab)
        return (self.ngram_counts[ngram] + 1) / (self.context_counts[context] + V)

    def perplexity(self, texts):
        N, log_prob_sum = 0, 0.0
        V = len(self.vocab)
        for s in texts:
            tokens = ["<s>"] * (self.n - 1) + tokenize_text(s) + ["</s>"]
            for i in range(len(tokens) - self.n + 1):
                ngram = tuple(tokens[i:i+self.n])
                p = self.prob(ngram)
                log_prob_sum += math.log(p, 2)
                N += 1
        return math.pow(2, -log_prob_sum / N)

# -----------------------
# Step 3: Extrinsic Classification
# -----------------------
def extrinsic_classification(train_texts, train_labels, test_texts, test_labels, n=1, method="nb"):
    # Join tokens into sentences
    train_sentences = [" ".join(tokenize_text(t)) for t in train_texts]
    test_sentences = [" ".join(tokenize_text(t)) for t in test_texts]

    vectorizer = TfidfVectorizer(ngram_range=(1, n), max_features=8000)
    X_train = vectorizer.fit_transform(train_sentences)
    X_test = vectorizer.transform(test_sentences)

    if method == "nb":
        clf = MultinomialNB()
    else:
        clf = LinearSVC(class_weight="balanced")

    clf.fit(X_train, train_labels)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(test_labels, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        test_labels, y_pred, average="weighted", zero_division=0
    )
    return acc, precision, recall, f1

# -----------------------
# Step 4: Run Experiments
# -----------------------
def run_experiment(train_path, test_path, lang="English", method="nb"):
    print(f"\n===== {lang} Dataset =====")
    train_labels, train_texts = load_dataset(train_path)
    test_labels, test_texts = load_dataset(test_path)

    results = []
    for n in [1, 2, 3]:
        print(f"\n=== Training {n}-gram Model ===")
        model = NGramLanguageModel(n=n)
        model.train(train_texts)
        ppl = model.perplexity(test_texts)
        print(f"Perplexity: {ppl:.2f}")

        acc, precision, recall, f1 = extrinsic_classification(
            train_texts, train_labels, test_texts, test_labels, n=n, method=method
        )
        print(f"Classification - Acc: {acc:.3f}, Prec: {precision:.3f}, Rec: {recall:.3f}, F1: {f1:.3f}")

        results.append({
            "n": n,
            "perplexity": ppl,
            "accuracy": acc,
            "precision": precision,
            "recall": recall,
            "f1": f1
        })
    return results

# -----------------------
# Step 5: Paths & Execution
# -----------------------
eng_train = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_15000.txt"
eng_test = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_test.txt"

hin_train = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_15000.txt"
hin_test = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_test.txt"

eng_results = run_experiment(eng_train, eng_test, lang="English", method="nb")
hin_results = run_experiment(hin_train, hin_test, lang="Hindi", method="nb")

print("\nFinal English Results:", eng_results)
print("\nFinal Hindi Results:", hin_results)

# Step 5: Paths & Execution
# -----------------------
eng_train = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_30000.txt"
eng_test = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_test.txt"

hin_train = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_30000.txt"
hin_test = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_test.txt"

eng_results = run_experiment(eng_train, eng_test, lang="English", method="nb")
hin_results = run_experiment(hin_train, hin_test, lang="Hindi", method="nb")

print("\nFinal English Results:", eng_results)
print("\nFinal Hindi Results:", hin_results)


===== English Dataset =====

=== Training 1-gram Model ===
Perplexity: 1546.14
Classification - Acc: 0.385, Prec: 0.345, Rec: 0.385, F1: 0.267

=== Training 2-gram Model ===
Perplexity: 3188.66
Classification - Acc: 0.397, Prec: 0.374, Rec: 0.397, F1: 0.289

=== Training 3-gram Model ===
Perplexity: 18522.60
Classification - Acc: 0.397, Prec: 0.376, Rec: 0.397, F1: 0.291

===== Hindi Dataset =====

=== Training 1-gram Model ===
Perplexity: 96.20
Classification - Acc: 0.561, Prec: 0.557, Rec: 0.561, F1: 0.501

=== Training 2-gram Model ===
Perplexity: 63.13
Classification - Acc: 0.612, Prec: 0.662, Rec: 0.612, F1: 0.570

=== Training 3-gram Model ===
Perplexity: 127.02
Classification - Acc: 0.609, Prec: 0.661, Rec: 0.609, F1: 0.568

Final English Results: [{'n': 1, 'perplexity': 1546.1389210910268, 'accuracy': 0.38467847769028873, 'precision': 0.34457367014386314, 'recall': 0.38467847769028873, 'f1': 0.26680545877976924}, {'n': 2, 'perplexity': 3188.6590699666626, 'accuracy': 0.3971456

In [None]:
import csv
import re
import math
import pickle
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# -----------------------
# Step 1: Dataset Loader
# -----------------------
def load_dataset(filepath):
    data = []
    current_label = None
    current_text = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith("'") and line.count("','") >= 1:
                if current_label is not None:
                    data.append((current_label, " ".join(current_text)))
                parts = line.split("','", 1)
                current_label = parts[0].strip("'")
                current_text = [parts[1].rstrip("'")] if len(parts) > 1 else []
            else:
                current_text.append(line)
        if current_label is not None:
            data.append((current_label, " ".join(current_text)))
    return zip(*data)  # returns (labels, texts)

def tokenize_text(text):
    return re.findall(r"\b\w+\b", str(text).lower())

# -----------------------
# Step 2: N-gram Language Model
# -----------------------
class NGramLanguageModel:
    def __init__(self, n=2):
        self.n = n
        self.ngram_counts = defaultdict(int)
        self.context_counts = defaultdict(int)
        self.vocab = set()

    def train(self, texts):
        for s in texts:
            tokens = ["<s>"] + tokenize_text(s) + ["</s>"]
            self.vocab.update(tokens)
            for i in range(len(tokens) - self.n + 1):
                ngram = tuple(tokens[i:i+self.n])
                context = ngram[:-1]
                self.ngram_counts[ngram] += 1
                self.context_counts[context] += 1

    def prob(self, ngram):
        context = ngram[:-1]
        V = len(self.vocab)
        return (self.ngram_counts[ngram] + 1) / (self.context_counts[context] + V)

    def perplexity(self, texts):
        N, log_prob_sum = 0, 0.0
        for s in texts:
            tokens = ["<s>"] * (self.n - 1) + tokenize_text(s) + ["</s>"]
            for i in range(len(tokens) - self.n + 1):
                ngram = tuple(tokens[i:i+self.n])
                p = self.prob(ngram)
                log_prob_sum += math.log(p, 2)
                N += 1
        return math.pow(2, -log_prob_sum / N)

# -----------------------
# Step 3: Save/Load Models
# -----------------------
def save_model(obj, filepath):
    with open(filepath, "wb") as f:
        pickle.dump(obj, f)

def load_model(filepath):
    with open(filepath, "rb") as f:
        return pickle.load(f)

# -----------------------
# Step 4: Train & Save Bigram + Classifier
# -----------------------
def train_and_save(train_path, test_path, lang, size, method="nb"):
    print(f"\n===== Training {lang} {size} Dataset (Bigram) =====")
    train_labels, train_texts = load_dataset(train_path)
    test_labels, test_texts = load_dataset(test_path)

    # Train N-gram LM
    lm = NGramLanguageModel(n=2)
    lm.train(train_texts)
    ppl = lm.perplexity(test_texts)

    # Extrinsic classification
    train_sentences = [" ".join(tokenize_text(t)) for t in train_texts]
    test_sentences = [" ".join(tokenize_text(t)) for t in test_texts]

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=8000)
    X_train = vectorizer.fit_transform(train_sentences)
    X_test = vectorizer.transform(test_sentences)

    if method == "nb":
        clf = MultinomialNB()
    else:
        clf = LinearSVC(class_weight="balanced")

    clf.fit(X_train, train_labels)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(test_labels, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        test_labels, y_pred, average="weighted", zero_division=0
    )

    print(f"Perplexity: {ppl:.2f}")
    print(f"Classification - Acc: {acc:.3f}, Prec: {precision:.3f}, Rec: {recall:.3f}, F1: {f1:.3f}")

    # ✅ Save into your folder
    save_path = f"/content/drive/MyDrive/NLP_Assignment1/Bigram_Models/ngram_{lang.lower()}_{size}.pkl"
    save_model({
        "lm": lm,
        "vectorizer": vectorizer,
        "classifier": clf,
        "labels": sorted(list(set(train_labels)))
    }, save_path)
    print(f"Model saved at {save_path}")

    return {
        "lang": lang, "size": size,
        "perplexity": ppl, "accuracy": acc,
        "precision": precision, "recall": recall, "f1": f1
    }


# -----------------------
# Step 5: Predict Genre (Top-k)
# -----------------------
def predict_genre(paragraph, model_path, top_k=3):
    saved = load_model(model_path)
    vectorizer = saved["vectorizer"]
    clf = saved["classifier"]

    tokens = " ".join(tokenize_text(paragraph))
    X = vectorizer.transform([tokens])

    if hasattr(clf, "predict_proba"):  # works for Naive Bayes
        probs = clf.predict_proba(X)[0]
        classes = clf.classes_
    else:  # fallback for models without predict_proba (like LinearSVC)
        if hasattr(clf, "decision_function"):
            scores = clf.decision_function(X)
            if scores.ndim == 1:
                probs = np.exp(scores) / np.sum(np.exp(scores))
            else:
                probs = np.exp(scores[0]) / np.sum(np.exp(scores[0]))
            classes = clf.classes_
        else:
            return [(clf.predict(X)[0], 1.0)]  # single prediction only

    top_idx = np.argsort(probs)[::-1][:top_k]
    return [(classes[i], float(probs[i])) for i in top_idx]

# -----------------------
# Example Execution
# -----------------------
if __name__ == "__main__":
    # Paths
    eng_small = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_2500.txt"
    eng_med   = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_15000.txt"
    eng_large = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_30000.txt"
    eng_test  = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_test.txt"

    hin_small = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_2500.txt"
    hin_med   = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_15000.txt"
    hin_large = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_30000.txt"
    hin_test  = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_test.txt"

    results = []
    results.append(train_and_save(eng_small, eng_test, "English", "small"))
    results.append(train_and_save(eng_med, eng_test, "English", "medium"))
    results.append(train_and_save(eng_large, eng_test, "English", "large"))

    results.append(train_and_save(hin_small, hin_test, "Hindi", "small"))
    results.append(train_and_save(hin_med, hin_test, "Hindi", "medium"))
    results.append(train_and_save(hin_large, hin_test, "Hindi", "large"))

    print("\nFinal Results:", results)




===== Training English small Dataset (Bigram) =====
Perplexity: 5036.33
Classification - Acc: 0.218, Prec: 0.138, Rec: 0.218, F1: 0.096
Model saved at /content/drive/MyDrive/NLP_Assignment1/Bigram_Models/ngram_english_small.pkl

===== Training English medium Dataset (Bigram) =====
Perplexity: 3188.66
Classification - Acc: 0.397, Prec: 0.374, Rec: 0.397, F1: 0.289
Model saved at /content/drive/MyDrive/NLP_Assignment1/Bigram_Models/ngram_english_medium.pkl

===== Training English large Dataset (Bigram) =====
Perplexity: 3366.53
Classification - Acc: 0.415, Prec: 0.338, Rec: 0.415, F1: 0.317
Model saved at /content/drive/MyDrive/NLP_Assignment1/Bigram_Models/ngram_english_large.pkl

===== Training Hindi small Dataset (Bigram) =====
Perplexity: 99.37
Classification - Acc: 0.298, Prec: 0.544, Rec: 0.298, F1: 0.282
Model saved at /content/drive/MyDrive/NLP_Assignment1/Bigram_Models/ngram_hindi_small.pkl

===== Training Hindi medium Dataset (Bigram) =====
Perplexity: 63.13
Classification - A

In [None]:
para = "प्रधानमंत्री नरेंद्र मोदी ने चंद्रयान-3 मिशन में शामिल इसरो के वैज्ञानिकों को संबोधित करते हुए कहा है कि चंद्रमा पर जिस जगह विक्रम लैंडर उतरा उसे शिवशक्ति पॉइंट कहा जाएगा। उन्होंने कहा, स्पेस मिशन्स के टचडाउन पॉइंट को नाम दिए जाने की वैज्ञानिक परंपरा है। चंद्रमा के जिस स्थान पर चंद्रयान-3 उतरा...भारत ने भी उसके नामकरण का फैसला किया है।"
print(predict_genre(para, "/content/drive/MyDrive/NLP_Assignment1/Bigram_Models/ngram_hindi_large.pkl", top_k=3))


[(np.str_('[national, technology]'), 0.9902949665468653), (np.str_('[technology]'), 0.00807527051060882), (np.str_('[politics, national]'), 0.0005170258991057897)]


In [None]:
import re
import math
import pickle
import numpy as np
from collections import defaultdict, Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.svm import LinearSVC
import torch

# -----------------------
# Device setup
# -----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------
# Dataset loader
# -----------------------
def load_dataset(filepath):
    data = []
    current_label = None
    current_text = []

    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith("'") and line.count("','") >= 1:
                if current_label is not None:
                    data.append((current_label, " ".join(current_text)))
                parts = line.split("','", 1)
                current_label = parts[0].strip("'")
                current_text = [parts[1].rstrip("'")] if len(parts) > 1 else []
            else:
                current_text.append(line)
        if current_label is not None:
            data.append((current_label, " ".join(current_text)))

    labels, texts = zip(*data)
    return list(labels), list(texts)

# Convert comma-separated genres to list
def parse_labels(label_str):
    return [l.strip() for l in label_str.split(",")]

# -----------------------
# Tokenization
# -----------------------
def tokenize_text(text):
    return re.findall(r"\b\w+\b", str(text).lower())

# -----------------------
# Vocabulary + Co-occurrence
# -----------------------
def build_vocab(texts, min_count=5):
    counter = Counter()
    for text in texts:
        counter.update(tokenize_text(text))
    vocab = [w for w, c in counter.items() if c >= min_count]
    print(f"Vocab size after pruning (min_count={min_count}): {len(vocab)}")
    return vocab

def build_cooccurrence(texts, vocab, window_size=5, cooc_min=2):
    word_to_id = {w: i for i, w in enumerate(vocab)}
    cooc = defaultdict(float)
    for text in texts:
        tokens = tokenize_text(text)
        for i, w in enumerate(tokens):
            wi = word_to_id.get(w)
            if wi is None: continue
            start = max(0, i - window_size)
            end = min(len(tokens), i + window_size + 1)
            for j in range(start, end):
                if i == j: continue
                wj = word_to_id.get(tokens[j])
                if wj is None: continue
                cooc[(wi, wj)] += 1.0
    # prune
    cooc = {k: v for k, v in cooc.items() if v >= cooc_min}
    print(f"Co-occurrence pairs after pruning (min_cooc={cooc_min}): {len(cooc)}")
    return cooc, word_to_id

# -----------------------
# GloVe model (PyTorch)
# -----------------------
class GloVeTorch(torch.nn.Module):
    def __init__(self, vocab_size, vector_size=100, xmax=100, alpha=0.75):
        super().__init__()
        self.W = torch.nn.Parameter(torch.randn(vocab_size, vector_size) / math.sqrt(vector_size))
        self.W_tilde = torch.nn.Parameter(torch.randn(vocab_size, vector_size) / math.sqrt(vector_size))
        self.b = torch.nn.Parameter(torch.zeros(vocab_size))
        self.b_tilde = torch.nn.Parameter(torch.zeros(vocab_size))
        self.xmax = xmax
        self.alpha = alpha

    def forward(self, i_idx, j_idx, x_ij):
        w_i = self.W[i_idx]
        w_j = self.W_tilde[j_idx]
        b_i = self.b[i_idx]
        b_j = self.b_tilde[j_idx]
        pred = torch.sum(w_i * w_j, dim=1) + b_i + b_j
        log_x = torch.log(x_ij)
        weight = torch.where(x_ij < self.xmax, (x_ij / self.xmax) ** self.alpha, torch.ones_like(x_ij))
        loss = weight * (pred - log_x) ** 2
        return torch.mean(loss)

# -----------------------
# Train GloVe with GPU
# -----------------------
def train_glove_pytorch(cooc, vocab_size, vector_size=100, epochs=20, lr=0.05, batch_size=50000):
    model = GloVeTorch(vocab_size, vector_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    i_idx = torch.tensor([k[0] for k in cooc.keys()], device=device, dtype=torch.long)
    j_idx = torch.tensor([k[1] for k in cooc.keys()], device=device, dtype=torch.long)
    x_ij = torch.tensor([v for v in cooc.values()], device=device, dtype=torch.float)

    num_pairs = len(cooc)
    for epoch in range(epochs):
        perm = torch.randperm(num_pairs)
        total_loss = 0.0
        for start in range(0, num_pairs, batch_size):
            idx = perm[start:start+batch_size]
            optimizer.zero_grad()
            loss = model(i_idx[idx], j_idx[idx], x_ij[idx])
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(idx)
        avg_loss = total_loss / num_pairs
        perplexity = math.exp(min(avg_loss, 700))  # prevent overflow
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.2f}, Pseudo-Perplexity: {perplexity:.2f}")
    embeddings = (model.W + model.W_tilde).detach()
    return embeddings

# -----------------------
# Sentence embeddings on GPU
# -----------------------
def sentence_embeddings_batch(sentences, word_to_id, embeddings_tensor):
    vecs = []
    for sent in sentences:
        tokens = tokenize_text(sent)
        ids = [word_to_id[w] for w in tokens if w in word_to_id]
        if ids:
            vec = embeddings_tensor[ids].mean(dim=0)
        else:
            vec = torch.zeros(embeddings_tensor.shape[1], device=embeddings_tensor.device)
        vecs.append(vec)
    return torch.stack(vecs)

# -----------------------
# Save / Load
# -----------------------
def save_model(obj, filepath):
    with open(filepath, "wb") as f:
        pickle.dump(obj, f)

def load_model(filepath):
    with open(filepath, "rb") as f:
        return pickle.load(f)

# -----------------------
# Train + Classifier
# -----------------------
def train_and_save_glove(train_path, test_path, lang, size, top_k=3):
    print(f"\n===== Training {lang} {size} Dataset (GloVe PyTorch) =====")
    train_labels, train_texts = load_dataset(train_path)
    train_labels = [parse_labels(lbl) for lbl in train_labels]
    test_labels, test_texts = load_dataset(test_path)
    test_labels = [parse_labels(lbl) for lbl in test_labels]

    # Set thresholds and vector size
    if size == "small":
        min_count, min_cooc, vector_size, epochs = 2, 1, 50, 50
    elif size == "medium":
        min_count, min_cooc, vector_size, epochs = 5, 2, 100, 50
    else:
        min_count, min_cooc, vector_size, epochs = 10, 5, 200, 30

    vocab = build_vocab(train_texts, min_count)
    cooc, word_to_id = build_cooccurrence(train_texts, vocab, cooc_min=min_cooc)

    embeddings = train_glove_pytorch(cooc, len(vocab), vector_size, epochs=epochs, lr=0.05)
    embeddings_tensor = embeddings.to(device)

    # Sentence embeddings
    X_train = sentence_embeddings_batch(train_texts, word_to_id, embeddings_tensor).cpu().numpy()
    X_test = sentence_embeddings_batch(test_texts, word_to_id, embeddings_tensor).cpu().numpy()

    # Flatten multi-genre labels to first label for classifier (LinearSVC is single-label)
    train_labels_single = [lbls[0] for lbls in train_labels]

    clf = LinearSVC(class_weight="balanced")
    clf.fit(X_train, train_labels_single)

    # -----------------------
    # Multi-genre-aware evaluation
    # -----------------------
    scores = clf.decision_function(X_test)
    classes = clf.classes_
    y_pred_topk = []
    for s in scores:
        idx = s.argsort()[::-1][:top_k]
        y_pred_topk.append([classes[i] for i in idx])

    # Accuracy: correct if any predicted genre matches any true genre
    correct = 0
    for pred_list, true_list in zip(y_pred_topk, test_labels):
        if any(p in true_list for p in pred_list):
            correct += 1
    acc_topk = correct / len(test_labels)
    print(f"Top-{top_k} Multi-genre-aware Accuracy: {acc_topk:.3f}")

    # Save
    # save_path = f"glove_{lang.lower()}_{size}.pkl"
    # save_model({
    #     "embeddings": embeddings.cpu().numpy(),
    #     "word_to_id": word_to_id,
    #     "classifier": clf,
    #     "classes": classes
    # }, save_path)
    # print(f"Model saved at {save_path}")

    SAVE_DIR = "/content/drive/MyDrive/NLP_Assignment1/Glove_Models"
    os.makedirs(SAVE_DIR, exist_ok=True)

    save_path = os.path.join(SAVE_DIR, f"glove_{lang.lower()}_{size}.pkl")
    save_model(
    {
        "embeddings": embeddings.cpu().numpy(),
        "word_to_id": word_to_id,
        "classifier": clf,
        "classes": clf.classes_
    },
    save_path
    )
    print(f"Model saved at {save_path}")
    return {"lang": lang, "size": size, "accuracy_topk": acc_topk}

# -----------------------
# Predict
# -----------------------
def predict_genre_glove(paragraph, model_path, top_k=3):
    saved = load_model(model_path)
    embeddings = torch.tensor(saved["embeddings"], device=device, dtype=torch.float)
    word_to_id = saved["word_to_id"]
    clf = saved["classifier"]
    classes = saved["classes"]
    X = sentence_embeddings_batch([paragraph], word_to_id, embeddings).cpu().numpy()

    # Top-k predictions
    scores = clf.decision_function(X)[0]
    idx = scores.argsort()[::-1][:top_k]
    top_genres = [classes[i] for i in idx]
    return top_genres

# -----------------------
# Example execution
# -----------------------
if __name__ == "__main__":
    # Paths
    eng_small = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_2500.txt"
    eng_med   = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_15000.txt"
    eng_large = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_30000.txt"
    eng_test  = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_test.txt"

    hin_small = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_2500.txt"
    hin_med   = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_15000.txt"
    hin_large = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_30000.txt"
    hin_test  = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_test.txt"

    results = []

    # Train Hindi datasets (you can enable English similarly)
    results.append(train_and_save_glove(hin_small, hin_test, "Hindi", "small"))
    results.append(train_and_save_glove(hin_med, hin_test, "Hindi", "medium"))
    results.append(train_and_save_glove(hin_large, hin_test, "Hindi", "large"))

    print("\nFinal Results:")
    for r in results:
        print(r)


Using device: cuda

===== Training Hindi small Dataset (GloVe PyTorch) =====
Vocab size after pruning (min_count=2): 1977
Co-occurrence pairs after pruning (min_cooc=1): 149580
Epoch 1/50, Loss: 1.24, Pseudo-Perplexity: 3.44
Epoch 2/50, Loss: 0.75, Pseudo-Perplexity: 2.11
Epoch 3/50, Loss: 0.25, Pseudo-Perplexity: 1.28
Epoch 4/50, Loss: 0.22, Pseudo-Perplexity: 1.24
Epoch 5/50, Loss: 0.09, Pseudo-Perplexity: 1.10
Epoch 6/50, Loss: 0.12, Pseudo-Perplexity: 1.13
Epoch 7/50, Loss: 0.10, Pseudo-Perplexity: 1.10
Epoch 8/50, Loss: 0.06, Pseudo-Perplexity: 1.06
Epoch 9/50, Loss: 0.06, Pseudo-Perplexity: 1.06
Epoch 10/50, Loss: 0.05, Pseudo-Perplexity: 1.05
Epoch 11/50, Loss: 0.04, Pseudo-Perplexity: 1.05
Epoch 12/50, Loss: 0.04, Pseudo-Perplexity: 1.04
Epoch 13/50, Loss: 0.03, Pseudo-Perplexity: 1.03
Epoch 14/50, Loss: 0.03, Pseudo-Perplexity: 1.03
Epoch 15/50, Loss: 0.03, Pseudo-Perplexity: 1.03
Epoch 16/50, Loss: 0.02, Pseudo-Perplexity: 1.02
Epoch 17/50, Loss: 0.02, Pseudo-Perplexity: 1.02

In [None]:
# Example prediction
para = "प्रधानमंत्री नरेंद्र मोदी ने चंद्रयान-3 मिशन में शामिल इसरो के वैज्ञानिकों को संबोधित किया।"
print("Predicted genres:", predict_genre_glove(para, "/content/drive/MyDrive/NLP_Assignment1/Glove_Models/glove_hindi_large.pkl", top_k=3))

Predicted genres: [np.str_('[national'), np.str_('[entertainment'), np.str_('[national]')]


In [None]:
import re
import math
import pickle
import numpy as np
from collections import defaultdict, Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.svm import LinearSVC
import torch
import os

# -----------------------
# Device setup
# -----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------
# Dataset loader
# -----------------------
def load_dataset(filepath):
    data = []
    current_label = None
    current_text = []

    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith("'") and line.count("','") >= 1:
                if current_label is not None:
                    data.append((current_label, " ".join(current_text)))
                parts = line.split("','", 1)
                current_label = parts[0].strip("'")
                current_text = [parts[1].rstrip("'")] if len(parts) > 1 else []
            else:
                current_text.append(line)
        if current_label is not None:
            data.append((current_label, " ".join(current_text)))

    labels, texts = zip(*data)
    return list(labels), list(texts)

# Convert comma-separated genres to list
def parse_labels(label_str):
    return [l.strip() for l in label_str.split(",")]

# -----------------------
# Tokenization
# -----------------------
def tokenize_text(text):
    return re.findall(r"\b\w+\b", str(text).lower())

# -----------------------
# Vocabulary + Co-occurrence
# -----------------------
def build_vocab(texts, min_count=5):
    counter = Counter()
    for text in texts:
        counter.update(tokenize_text(text))
    vocab = [w for w, c in counter.items() if c >= min_count]
    print(f"Vocab size after pruning (min_count={min_count}): {len(vocab)}")
    return vocab

def build_cooccurrence(texts, vocab, window_size=5, cooc_min=2):
    word_to_id = {w: i for i, w in enumerate(vocab)}
    cooc = defaultdict(float)
    for text in texts:
        tokens = tokenize_text(text)
        for i, w in enumerate(tokens):
            wi = word_to_id.get(w)
            if wi is None: continue
            start = max(0, i - window_size)
            end = min(len(tokens), i + window_size + 1)
            for j in range(start, end):
                if i == j: continue
                wj = word_to_id.get(tokens[j])
                if wj is None: continue
                cooc[(wi, wj)] += 1.0
    # prune
    cooc = {k: v for k, v in cooc.items() if v >= cooc_min}
    print(f"Co-occurrence pairs after pruning (min_cooc={cooc_min}): {len(cooc)}")
    return cooc, word_to_id

# -----------------------
# GloVe model (PyTorch)
# -----------------------
class GloVeTorch(torch.nn.Module):
    def __init__(self, vocab_size, vector_size=100, xmax=100, alpha=0.75):
        super().__init__()
        self.W = torch.nn.Parameter(torch.randn(vocab_size, vector_size) / math.sqrt(vector_size))
        self.W_tilde = torch.nn.Parameter(torch.randn(vocab_size, vector_size) / math.sqrt(vector_size))
        self.b = torch.nn.Parameter(torch.zeros(vocab_size))
        self.b_tilde = torch.nn.Parameter(torch.zeros(vocab_size))
        self.xmax = xmax
        self.alpha = alpha

    def forward(self, i_idx, j_idx, x_ij):
        w_i = self.W[i_idx]
        w_j = self.W_tilde[j_idx]
        b_i = self.b[i_idx]
        b_j = self.b_tilde[j_idx]
        pred = torch.sum(w_i * w_j, dim=1) + b_i + b_j
        log_x = torch.log(x_ij)
        weight = torch.where(x_ij < self.xmax, (x_ij / self.xmax) ** self.alpha, torch.ones_like(x_ij))
        loss = weight * (pred - log_x) ** 2
        return torch.mean(loss)

# -----------------------
# Train GloVe with GPU
# -----------------------
def train_glove_pytorch(cooc, vocab_size, vector_size=100, epochs=20, lr=0.05, batch_size=50000):
    model = GloVeTorch(vocab_size, vector_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    i_idx = torch.tensor([k[0] for k in cooc.keys()], device=device, dtype=torch.long)
    j_idx = torch.tensor([k[1] for k in cooc.keys()], device=device, dtype=torch.long)
    x_ij = torch.tensor([v for v in cooc.values()], device=device, dtype=torch.float)

    num_pairs = len(cooc)
    for epoch in range(epochs):
        perm = torch.randperm(num_pairs)
        total_loss = 0.0
        for start in range(0, num_pairs, batch_size):
            idx = perm[start:start+batch_size]
            optimizer.zero_grad()
            loss = model(i_idx[idx], j_idx[idx], x_ij[idx])
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(idx)
        avg_loss = total_loss / num_pairs
        perplexity = math.exp(min(avg_loss, 700))  # prevent overflow
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.2f}, Pseudo-Perplexity: {perplexity:.2f}")
    embeddings = (model.W + model.W_tilde).detach()
    return embeddings

# -----------------------
# Sentence embeddings on GPU
# -----------------------
def sentence_embeddings_batch(sentences, word_to_id, embeddings_tensor):
    vecs = []
    for sent in sentences:
        tokens = tokenize_text(sent)
        ids = [word_to_id[w] for w in tokens if w in word_to_id]
        if ids:
            vec = embeddings_tensor[ids].mean(dim=0)
        else:
            vec = torch.zeros(embeddings_tensor.shape[1], device=embeddings_tensor.device)
        vecs.append(vec)
    return torch.stack(vecs)

# -----------------------
# Save / Load
# -----------------------
def save_model(obj, filepath):
    with open(filepath, "wb") as f:
        pickle.dump(obj, f)

def load_model(filepath):
    with open(filepath, "rb") as f:
        return pickle.load(f)

# -----------------------
# Train + Classifier
# -----------------------
def train_and_save_glove(train_path, test_path, lang, size, top_k=3):
    print(f"\n===== Training {lang} {size} Dataset (GloVe PyTorch) =====")
    train_labels, train_texts = load_dataset(train_path)
    train_labels = [parse_labels(lbl) for lbl in train_labels]
    test_labels, test_texts = load_dataset(test_path)
    test_labels = [parse_labels(lbl) for lbl in test_labels]

    # Set thresholds and vector size
    if size == "small":
        min_count, min_cooc, vector_size, epochs = 2, 1, 50, 50
    elif size == "medium":
        min_count, min_cooc, vector_size, epochs = 5, 2, 100, 50
    else:
        min_count, min_cooc, vector_size, epochs = 10, 5, 200, 50


    vocab = build_vocab(train_texts, min_count)
    cooc, word_to_id = build_cooccurrence(train_texts, vocab, cooc_min=min_cooc)

    embeddings = train_glove_pytorch(cooc, len(vocab), vector_size, epochs=epochs, lr=0.05)
    embeddings_tensor = embeddings.to(device)

    # Sentence embeddings
    X_train = sentence_embeddings_batch(train_texts, word_to_id, embeddings_tensor).cpu().numpy()
    X_test = sentence_embeddings_batch(test_texts, word_to_id, embeddings_tensor).cpu().numpy()

    # Flatten multi-genre labels to first label for classifier (LinearSVC is single-label)
    train_labels_single = [lbls[0] for lbls in train_labels]

    clf = LinearSVC(class_weight="balanced")
    clf.fit(X_train, train_labels_single)

    # -----------------------
    # Multi-genre-aware evaluation
    # -----------------------
    scores = clf.decision_function(X_test)
    classes = clf.classes_
    y_pred_topk = []
    for s in scores:
        idx = s.argsort()[::-1][:top_k]
        y_pred_topk.append([classes[i] for i in idx])

    # Accuracy: correct if any predicted genre matches any true genre
    correct = 0
    for pred_list, true_list in zip(y_pred_topk, test_labels):
        if any(p in true_list for p in pred_list):
            correct += 1
    acc_topk = correct / len(test_labels)
    print(f"Top-{top_k} Multi-genre-aware Accuracy: {acc_topk:.3f}")

    # Save
    # save_path = f"glove_{lang.lower()}_{size}.pkl"
    # save_model({
    #     "embeddings": embeddings.cpu().numpy(),
    #     "word_to_id": word_to_id,
    #     "classifier": clf,
    #     "classes": classes
    # }, save_path)
    # print(f"Model saved at {save_path}")

    SAVE_DIR = "/content/drive/MyDrive/NLP_Assignment1/Glove_Models"
    os.makedirs(SAVE_DIR, exist_ok=True)

    save_path = os.path.join(SAVE_DIR, f"glove_{lang.lower()}_{size}.pkl")
    save_model(
    {
        "embeddings": embeddings.cpu().numpy(),
        "word_to_id": word_to_id,
        "classifier": clf,
        "classes": clf.classes_
    },
    save_path
    )
    print(f"Model saved at {save_path}")
    return {"lang": lang, "size": size, "accuracy_topk": acc_topk}

# -----------------------
# Predict
# -----------------------
def predict_genre_glove(paragraph, model_path, top_k=3):
    saved = load_model(model_path)
    embeddings = torch.tensor(saved["embeddings"], device=device, dtype=torch.float)
    word_to_id = saved["word_to_id"]
    clf = saved["classifier"]
    classes = saved["classes"]
    X = sentence_embeddings_batch([paragraph], word_to_id, embeddings).cpu().numpy()

    # Top-k predictions
    scores = clf.decision_function(X)[0]
    idx = scores.argsort()[::-1][:top_k]
    top_genres = [classes[i] for i in idx]
    return top_genres

# -----------------------
# Example execution
# -----------------------
if __name__ == "__main__":
    # Paths
    eng_small = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_2500.txt"
    eng_med   = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_15000.txt"
    eng_large = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_30000.txt"
    eng_test  = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/english/english_test.txt"

    hin_small = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_2500.txt"
    hin_med   = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_15000.txt"
    hin_large = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_30000.txt"
    hin_test  = "/content/drive/MyDrive/NLP_Assignment1/datasets 2/hindi/hindi_test.txt"

    results = []

    results.append(train_and_save_glove(eng_small, eng_test, "English", "small"))
    results.append(train_and_save_glove(eng_med, eng_test, "English", "medium"))
    results.append(train_and_save_glove(eng_large, eng_test, "English", "large"))

    # Train Hindi datasets (you can enable English similarly)
    # results.append(train_and_save_glove(hin_small, hin_test, "Hindi", "small"))
    # results.append(train_and_save_glove(hin_med, hin_test, "Hindi", "medium"))
    # results.append(train_and_save_glove(hin_large, hin_test, "Hindi", "large"))

    print("\nFinal Results:")
    for r in results:
        print(r)

    print("Predicted genres:", predict_genre_glove(para, "/content/drive/MyDrive/NLP_Assignment1/Glove_Models/glove_hindi_large.pkl", top_k=3))

Using device: cuda

===== Training English small Dataset (GloVe PyTorch) =====
Vocab size after pruning (min_count=2): 17758
Co-occurrence pairs after pruning (min_cooc=1): 2174061
Epoch 1/50, Loss: 0.07, Pseudo-Perplexity: 1.07
Epoch 2/50, Loss: 0.02, Pseudo-Perplexity: 1.02
Epoch 3/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 4/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 5/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 6/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 7/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 8/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 9/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 10/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 11/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 12/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 13/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 14/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 15/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 16/50, Loss: 0.01, Pseudo-Perplexity: 1.01
Epoch 17/50, Loss: 0.01, Pseudo-Perplexity: 

In [None]:
 # Example prediction
para = "toy story"
print("Predicted genres:", predict_genre_glove(para, "glove_english_medium.pkl", top_k=3))

Predicted genres: [np.str_('16\xa0mm film'), np.str_('operetta'), np.str_('sci-fi drama')]
