In [None]:
import sys
import numpy as np
import torch
from pathlib import Path

# Environment report: interpreter and library versions, then CUDA visibility.
# One print call with a newline separator emits the same lines in the same order.
print(
    f"Python: {sys.version}",
    f"NumPy: {np.__version__}",
    f"Torch: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"GPU Devices: {torch.cuda.device_count()}",
    sep="\n",
)
# The device name is only queried when CUDA is usable.
if torch.cuda.is_available():
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Writable scratch area for everything this notebook produces.
BASE_DIR = Path("/kaggle/working")
print(f"Working directory: {BASE_DIR}")

# Dataset directory (read-only, replace with actual folder name from sidebar)
DATASET_NAME = "data-for-nlp-major"  # <-- CHANGE THIS
DATA_DIR = Path(f"/kaggle/input/{DATASET_NAME}")

# Checkpoint directory (writeable).
# parents=True creates missing intermediate directories too, so a fresh
# environment without BASE_DIR does not fail here; this matches the
# os.makedirs(CHECKPOINT_DIR, exist_ok=True) call in the training cell.
CHECKPOINT_DIR = BASE_DIR / "checkpoints"
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# File paths
data_file = DATA_DIR / "final.json"
# NOTE(review): the training cell writes this file with torch.save, so despite
# the ".npz" suffix it is not a NumPy archive. The name is kept so existing
# checkpoints can still be found.
checkpoint_path = CHECKPOINT_DIR / "w2v_checkpoint_gpu.npz"

# Output paths
print(f"Data file: {data_file}")
print(f"Checkpoint path: {checkpoint_path}")

In [None]:
def generate_training_data(data, window_size=5, vocab=None):
    """Build skip-gram (center, context) index pairs from tokenised sentences.

    Args:
        data: Iterable of mappings; each must have a "sentence" key holding a
            sequence of tokens.
        window_size: Maximum distance, in tokens, between a center word and a
            context word on either side.
        vocab: Optional token -> integer id mapping. When omitted, the
            module-level ``word2idx`` is used, so existing callers are
            unaffected; passing it explicitly removes the dependency on
            notebook state.

    Returns:
        A list of ``(center_id, context_id)`` tuples. Tokens missing from the
        vocabulary are skipped both as centers and as contexts, but they still
        occupy a position inside the window.
    """
    if vocab is None:
        vocab = word2idx

    pairs = []
    for entry in data:
        sentence = entry["sentence"]
        # Resolve each token once per sentence (None = out of vocabulary)
        # instead of repeating the lookup for every window position.
        token_ids = [vocab[token] if token in vocab else None for token in sentence]
        sentence_length = len(token_ids)

        for i, center_id in enumerate(token_ids):
            if center_id is None:
                continue
            window_start = max(0, i - window_size)
            window_end = min(sentence_length, i + window_size + 1)
            for j in range(window_start, window_end):
                context_id = token_ids[j]
                if j != i and context_id is not None:
                    pairs.append((center_id, context_id))
    return pairs

# Materialise every (center, context) index pair for the corpus using a
# window of 5 tokens on each side, then report how many pairs were produced.
# NOTE(review): `data` is not defined in the cells shown here — it must be
# loaded by an earlier cell before this one runs.
training_pairs = generate_training_data(data, window_size=5)
print(f"Number of training pairs: {len(training_pairs)}")

In [None]:
import torch
import os

# Report the active CUDA device index, or "cpu" when no GPU is usable.
# torch.cuda.current_device() raises on hosts without CUDA, which would abort
# the whole cell before any training starts, so it is only called when CUDA
# is available.
# NOTE(review): the tensors in this cell are created without a device
# argument, so the computation itself appears to run on the CPU — confirm
# whether GPU placement was intended.
print("Using device:", torch.cuda.current_device() if torch.cuda.is_available() else "cpu")

# --- Config ---
embedding_dim = 100      # width of each word vector
learning_rate = 0.01     # Adam step size
num_epochs = 30          # total epochs, including any already in a checkpoint
negative_samples = 8     # negatives drawn per (center, context) pair
batch_size = 1024        # training pairs per update step

# Make sure the checkpoint directory exists before the first save.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# --- Data ---
# Pairs are stored as int64: they are used as index tensors, and PyTorch's
# scatter/index_add kernels only accept int64 indices.
# torch.as_tensor also makes the cell safe to re-run: when training_pairs is
# already a tensor it is reused instead of triggering torch.tensor's
# copy-construct warning.
# reshape(-1, 2) keeps the (N, 2) layout even when there are no pairs at all.
training_pairs = torch.as_tensor(training_pairs, dtype=torch.long).reshape(-1, 2)
vocab_size = len(word2idx)

# --- Load or initialize checkpoint ---
# Resume weights AND Adam state from disk when a usable checkpoint exists;
# otherwise start from small uniform random weights with zeroed optimizer state.
start_epoch = 0
scale = 0.01   # half-width of the uniform init range
t_step = 0     # global Adam step counter (drives bias correction)
resumed = False

if os.path.exists(checkpoint_path):
    # map_location="cpu" lets a checkpoint written from CUDA tensors load on a
    # CPU-only host. torch.load unpickles the file, so only load checkpoints
    # produced by this notebook.
    ckpt = torch.load(checkpoint_path, map_location="cpu")
    W_in = ckpt["W_in"].float()
    W_out = ckpt["W_out"].float()

    if torch.isnan(W_in).any() or torch.isnan(W_out).any():
        # A poisoned checkpoint is discarded as a whole: its Adam moments and
        # step counter belong to the diverged run and must not be reused with
        # fresh weights.
        print("Checkpoint contains NaNs — resetting weights.")
    else:
        # Load optimizer states
        m_in = ckpt["m_in"]
        v_in = ckpt["v_in"]
        m_out = ckpt["m_out"]
        v_out = ckpt["v_out"]
        t_step = int(ckpt["t_step"])
        start_epoch = int(ckpt["epoch"])
        resumed = True
        print(f"Resumed from epoch {start_epoch}")
else:
    print("No checkpoint found. Initializing weights.")

if not resumed:
    # W_in is (vocab_size, embedding_dim); W_out is stored transposed,
    # (embedding_dim, vocab_size).
    W_in = torch.FloatTensor(vocab_size, embedding_dim).uniform_(-scale, scale)
    W_out = torch.FloatTensor(embedding_dim, vocab_size).uniform_(-scale, scale)
    # Initialize optimizer states
    m_in = torch.zeros_like(W_in)
    v_in = torch.zeros_like(W_in)
    m_out = torch.zeros_like(W_out)
    v_out = torch.zeros_like(W_out)
    t_step = 0
    start_epoch = 0

# --- Adam Optimizer Hyperparameters ---
# beta1 / beta2: decay rates of the first- and second-moment running averages.
beta1, beta2 = 0.9, 0.999
# eps: added to the denominator of the update to avoid division by zero.
eps = 1e-8

# --- Helper Functions ---
def sigmoid(x):
    """Logistic function of ``x`` after clamping the input to [-10, 10].

    The clamp bounds the exponent, so the result stays strictly inside (0, 1).
    """
    bounded = torch.clamp(x, -10, 10)
    denominator = 1 + torch.exp(-bounded)
    return 1 / denominator

def normalize_rows(mat):
    """Scale each row of a 2-D tensor to (approximately) unit L2 norm.

    1e-8 is added to every row norm before dividing, so an all-zero row comes
    back as zeros instead of NaN.
    """
    row_norms = torch.norm(mat, dim=1, keepdim=True)
    safe_norms = row_norms + 1e-8
    return mat / safe_norms

# Noise distribution for negative sampling: per-word counts, ordered by word
# id, raised to the power 0.75 and then normalised to sum to 1.
word_freq = torch.tensor(
    [word_counts[idx2word[word_id]] for word_id in range(vocab_size)],
    dtype=torch.float64,
)
unigram_dist = word_freq.pow(0.75)
unigram_dist.div_(unigram_dist.sum())

def get_negative_samples(batch_size, K, dist=None):
    """Draw ``K`` negative word ids for each of ``batch_size`` examples.

    Args:
        batch_size: Number of examples (rows) to sample for.
        K: Number of negative ids per example.
        dist: Optional 1-D tensor of non-negative sampling weights indexed by
            word id. Defaults to the module-level ``unigram_dist``.

    Returns:
        An int64 tensor of shape ``(batch_size, K)``; every entry is an
        independent draw (with replacement) from ``dist``.
    """
    if dist is None:
        dist = unigram_dist

    total = batch_size * K
    if total <= 0:
        # torch.multinomial rejects a zero-sized draw; return an empty result
        # of the right shape instead.
        return torch.empty((max(batch_size, 0), max(K, 0)), dtype=torch.long, device=dist.device)

    # One flat draw of batch_size * K ids, reshaped so row b holds the K
    # negatives of example b.
    flat_ids = torch.multinomial(dist, num_samples=total, replacement=True)
    return flat_ids.view(batch_size, K)

# --- Training Loop ---
# Skip-gram with negative sampling, trained with a hand-written sparse Adam:
# only the embedding rows / columns touched by a batch get their moments and
# weights updated. Per-example objective:
#     -log sigmoid(v_c . u_o) - mean_k log sigmoid(-v_c . u_k)
ema_loss = None    # exponential moving average of the batch loss (for logging)
ema_decay = 0.95

for epoch in range(start_epoch, num_epochs):
    # Reshuffle the pairs at the start of every epoch.
    training_pairs = training_pairs[torch.randperm(training_pairs.size(0))]
    total_loss = 0.0
    num_batches = 0

    for i in range(0, len(training_pairs), batch_size):
        t_step += 1
        # The unique / index_add_ bookkeeping below needs int64 indices.
        batch = training_pairs[i:i + batch_size].long()
        center_idxs = batch[:, 0]
        target_idxs = batch[:, 1]
        rows_in_batch = center_idxs.numel()

        # Forward pass
        v_c = W_in[center_idxs]            # (B, D) center vectors
        u_o = W_out[:, target_idxs]        # (D, B) true-context vectors

        # Positive samples
        score_pos = torch.clamp(torch.sum(v_c * u_o.T, dim=1), -10, 10)
        pred_pos = sigmoid(score_pos)      # (B,)

        # Negative samples
        neg_ids = get_negative_samples(rows_in_batch, negative_samples).long()
        if neg_ids.dim() == 1:
            # The sampler may hand back a single draw of K ids; share it
            # across every example in the batch.
            neg_ids = neg_ids.unsqueeze(0).expand(rows_in_batch, -1)
        # W_out[:, neg_ids] is (D, B, K); reorder to (B, K, D).
        # Tensor.transpose only swaps two dims, so permute is required here.
        u_k = W_out[:, neg_ids].permute(1, 2, 0)
        score_neg = torch.clamp(torch.sum(v_c[:, None, :] * u_k, dim=2), -10, 10)
        pred_neg = sigmoid(-score_neg)     # (B, K)

        # Loss calculation
        pred_pos = torch.clamp(pred_pos, 1e-7, 1 - 1e-7)
        pred_neg = torch.clamp(pred_neg, 1e-7, 1 - 1e-7)
        loss = -torch.log(pred_pos).mean() - torch.log(pred_neg).mean()
        loss_value = float(loss)
        total_loss += loss_value
        num_batches += 1

        # Update EMA loss
        ema_loss = loss_value if ema_loss is None else ema_decay * ema_loss + (1 - ema_decay) * loss_value

        # Shared factors: d(loss)/d(score) for the positive and negative terms.
        pos_coef = (pred_pos - 1).reshape(-1, 1)     # (B, 1)
        neg_coef = (1 - pred_neg)[:, :, None]        # (B, K, 1)
        # Adam bias-correction terms for this step.
        bias1 = 1 - beta1 ** t_step
        bias2 = 1 - beta2 ** t_step

        # --- W_in Gradients ---
        grad_pos = pos_coef * u_o.T                  # (B, D)
        grad_neg = neg_coef * u_k                    # (B, K, D)
        grad_in = grad_pos + grad_neg.mean(dim=1)
        grad_in = torch.clamp(grad_in, -1, 1)  # Tighter clipping

        # A word can be the center of several pairs in one batch. Sum those
        # gradients per unique row first; assigning through a repeated index
        # would keep only one of them.
        center_rows, center_inverse = torch.unique(center_idxs, return_inverse=True)
        grad_rows = grad_in.new_zeros(center_rows.numel(), grad_in.size(1))
        grad_rows.index_add_(0, center_inverse, grad_in)

        # Adam update for W_in
        m_rows = beta1 * m_in[center_rows] + (1 - beta1) * grad_rows
        v_rows = beta2 * v_in[center_rows] + (1 - beta2) * (grad_rows ** 2)
        m_in[center_rows] = m_rows
        v_in[center_rows] = v_rows
        m_hat = m_rows / bias1
        v_hat = v_rows / bias2
        W_in[center_rows] -= learning_rate * m_hat / (torch.sqrt(v_hat) + eps)

        # --- W_out Gradients ---
        # The gradient w.r.t. an output vector is coefficient * v_c (the center
        # vector), for positives and negatives alike. Negatives carry the same
        # 1/K factor as in grad_in so both match the loss above.
        grad_u_pos = pos_coef * v_c                                     # (B, D)
        grad_u_neg = (neg_coef / negative_samples) * v_c[:, None, :]    # (B, K, D)
        out_ids = torch.cat((target_idxs, neg_ids.reshape(-1)))
        out_grads = torch.cat((grad_u_pos, grad_u_neg.reshape(-1, v_c.size(1))))

        # Accumulate per unique column only, rather than materialising a dense
        # (D, vocab) gradient matrix every batch.
        updated_cols, col_inverse = torch.unique(out_ids, return_inverse=True)
        grad_cols = out_grads.new_zeros(updated_cols.numel(), out_grads.size(1))
        grad_cols.index_add_(0, col_inverse, out_grads)
        grad_cols = grad_cols.T                                         # (D, U) like W_out

        # Adam update for W_out
        m_cols = beta1 * m_out[:, updated_cols] + (1 - beta1) * grad_cols
        v_cols = beta2 * v_out[:, updated_cols] + (1 - beta2) * (grad_cols ** 2)
        m_out[:, updated_cols] = m_cols
        v_out[:, updated_cols] = v_cols
        m_hat_out = m_cols / bias1
        v_hat_out = v_cols / bias2
        W_out[:, updated_cols] -= learning_rate * m_hat_out / (torch.sqrt(v_hat_out) + eps)

    # End of epoch
    if num_batches == 0:
        # Without this guard the average below divides by zero and grad_in is
        # undefined.
        print("No training pairs available - skipping training.")
        break

    avg_loss = total_loss / num_batches
    # Gradient statistics refer to the last batch of the epoch.
    grad_norm = torch.norm(grad_in, dim=1).mean()
    print(f"Epoch {epoch+1}, Avg Loss (EMA): {ema_loss:.4f}, Batch Avg Loss: {avg_loss:.4f}, Max Grad: {torch.max(torch.abs(grad_in)):.4f}, Grad Norm: {grad_norm:.4f}")

    # Mild regularization
    # NOTE(review): as written, 99% of each output vector is replaced by its
    # unit-normalised version, which is a strong renormalisation rather than a
    # mild one — confirm the 0.99 / 0.01 weights are not swapped.
    W_out = normalize_rows(W_out.T).T * 0.99 + W_out * 0.01

    # NaN check
    if torch.any(torch.isnan(W_in)) or torch.any(torch.isnan(W_out)):
        print("NaN detected - reinitializing weights")
        W_in = torch.FloatTensor(vocab_size, embedding_dim).uniform_(-scale, scale)
        W_out = torch.FloatTensor(embedding_dim, vocab_size).uniform_(-scale, scale)
        # Fresh weights need fresh optimizer state; stale (possibly NaN)
        # moments would corrupt the very next update.
        m_in = torch.zeros_like(W_in)
        v_in = torch.zeros_like(W_in)
        m_out = torch.zeros_like(W_out)
        v_out = torch.zeros_like(W_out)
        t_step = 0

    # Save checkpoint with optimizer states.
    # Saved after regularisation and the NaN check, so a resumed run starts
    # from exactly the state the next epoch would have used and NaN weights
    # are never written to disk.
    torch.save({
        "W_in": W_in,
        "W_out": W_out,
        "m_in": m_in,
        "v_in": v_in,
        "m_out": m_out,
        "v_out": v_out,
        "t_step": t_step,
        "epoch": epoch+1
    }, checkpoint_path)