In [1]:
import sys
import numpy as np
import torch
from pathlib import Path

# Printing versions and GPU info
print(f"Python: {sys.version}")
print(f"NumPy: {np.__version__}")
print(f"Torch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU Devices: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")

Python: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
NumPy: 1.26.4
Torch: 2.5.1+cu124
CUDA available: True
GPU Devices: 2
Current GPU: Tesla T4


In [2]:
BASE_DIR = Path("/kaggle/working")
print(f"Working directory: {BASE_DIR}")

DATASET_NAME = "data-for-nlp-major"
DATA_DIR = Path(f"/kaggle/input/{DATASET_NAME}")

CHECKPOINT_DIR = BASE_DIR / "checkpoints"
CHECKPOINT_DIR.mkdir(exist_ok=True)

data_file_1 = DATA_DIR / "final_with_labels.json"
data_file_2 = DATA_DIR / "final_without_labels.json"
data_file_3 = DATA_DIR / "final_combined.json"
data_file_4 = DATA_DIR / "final_without_labels_2.json"
checkpoint_path = CHECKPOINT_DIR / "w2v_checkpoint_gpu.npz"

print(f"Data file 1: {data_file_1}")
print(f"Data file 2: {data_file_2}")
print(f"Data file 3: {data_file_3}")
print(f"Data file 4: {data_file_4}")
print(f"Checkpoint path: {checkpoint_path}")

Working directory: /kaggle/working
Data file 1: /kaggle/input/data-for-nlp-major/final_with_labels.json
Data file 2: /kaggle/input/data-for-nlp-major/final_without_labels.json
Data file 3: /kaggle/input/data-for-nlp-major/final_combined.json
Data file 4: /kaggle/input/data-for-nlp-major/final_without_labels_2.json
Checkpoint path: /kaggle/working/checkpoints/w2v_checkpoint_gpu.npz


In [3]:
from collections import Counter
import json
from pathlib import Path
import os

print("=== Vocabulary Builder ===")

try:
    def load_json(path):
        with open(path, 'r', encoding='utf-8') as f:
            arr = json.load(f)
        if not isinstance(arr, list) or not arr:
            raise ValueError(f"{path.name} must be a non-empty list")
        if "sentence" not in arr[0]:
            raise ValueError(f"{path.name} entries require a sentence field")
        return arr

    # Defining paths for vocab-building files
    for path in (data_file_1, data_file_2, data_file_4):
        if not path.exists():
            raise FileNotFoundError(f"Missing vocab source: {path}")
    
    print("Loading data_file_1, data_file_2 and data_file_4...")
    data1 = load_json(data_file_1)
    data2 = load_json(data_file_2)
    data4 = load_json(data_file_4)

    print("Processing sentences for vocab...")
    all_vocab_sentences = data1 + data2 + data4
    words = [w for entry in all_vocab_sentences for w in entry["sentence"]]
    word_counts = Counter(words)
    vocab = list(word_counts.keys())

    word2idx = {w: i for i, w in enumerate(vocab)}
    idx2word = {i: w for w, i in word2idx.items()}
    vocab_size = len(vocab)

    print(f"\nVocabulary built!")
    print(f" Total tokens: {len(words):,}")
    print(f" Unique words: {vocab_size:,}")
    print(f" Top-5 frequent: {word_counts.most_common(5)}")
    print("\nSample mappings:")
    for w in vocab[:10]:
        print(f"  '{w}': {word2idx[w]}")

    # saving vocabulary file
    vocab_path = CHECKPOINT_DIR / "vocabulary.json"
    with open(vocab_path, 'w', encoding='utf-8') as f:
        json.dump({"word2idx": word2idx, "idx2word": idx2word},
                  f, ensure_ascii=False, indent=2)
    print(f"\nVocabulary saved to: {vocab_path}")

    if not data_file_3.exists():
        raise FileNotFoundError(f"Missing pairs source: {data_file_3}")

except Exception as e:
    print(f"\nError: {e}")

=== Vocabulary Builder ===
Loading data_file_1, data_file_2 and data_file_4...
Processing sentences for vocab...

Vocabulary built!
 Total tokens: 13,889,116
 Unique words: 128,973
 Top-5 frequent: [('stock', 391365), ('rt', 362726), ('spx', 180876), ('aapl', 124425), ('spy', 116542)]

Sample mappings:
  'jpmorgan': 0
  'reel': 1
  'expectation': 2
  'beyond': 3
  'meat': 4
  'nomura': 5
  'point': 6
  'booking': 7
  'weakness': 8
  'carnival': 9

Vocabulary saved to: /kaggle/working/checkpoints/vocabulary.json


In [4]:
print("\nLoading data_file_3 and data_file_4 for training-pair generation...")
data = load_json(data_file_3) + data4

def generate_training_data(data, window_size=10):
    pairs = []
    for entry in data:
        sentence = entry["sentence"]
        sentence_length = len(sentence)
        for i, center in enumerate(sentence):
            if center not in word2idx:
                continue
            for j in range(max(0, i - window_size), min(sentence_length, i + window_size + 1)):
                if i != j and sentence[j] in word2idx:
                    pairs.append((word2idx[center], word2idx[sentence[j]]))
    return pairs

training_pairs = generate_training_data(data, window_size=5)
print(f"Number of training pairs: {len(training_pairs)}")


Loading data_file_3 and data_file_4 for training-pair generation...
Number of training pairs: 106363598


In [6]:
import torch
import os

def sigmoid(x):
    return 1 / (1 + torch.exp(-torch.clamp(x, -10, 10)))

def normalize_rows(mat):
    norm = torch.norm(mat, dim=1, keepdim=True)
    return mat / (norm + 1e-8)

# Setting device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Configurations
embedding_dim = 300
learning_rate = 0.0008
negative_samples = 10
batch_size = 4096

# Adam optimizer hyperparameters
beta1 = 0.9
beta2 = 0.999
eps = 1e-8

CHECKPOINT_DIR = "/kaggle/working/checkpoints"
checkpoint_path = os.path.join(CHECKPOINT_DIR, "model.pt")
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Early stopping
patience = 3
min_delta = 1e-3
best_loss = float("inf")
epochs_without_improve = 0

# Prepare training pairs
if isinstance(training_pairs, list):
    training_pairs = torch.tensor(training_pairs, dtype=torch.long)
elif isinstance(training_pairs, torch.Tensor):
    training_pairs = training_pairs.clone().detach().to(dtype=torch.long)
else:
    raise ValueError("training_pairs must be either list or torch.Tensor")

training_pairs = training_pairs.to(device)
vocab_size = len(word2idx)

# Model Initialization
start_epoch = 0

if os.path.exists(checkpoint_path):
    try:
        ckpt = torch.load(checkpoint_path, map_location=device, weights_only=True)
    except:
        ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)

    W_in = ckpt["W_in"].float().to(device)
    W_out = ckpt["W_out"].float().to(device)
    start_epoch = int(ckpt["epoch"])
    m_in = ckpt["m_in"].to(device)
    v_in = ckpt["v_in"].to(device)
    m_out = ckpt["m_out"].to(device)
    v_out = ckpt["v_out"].to(device)
    t_step = int(ckpt["t_step"])
    print(f"Resumed from epoch {start_epoch}")

    if torch.isnan(W_in).any() or torch.isnan(W_out).any():
        print("Checkpoint contains NaNs — resetting weights.")
        W_in = torch.empty(vocab_size, embedding_dim, device=device)
        W_out = torch.empty(embedding_dim, vocab_size, device=device)
        torch.nn.init.xavier_uniform_(W_in)
        torch.nn.init.xavier_uniform_(W_out)
        start_epoch = 0
else:
    print("No checkpoint found. Initializing weights using Xavier.")
    W_in = torch.empty(vocab_size, embedding_dim, device=device)
    W_out = torch.empty(embedding_dim, vocab_size, device=device)
    torch.nn.init.xavier_uniform_(W_in)
    torch.nn.init.xavier_uniform_(W_out)
    m_in = torch.zeros_like(W_in)
    v_in = torch.zeros_like(W_in)
    m_out = torch.zeros_like(W_out)
    v_out = torch.zeros_like(W_out)
    t_step = 0

# Negative Sampling
word_freq = torch.tensor([word_counts[idx2word[i]] for i in range(vocab_size)],
                         dtype=torch.float64).to(device)
unigram_dist = word_freq ** 0.75
unigram_dist /= unigram_dist.sum()

def get_negative_samples(batch_size, K):
    return torch.multinomial(unigram_dist, num_samples=K * batch_size,
                              replacement=True).view(batch_size, K).to(device)

# Training Loop
ema_loss = None
ema_decay = 0.95
max_grad_norm = 5.0 

# Scheduler Settings
scheduler_gamma = 0.5
scheduler_step = 5

while True: 
    epoch = start_epoch
    start_epoch += 1 

    perm = torch.randperm(len(training_pairs), device=device)
    training_pairs = training_pairs[perm]

    total_loss = 0.0
    num_batches = 0

    for i in range(0, len(training_pairs), batch_size):
        t_step += 1
        batch = training_pairs[i:i + batch_size]
        center_idxs = batch[:, 0].long()
        target_idxs = batch[:, 1].long()

        # Forward pass
        v_c = W_in[center_idxs]
        u_o = W_out[:, target_idxs]

        # Positive samples
        score_pos = torch.sum(v_c * u_o.T, dim=1)
        score_pos = torch.clamp(score_pos, -10, 10)
        pred_pos = sigmoid(score_pos)

        # Negative samples
        neg_ids = get_negative_samples(len(center_idxs), negative_samples).long()
        u_k = W_out[:, neg_ids.T].permute(2, 1, 0)
        score_neg = torch.sum(v_c[:, None, :] * u_k, dim=2)
        score_neg = torch.clamp(score_neg, -10, 10)
        pred_neg = sigmoid(-score_neg)

        # Loss calculation
        pred_pos = torch.clamp(pred_pos, 1e-7, 1 - 1e-7)
        pred_neg = torch.clamp(pred_neg, 1e-7, 1 - 1e-7)
        loss = -torch.log(pred_pos).mean() - torch.log(pred_neg).mean()
        total_loss += float(loss)
        num_batches += 1

        # EMA Loss
        ema_loss = loss if ema_loss is None else ema_decay * ema_loss + (1 - ema_decay) * loss

        # Gradients for W_in
        grad_pos = (pred_pos - 1).unsqueeze(1) * u_o.T
        grad_neg = (1 - pred_neg).unsqueeze(2) * u_k
        grad_in = grad_pos + grad_neg.mean(dim=1)

        # Gradient clipping
        grad_in = torch.clamp(grad_in, -max_grad_norm, max_grad_norm)

        # Adam update for W_in
        m_in[center_idxs] = beta1 * m_in[center_idxs] + (1 - beta1) * grad_in
        v_in[center_idxs] = beta2 * v_in[center_idxs] + (1 - beta2) * (grad_in ** 2)
        m_hat = m_in[center_idxs] / (1 - beta1 ** t_step)
        v_hat = v_in[center_idxs] / (1 - beta2 ** t_step)
        W_in[center_idxs] -= learning_rate * m_hat / (torch.sqrt(v_hat) + eps)

        # Gradients for W_out
        update_W_out = torch.zeros_like(W_out)
        update_W_out.scatter_add_(1, target_idxs.unsqueeze(0), (pred_pos - 1) * v_c.T)
        for k in range(negative_samples):
            update_W_out.scatter_add_(1, neg_ids[:, k].unsqueeze(0), grad_neg[:, k].transpose(0, 1))

        updated_cols = torch.unique(torch.cat((target_idxs, neg_ids.flatten())))
        m_out[:, updated_cols] = beta1 * m_out[:, updated_cols] + (1 - beta1) * update_W_out[:, updated_cols]
        v_out[:, updated_cols] = beta2 * v_out[:, updated_cols] + (1 - beta2) * (update_W_out[:, updated_cols] ** 2)
        m_hat_out = m_out[:, updated_cols] / (1 - beta1 ** t_step)
        v_hat_out = v_out[:, updated_cols] / (1 - beta2 ** t_step)
        W_out[:, updated_cols] -= learning_rate * m_hat_out / (torch.sqrt(v_hat_out) + eps)

    # End of epoch
    avg_loss = total_loss / num_batches
    grad_norm = torch.norm(grad_in, dim=1).mean()
    print(f"Epoch {epoch+1}, Avg Loss (EMA): {ema_loss:.4f}, Batch Avg Loss: {avg_loss:.4f}, Grad Norm: {grad_norm:.4f}")

    # Save checkpoint
    torch.save({"W_in": W_in,"W_out": W_out,"m_in": m_in,"v_in": v_in,"m_out": m_out,"v_out": v_out,"t_step": t_step,"epoch": epoch + 1}, checkpoint_path)

    # Mild regularization
    W_out = normalize_rows(W_out.T).T * 0.99 + W_out * 0.01

    # Learning Rate Scheduler
    if (epoch + 1) % scheduler_step == 0:
        learning_rate *= scheduler_gamma
        print(f"Scheduler: Reducing learning rate to {learning_rate:.6f}")
    
    # Early Stopping
    if best_loss - avg_loss > min_delta:
        best_loss = avg_loss
        epochs_without_improve = 0
    else:
        epochs_without_improve += 1
        print(f"No improvement. {epochs_without_improve}/{patience} patience used.")
        if epochs_without_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best loss: {best_loss:.4f}")
            break

Using device: cuda
No checkpoint found. Initializing weights using Xavier.
Epoch 1, Avg Loss (EMA): 1.3359, Batch Avg Loss: 1.3493, Grad Norm: 0.0396
Epoch 2, Avg Loss (EMA): 1.2792, Batch Avg Loss: 1.2967, Grad Norm: 0.2164
Epoch 3, Avg Loss (EMA): 1.2305, Batch Avg Loss: 1.2465, Grad Norm: 0.3797
Epoch 4, Avg Loss (EMA): 1.2214, Batch Avg Loss: 1.2241, Grad Norm: 0.3954
Epoch 5, Avg Loss (EMA): 1.2187, Batch Avg Loss: 1.2188, Grad Norm: 0.3948
Scheduler: Reducing learning rate to 0.000400
Epoch 6, Avg Loss (EMA): 1.2158, Batch Avg Loss: 1.2157, Grad Norm: 0.3937
Epoch 7, Avg Loss (EMA): 1.2154, Batch Avg Loss: 1.2145, Grad Norm: 0.3934
Epoch 8, Avg Loss (EMA): 1.2132, Batch Avg Loss: 1.2135, Grad Norm: 0.3907
No improvement. 1/3 patience used.
Epoch 9, Avg Loss (EMA): 1.2134, Batch Avg Loss: 1.2127, Grad Norm: 0.3888
Epoch 10, Avg Loss (EMA): 1.2122, Batch Avg Loss: 1.2121, Grad Norm: 0.3855
Scheduler: Reducing learning rate to 0.000200
No improvement. 1/3 patience used.
Epoch 11, Av

In [7]:
import numpy as np
import os

def save_full_w2v_model(W_in, word2idx, idx2word, checkpoint_dir="./checkpoints"):
    os.makedirs(checkpoint_dir, exist_ok=True)

    try:
        embeddings_path = os.path.join(checkpoint_dir, "final_w2v_embeddings.npy")
        np.save(embeddings_path, W_in.cpu().numpy() if isinstance(W_in, torch.Tensor) else W_in)
        vocab_path = os.path.join(checkpoint_dir, "final_w2v_vocabulary.npz")
        np.savez_compressed(vocab_path, word2idx=word2idx, idx2word=idx2word)

        print(f"Full model saved to: {checkpoint_dir}")
        print(f"  - Embeddings: {embeddings_path}")
        print(f"  - Vocabulary: {vocab_path}")
        return True
    except Exception as e:
        print(f"Failed to save model: {str(e)}")
        return False
        
save_full_w2v_model(W_in, word2idx, idx2word)

Full model saved to: ./checkpoints
  - Embeddings: ./checkpoints/final_w2v_embeddings.npy
  - Vocabulary: ./checkpoints/final_w2v_vocabulary.npz


True

In [9]:
from gensim.models import FastText
import os

sentences = [entry["sentence"] for entry in data]

fasttext_model = FastText(
    vector_size=300,       
    window=5,              
    min_count=1,           
    workers=4,             
    sg=1,                  
    epochs=20              
)

fasttext_model.build_vocab(sentences)
fasttext_model.train(sentences, total_examples=len(sentences), epochs=fasttext_model.epochs)

fasttext_embeddings = {word: fasttext_model.wv[word] for word in word2idx.keys() if word in fasttext_model.wv}

save_dir = "./checkpoints"
os.makedirs(save_dir, exist_ok=True)
fasttext_model.save(os.path.join(save_dir, "fasttext.model"))

print(f"FastText training complete. Embeddings available for {len(fasttext_embeddings)} words.")

FastText training complete. Embeddings available for 128973 words.


In [10]:
import numpy as np
import os
import torch

def save_hybrid_model(W_in, fasttext_model, word2idx, idx2word, checkpoint_dir="./checkpoints"):
    os.makedirs(checkpoint_dir, exist_ok=True)

    try:
        W_in_np = W_in.cpu().numpy() if isinstance(W_in, torch.Tensor) else W_in
        
        hybrid_embeddings = np.zeros_like(W_in_np)
        missing_words = []
        
        for word, idx in word2idx.items():
            if word in fasttext_model.wv:
                hybrid_embeddings[idx] = 0.5 * (W_in_np[idx] + fasttext_model.wv[word])
            else:
                hybrid_embeddings[idx] = W_in_np[idx]
                missing_words.append(word)

        embeddings_path = os.path.join(checkpoint_dir, "hybrid_embeddings.npy")
        np.save(embeddings_path, hybrid_embeddings)

        vocab_path = os.path.join(checkpoint_dir, "hybrid_vocabulary.npz")
        np.savez_compressed(vocab_path, word2idx=word2idx, idx2word=idx2word)

        print(f"Hybrid model saved to: {checkpoint_dir}")
        print(f"  - Embeddings: {embeddings_path}")
        print(f"  - Vocabulary: {vocab_path}")
        print(f"  - Note: {len(missing_words)} words used original W2V (missing in FastText)")
        return True
        
    except Exception as e:
        print(f"Failed to save hybrid model: {str(e)}")
        return False

save_hybrid_model(W_in, fasttext_model, word2idx, idx2word)

Hybrid model saved to: ./checkpoints
  - Embeddings: ./checkpoints/hybrid_embeddings.npy
  - Vocabulary: ./checkpoints/hybrid_vocabulary.npz
  - Note: 0 words used original W2V (missing in FastText)


True

In [11]:
import numpy as np
from scipy.sparse import dok_matrix, csr_matrix

def build_cooccurrence_matrix_sparse(training_pairs, vocab_size, verbose=True):
    cooccur_dok = dok_matrix((vocab_size, vocab_size), dtype=np.float32)
    
    if hasattr(training_pairs, 'cpu'):
        pairs = training_pairs.cpu().numpy()
    else:
        pairs = np.array(training_pairs, dtype=np.int32)

    if verbose:
        print(f"Building sparse co-occurrence from {len(pairs):,} pairs...")

    for center, context in pairs:
        if center < vocab_size and context < vocab_size:
            cooccur_dok[center, context] += 1.0

    cooccur_csr = cooccur_dok.tocsr()

    cooccur_csr.data[:] = np.log1p(cooccur_csr.data)

    if verbose:
        nnz = cooccur_csr.count_nonzero()
        total = vocab_size * vocab_size
        print(f"Sparse matrix built, shape={cooccur_csr.shape}, nnz={nnz:,} ({nnz/total:.2%})")

    return cooccur_csr

cooccur = build_cooccurrence_matrix_sparse(training_pairs, vocab_size=len(word2idx))

Building sparse co-occurrence from 106,363,598 pairs...
Sparse matrix built, shape=(128973, 128973), nnz=12,776,411 (0.08%)


In [12]:
from sklearn.decomposition import TruncatedSVD
import torch

def enhance_embeddings_with_svd(cooccur, W_in, embedding_dim, weight_in=0.7, weight_glove=0.3):
    try:
        is_tensor = isinstance(W_in, torch.Tensor)
        if is_tensor:
            device = W_in.device
            W_np = W_in.cpu().numpy()
        else:
            W_np = W_in

        svd = TruncatedSVD(n_components=embedding_dim)
        glove_components = svd.fit_transform(cooccur)

        if glove_components.shape[0] != W_np.shape[0]:
            raise ValueError(f"Dimension mismatch: W_in has {W_np.shape[0]} rows, "
                           f"but cooccur has {glove_components.shape[0]} rows")

        enhanced_np = weight_in * W_np + weight_glove * glove_components

        if is_tensor:
            enhanced_embeddings = torch.from_numpy(enhanced_np).to(device)
        else:
            enhanced_embeddings = enhanced_np

        return enhanced_embeddings, glove_components

    except Exception as e:
        print(f"Error in enhance_embeddings_with_svd: {str(e)}")
        raise

enhanced_embeddings, glove_components = enhance_embeddings_with_svd(cooccur, W_in, embedding_dim)

In [13]:
import numpy as np
from pathlib import Path
import torch

def save_glove_w2v_enhanced(enhanced_embeddings, word2idx, idx2word, model_name="glove_w2v_enhanced", checkpoint_dir="./checkpoints"):
    try:
        checkpoint_dir = Path(checkpoint_dir)
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        if isinstance(enhanced_embeddings, torch.Tensor):
            enhanced_embeddings = enhanced_embeddings.cpu().numpy()
        elif hasattr(enhanced_embeddings, '__cuda_array_interface__'):  # Handle CuPy arrays
            import cupy as cp
            enhanced_embeddings = cp.asnumpy(enhanced_embeddings)

        embeddings_path = checkpoint_dir / f"{model_name}_embeddings.npy"
        np.save(str(embeddings_path), enhanced_embeddings)

        vocab_path = checkpoint_dir / f"{model_name}_vocabulary.npz"
        np.savez_compressed(str(vocab_path), word2idx=word2idx, idx2word=idx2word)

        print(f"Enhanced model saved to: {checkpoint_dir}")
        print(f"  - Embeddings: {embeddings_path}")
        print(f"  - Vocabulary: {vocab_path}")
        return True

    except Exception as e:
        print(f"Failed to save enhanced model: {str(e)}")
        return False

save_success = save_glove_w2v_enhanced(enhanced_embeddings=enhanced_embeddings, word2idx=word2idx, idx2word=idx2word, model_name="glove_w2v_enhanced")

Enhanced model saved to: checkpoints
  - Embeddings: checkpoints/glove_w2v_enhanced_embeddings.npy
  - Vocabulary: checkpoints/glove_w2v_enhanced_vocabulary.npz


In [14]:
import numpy as np
import torch

def combine_embeddings(W_in, fasttext_model, glove_components, idx2word, weights=(0.5, 0.3, 0.2)):
    try:
        is_tensor = isinstance(W_in, torch.Tensor)
        if is_tensor:
            device = W_in.device
            W_np = W_in.cpu().numpy()
        else:
            W_np = W_in

        fasttext_vecs = []
        missing_words = []
        
        for word in idx2word.values():
            if word in fasttext_model.wv:
                fasttext_vecs.append(fasttext_model.wv[word])
            else:
                fasttext_vecs.append(np.zeros(W_np.shape[1]))  
                missing_words.append(word)
                
        fasttext_vecs = np.array(fasttext_vecs)
        
        if missing_words:
            print(f"Note: Used zero vectors for {len(missing_words)} words missing in FastText")

        if W_np.shape != fasttext_vecs.shape:
            raise ValueError(f"Shape mismatch: W_in {W_np.shape} vs FastText {fasttext_vecs.shape}")
        if W_np.shape[0] != glove_components.shape[0]:
            raise ValueError(f"Row count mismatch: W_in {W_np.shape[0]} vs GloVe {glove_components.shape[0]}")

        w1, w2, w3 = weights
        combined_np = w1 * W_np + w2 * fasttext_vecs + w3 * glove_components

        if is_tensor:
            return torch.from_numpy(combined_np).to(device)
        return combined_np

    except Exception as e:
        print(f"Error combining embeddings: {str(e)}")
        raise

final_embeddings = combine_embeddings(
    W_in=W_in, 
    fasttext_model=fasttext_model,
    glove_components=glove_components,
    idx2word=idx2word,
    weights=(0.5, 0.3, 0.2)
)

In [15]:
import numpy as np
from pathlib import Path
import torch

def save_combined_embeddings(final_embeddings, model_name="w2v_fasttext_glove_combined", checkpoint_dir="./checkpoints"):
    try:
        checkpoint_dir = Path(checkpoint_dir)
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        if isinstance(final_embeddings, torch.Tensor):
            final_embeddings = final_embeddings.cpu().numpy()
        elif hasattr(final_embeddings, '__cuda_array_interface__'):
            import cupy as cp
            final_embeddings = cp.asnumpy(final_embeddings)

        embeddings_path = checkpoint_dir / f"{model_name}_embeddings.npy"
        np.save(str(embeddings_path), final_embeddings)

        print(f"Combined embeddings saved to: {embeddings_path}")
        return True

    except Exception as e:
        print(f"Failed to save combined embeddings: {str(e)}")
        return False

save_combined_embeddings(final_embeddings)

Combined embeddings saved to: checkpoints/w2v_fasttext_glove_combined_embeddings.npy


True

In [17]:
# import shutil

# src_dir = "/kaggle/working/checkpoints"
# archive_name = "/kaggle/working/checkpoints"  

# shutil.make_archive(archive_name, 'zip', src_dir)
# print("Created archive at:", archive_name + ".zip")

Created archive at: /kaggle/working/checkpoints.zip
