In [1]:
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
from tqdm import tqdm
import torch

In [2]:
# Load the StackExchange questions from <parent of CWD>/data, indexed by question_id.
# When a question_id occurs more than once, only its first row is kept.
dataset_path = str(Path.cwd().parent / 'data' / 'stackexchange_dataset.csv')
data = (
    pd.read_csv(dataset_path, index_col='question_id')
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)

In [3]:
import ollama
from transformers import AutoTokenizer, AutoModel
# Hugging Face Hub id of the checkpoint handed to AutoTokenizer / AutoModel.from_pretrained.
# The batch size is defined once, in the extraction configuration cell further down; the
# conflicting `BATCH_SIZE = 100` that used to sit here was overwritten before it was ever read.
model_name = 'Qwen/Qwen3-Embedding-8B'

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# all_embeddings = []

In [5]:
# Keep only the two text columns that the extraction cell reads.
data = data.loc[:, ['title', 'question_text']]
# data_to_embed = data['title'].tolist() + data['question_text'].tolist()
# print(f"Total texts to embed: {len(data_to_embed)}")

In [6]:
# len(data_to_embed)

In [None]:
# --- Extraction configuration ---
# HDF5 file the embeddings are streamed into (<parent of CWD>/data/...).
OUTPUT_PATH = str(Path.cwd().parent / 'data' / 'stackexchange_embeddings_tokenized_f32.h5')
# Number of questions sent through the model per forward pass.
BATCH_SIZE = 4
# Question bodies are padded/truncated to this many tokens.
MAX_LEN_BODY = 32
# Titles are padded/truncated to this many tokens.
MAX_LEN_TITLE = 4
# Size of the last axis of the stored embeddings.
EMBED_DIM = 4096

In [8]:
# Load the tokenizer and the float32 model for `model_name`; device_map="auto" lets the
# loader decide where the weights are placed. The model is switched to eval mode.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
model = AutoModel.from_pretrained(
    model_name,
    dtype=torch.float32,
    device_map="auto",
    trust_remote_code=True,
).eval()
model

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.50s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


Qwen3Model(
  (embed_tokens): Embedding(151665, 4096)
  (layers): ModuleList(
    (0-35): 36 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=4096, out_features=12288, bias=False)
        (up_proj): Linear(in_features=4096, out_features=12288, bias=False)
        (down_proj): Linear(in_features=12288, out_features=4096, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen3RMSNorm((4096,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((4096,), eps=1e-06)
    )
  )
  (norm): Qwen

In [9]:
import h5py

In [None]:
# Embed every question and stream the results into an HDF5 file at OUTPUT_PATH.
#
# Datasets written (N = num_samples):
#   body_seq      (N, MAX_LEN_BODY, EMBED_DIM)  float32  last-layer hidden state of each body token
#   body_mask     (N, MAX_LEN_BODY)             bool     tokenizer attention mask for body_seq
#   title_emb     (N, EMBED_DIM)                float32  mask-weighted mean of the title token states
#   question_ids  (N,)                          int64    `data` index values, row-aligned with the rest
#
# NOTE: float32 inference is very slow here (author's estimate: ~40 h for the whole dataset).
# NOTE: mode 'w' truncates any existing file at OUTPUT_PATH, so re-running starts from scratch.
titles = data['title'].tolist()
bodies = data['question_text'].tolist()
ids = data.index.tolist()

num_samples = len(titles)

# padding="max_length" needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

with h5py.File(OUTPUT_PATH, 'w') as f:
    # Body (sequence): one chunk per question so a single row can be read back on its own.
    dset_body = f.create_dataset(
        "body_seq",
        shape=(num_samples, MAX_LEN_BODY, EMBED_DIM),
        dtype='float32',
        chunks=(1, MAX_LEN_BODY, EMBED_DIM),
    )

    # Body mask: True where the tokenizer's attention mask is 1.
    dset_mask = f.create_dataset("body_mask", shape=(num_samples, MAX_LEN_BODY), dtype='bool')

    # Title (pooled): one vector per question.
    dset_title = f.create_dataset("title_emb", shape=(num_samples, EMBED_DIM), dtype='float32')

    dset_ids = f.create_dataset("question_ids", shape=(num_samples,), dtype='i8')

    print("Starting extraction...")

    for i in tqdm(range(0, num_samples, BATCH_SIZE)):
        batch_end = min(i + BATCH_SIZE, num_samples)

        batch_titles = titles[i:batch_end]
        batch_bodies = bodies[i:batch_end]
        batch_ids = ids[i:batch_end]

        # --- PROCESS BODY (Sequence) ---
        inputs_body = tokenizer(
            batch_bodies,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN_BODY,
        ).to(model.device)

        with torch.no_grad():
            # Only the final layer is stored, so the per-layer hidden states are not
            # requested (output_hidden_states would keep every layer's activations alive).
            out_body = model(**inputs_body)
            embeddings_body = out_body.last_hidden_state
            # Bound the values to [-1e5, 1e5] before they are written out.
            embeddings_body = torch.clamp(embeddings_body, min=-1e5, max=1e5)
            embeddings_body = embeddings_body.cpu().numpy()
            masks_body = inputs_body.attention_mask.cpu().numpy().astype(bool)

        # --- PROCESS TITLE (Pooled) ---
        inputs_title = tokenizer(
            batch_titles,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN_TITLE,
        ).to(model.device)

        with torch.no_grad():
            out_title = model(**inputs_title)
            # Mean pooling: average the token states over positions whose attention mask is 1.
            token_embeddings = out_title.last_hidden_state
            input_mask_expanded = inputs_title.attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
            # Clamp the token count away from zero so an all-padding row cannot divide by zero.
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            pooled_title = sum_embeddings / sum_mask
            pooled_title = torch.clamp(pooled_title, min=-1e5, max=1e5)

            pooled_title = pooled_title.cpu().numpy()

        # --- WRITE DIRECTLY TO DISK ---
        dset_body[i:batch_end] = embeddings_body
        dset_mask[i:batch_end] = masks_body
        dset_title[i:batch_end] = pooled_title
        dset_ids[i:batch_end] = batch_ids

print(f"Done! Saved to {OUTPUT_PATH}")

Starting extraction...


  0%|          | 0/24998 [00:00<?, ?it/s]

  0%|          | 3/24998 [00:36<85:06:47, 12.26s/it]


KeyboardInterrupt: 