## Textual Features Extraction from Transcript 

In [2]:
import os
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

In [3]:
device = torch.device("mps" if torch.mps.is_available() else "cpu")
print("Using device:", device)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.to(device)
model.eval()   

Using device: mps


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
def get_bert_embedding(text, max_len=512):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_len
    )
    
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # CLS token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # (1, 768)
    
    emb = cls_embedding.cpu().numpy().squeeze()
    
    # Normalize
    emb = (emb - emb.mean()) / emb.std()
    
    return emb

In [None]:
def batch_text_embeddings(txt_dir, save_dir):
    os.makedirs(save_dir, exist_ok=True)

    txt_files = [f for f in os.listdir(txt_dir) if f.endswith(".txt")]

    for txt_file in tqdm(txt_files, desc=f"Processing {txt_dir}"):
        save_path = os.path.join(save_dir, txt_file.replace(".txt", ".npy"))
        
        if os.path.exists(save_path):
            continue 
                
        txt_path = os.path.join(txt_dir, txt_file)
        
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read().strip()
        
        if len(text) == 0:
            print("⚠ Empty transcript:", txt_file)
            continue
        
        embedding = get_bert_embedding(text)
        np.save(save_path, embedding)

In [7]:
batch_text_embeddings(
    txt_dir="transcripts/train",
    save_dir="features/text/train"
)

Processing transcripts/train: 100%|██████████| 109/109 [00:17<00:00,  6.12it/s]


In [8]:
batch_text_embeddings(
    txt_dir="transcripts/test",
    save_dir="features/text/test"
)

Processing transcripts/test: 100%|██████████| 12/12 [00:01<00:00,  9.77it/s]
