In [1]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.decoders import WordPiece as WordPieceDecoder

In [None]:
class WordPieceTokenizer:
    def __init__(self, vocab_size=30000, min_frequency=2):
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency
        self.tokenizer = None

    def train(self, texts):
        self.tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        self.tokenizer.pre_tokenizer = Whitespace()

        trainer = WordPieceTrainer(
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=["[UNK]", "[PAD]"]
        )

        self.tokenizer.train_from_iterator(texts, trainer)
        self.tokenizer.decoder = WordPieceDecoder(prefix="##")

    def encode(self, text):
        return self.tokenizer.encode(text).ids

    def encode_tokens(self, text):
        return self.tokenizer.encode(text).tokens

    def decode(self, ids):
        return self.tokenizer.decode(ids)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [4]:
def generate_cbow_pairs(encoded_text, window_size):
    pairs = []
    n = len(encoded_text)
    for i in range(n):
        target = encoded_text[i]
        context = []
        for j in range(i - window_size, i + window_size + 1):
            if j != i and 0 <= j < n:
                context.append(encoded_text[j])
        if context:
            pairs.append((context, target))
    return pairs

In [5]:
class CBOWDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        context, target = self.pairs[idx]
        return torch.tensor(context, dtype=torch.long), torch.tensor(target, dtype=torch.long)

In [6]:
def collate_fn(batch):
    contexts, targets = zip(*batch)

    max_len = max(len(c) for c in contexts)
    padded_contexts = []

    pad_id = 1
    
    for c in contexts:
        pad = torch.full(max_len - len(c), pad_id, dtype=torch.long)
        padded_contexts.append(torch.cat([c, pad]))

    context_tensor = torch.stack(padded_contexts)   # [B, C]
    target_tensor = torch.stack(targets)            # [B]

    return context_tensor, target_tensor

In [7]:
def build_neg_dist(encoded_corpus, vocab_size):
    counts = torch.zeros(vocab_size)
    for t in encoded_corpus:
        counts[t] += 1
    dist = counts ** 0.75
    dist /= dist.sum()
    return dist

In [17]:
class CBOW(nn.Module):
	def __init__(self, vocab_size, emb_dim): 
		super().__init__() 
		self.in_embed = nn.Embedding(vocab_size, emb_dim) 
		self.out_embed = nn.Embedding(vocab_size, emb_dim) 
	def forward(self, context_ids, target_ids, neg_ids): 
		# context_ids: [B,C] 
		# target_ids: [B]
		# neg_ids: [B,K] 
		context_vec = self.in_embed(context_ids) # [B,C,D] 
		context_mean = context_vec.mean(dim=1) # [B,D] 
		
		target_vec = self.out_embed(target_ids) # [B,D] 
		neg_vec = self.out_embed(neg_ids) # [B,K,D] 
		
		pos_score = torch.sum(context_mean * target_vec, dim=1) # [B] 
		pos_loss = torch.log(torch.sigmoid(pos_score) + 1e-9) 
		
		neg_score = torch.bmm(neg_vec, context_mean.unsqueeze(2)).squeeze(2) # [B,K] 
		neg_loss = torch.log(torch.sigmoid(-neg_score) + 1e-9).sum(dim=1) 
		
		loss = -(pos_loss + neg_loss).mean() 
		return loss
	    

In [None]:
from tqdm.notebook import tqdm
def train_cbow(encoded_corpus, vocab_size, window_size, emb_dim=384,
               batch_size=128, neg_k=10, epochs=5, lr=0.002):

    pairs = generate_cbow_pairs(encoded_corpus, window_size)
    dataset = CBOWDataset(pairs)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    neg_dist = build_neg_dist(encoded_corpus, vocab_size)

    def sample_neg(batch_size, k):
        return torch.multinomial(neg_dist, batch_size * k, replacement=True).view(batch_size, k)

    model = CBOW(vocab_size, emb_dim)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        total_loss = 0
        pbar = tqdm(loader, desc=f"Epoch {epoch+1}/{epochs}")
        for context_ids, target_ids in pbar:
            B = context_ids.size(0)
            neg_ids = sample_neg(B, neg_k)

            loss = model(context_ids, target_ids, neg_ids)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix(loss=loss.item())

        print(f"Epoch {epoch+1}: Loss {total_loss:.4f}")

    return model


In [10]:
with open("AllCombined.txt", "r", encoding="utf-8") as f:
    texts = [line.strip() for line in f if line.strip()]


tokenizer = WordPieceTokenizer(vocab_size=30000)
tokenizer.train(texts)

In [11]:
pad_id = tokenizer.tokenizer.token_to_id("[PAD]")
print("PAD id:", pad_id)

PAD id: 1


In [1]:
encoded_corpus = []
i=0
for t in texts:
    i+=1
    if i%10000 == 0:
    encoded_corpus.extend(tokenizer.encode(t))


IndentationError: expected an indented block after 'if' statement on line 5 (3034930092.py, line 6)

In [None]:
model = train_cbow(encoded_corpus, vocab_size=30000, window_size=5, emb_dim=384,
               batch_size=512, neg_k=10, epochs=3, lr=0.003)
embeddings = model.in_embed.weight

In [None]:
import numpy as np

In [None]:
def build_doc_vector(text, tokenizer, embeddings):
    ids = tokenizer.encode(text)
    
    if len(ids) == 0:
        return np.zeros(embeddings.shape[1])
    
    vecs = embeddings[ids]          # [T,384]
    doc_vec = vecs.mean(dim=0)      # [384]
    return doc_vec.cpu().numpy()

In [None]:
def vectorize_corpus(texts, tokenizer, embeddings):
    X = []
    for t in texts:
        X.append(build_doc_vector(t, tokenizer, embeddings))
    return np.vstack(X)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

lines = open("Sentences_50Agree.txt").read().splitlines()
texts = []
labels = []
label_map = {
    ".@negative": 0,
    ".@neutral": 1,
    ".@positive": 2
}
for line in lines:
    parts = line.rsplit(" ", 1)   # split last token
    texts.append(parts[0])
    labels.append(label_map[parts[1]])
X_train = vectorize_corpus(texts, tokenizer, embeddings)
y_train = labels
lines = open("Sentences_AllAgree.txt").read().splitlines()
texts = []
labels = []
for line in lines:
    parts = line.rsplit(" ", 1)   # split last token
    texts.append(parts[0])
    labels.append(label_map[parts[1]])
X_test  = vectorize_corpus(texts, tokenizer, embeddings)
y_test = labels

clf = LogisticRegression(max_iter=2000, solver="liblinear")
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

f1 = f1_score(y_test, pred, average="macro")
print("CBOW F1:", f1)


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

def vader_predict(texts):
    preds = []
    for t in texts:
        score = sia.polarity_scores(t)["compound"]
        
        if score >= 0.05:
            preds.append(2)
        elif score <= -0.05:
            preds.append(0)
        else:
            preds.append(1)
    return preds

vader_pred = vader_predict(texts)
vader_f1 = f1_score(y_test, vader_pred, average="macro")

print("VADER F1:", vader_f1)