In [2]:

import argparse
import csv
import math
import os
import random
from collections import Counter, defaultdict

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# ######################################
# Utilities: dataset & vocab

class SimpleVocab:
    def __init__(self, min_freq=0, reserved_tokens=None):
        self.min_freq = min_freq
        self.freqs = Counter() # frequency counter for tokens
        self.token_to_idx = {}
        self.idx_to_token = []
        if reserved_tokens is None:
            reserved_tokens = ["<pad>", "<unk>", "<cls>"]
        for t in reserved_tokens:
            self.add_token(t)
        self.reserved_tokens = reserved_tokens

    def add_token(self, t): # Dynamic token adder
        if t in self.token_to_idx:
            return # already added , return nothing?
        idx = len(self.idx_to_token) # if new token
        self.token_to_idx[t] = idx # token to idx is dict
        self.idx_to_token.append(t) # list, although why do we need a list ``\../``

    def add_sentence(self, sent_tokens):
        self.freqs.update(sent_tokens)

    def build_vocab(self):
        # add tokens meeting min_freq
        for token, freq in self.freqs.most_common():
            if freq >= self.min_freq and token not in self.token_to_idx:
                self.add_token(token)

    def __len__(self):
        return len(self.idx_to_token)

    def encode(self, tokens):
        unk = self.token_to_idx.get("<unk>")
        return [self.token_to_idx.get(t, unk) for t in tokens] # return token ids, if token unknown then return unknown, get does is if t not in token_to_idx then return unk

In [3]:

def simple_tokenize(text):
    # basic lowercase whitespace tokenizer 
    text = text.lower()
   
    for ch in [",", ".", ";", ":", "!", "?", "(", ")", "\"", "'"]:
        text = text.replace(ch, " ")
    tokens = text.strip().split()
    return tokens

In [4]:
sample_text=simple_tokenize("Hello, world! This is a test.")
print(sample_text)

import pandas as pd
df=pd.read_csv("new_shape_dataset.csv")

['hello', 'world', 'this', 'is', 'a', 'test']


In [5]:
sample_text=simple_tokenize("Hello, world! This is a test.")
k=df["sentence"][:4]
k=list(k)
sent=""
for i in k:
    sent=simple_tokenize(i)
# k.rows = list(zip(df["sentence"].str.strip(), df["type"], df["size"], df["stiffness"]))


print(sample_text)

['hello', 'world', 'this', 'is', 'a', 'test']


In [6]:
sentence,shape, size, stiff = df.columns[0], df.columns[1], df.columns[2], df.columns[3]

print(type(size), size, stiff)
print(df.columns[0])

<class 'str'> size stiffness
sentence


In [7]:
# example=list(df.loc[:5,"sentence"])
# print(example)
# vocab=SimpleVocab()
# for sent in example:
#     tokens = simple_tokenize(sent)
#     vocab.add_sentence(tokens)
#     vocab.add_sentence(sent)
# vocab.build_vocab()
# toks=example[0]
# j=vocab.encode(toks)

In [8]:
# print(len(vocab))

In [9]:
k=df["sentence"][:4].str.strip() # This line selects the first four rows of the "sentence" column, removes any leading or trailing whitespace, and assigns the result to the variable k.
# print(k)

k=[i for i in k]
print(k)
print(len(k))
type(df["type"][:4].values)

['this marble is giant and stiff', 'a soft round object was miniature', 'bulky and flexible describes the tube', 'the canister was solid and large']
4


numpy.ndarray

In [43]:

class MultiTaskTextDataset(Dataset):
    def __init__(self, csv_path, vocab=None, max_len=10, build_vocab=True): # max len is the contenxt size
        # read CSV
        self.df=pd.read_csv(csv_path)
        self.row,self.shape,self.size,self.stiff= self.df["sentence"],self.df["type"], self.df["size"], self.df["stiffness"]
        self.max_len = max_len
        self.vocab = vocab
        self.data=[]
   
        # build vocab if requested
        if vocab is None and build_vocab:
            self.vocab = SimpleVocab(min_freq=1)
            for text in self.row:
                toks = simple_tokenize(text)
                self.vocab.add_sentence(toks)
            self.vocab.build_vocab()
        elif vocab is None:
            raise ValueError("Provide vocab or set build_vocab=True")
        else:
            self.vocab = vocab

        # tokenize-> encode-> append values
        # self.data = []
        # print(f"Vocab size: {len(self.vocab)}")
        for text, shape, size, stiff in zip(self.row, self.shape, self.size, self.stiff):
            toks = simple_tokenize(text)
            enc = self.vocab.encode(toks)
            # Add CLS at start
            cls_idx = self.vocab.token_to_idx["<cls>"]
            enc = [cls_idx] + enc
            # pad/truncate
            if len(enc) < self.max_len:
                enc = enc + [self.vocab.token_to_idx["<pad>"]] * (self.max_len - len(enc))
            else:
                enc = enc[: self.max_len]
            # print(f"Encoded: {enc} shape: {shape} size: {size} stiffness: {stiff} ")
            try:
                self.data.append((
                    torch.tensor(enc, dtype=torch.long),
                    torch.tensor(shape-1, dtype=torch.long),
                    torch.tensor(size-1, dtype=torch.long),
                    torch.tensor(stiff-1, dtype=torch.long),
                ))
            except:
                pass

    def __len__(self):
        return len(self.data)
    
    def get_data(self):
        return self.data

    def __getitem__(self, idx):
        return self.data[idx]
    def get_vocab_size(self):
        return len(self.vocab)

In [38]:
df=pd.read_csv("new_shape_dataset.csv")


path = "new_shape_dataset.csv"
multi_dataset=MultiTaskTextDataset(path)
multi_dataset.get_vocab_size()

Vocab size: 110


110

In [12]:
j=multi_dataset.get_data()[0]
print(type(j))

<class 'tuple'>


In [13]:
x = torch.tensor([1, 2, 3, 4])
print(x.shape)
y=torch.unsqueeze(x, 0)
print(y.shape)
z=torch.unsqueeze(x, 1)
print(z.shape)
print(z)

torch.Size([4])
torch.Size([1, 4])
torch.Size([4, 1])
tensor([[1],
        [2],
        [3],
        [4]])


In [14]:


# Model: PosEncoding hi copilot are you there? 
# -------------------------
class PositionalEncoding(nn.Module): 
    def __init__(self, d_model, max_len=512): # max_len is what exactly -> context size?
        super().__init__()
        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()  # (max_len,1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer("pe", pe) # what does register buffer do
        # used to register a buffer that should not be considered a model parameter
        # persistant is across batches meaning, non persistant means it wont be a part of state_dict 

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return x


In [15]:

class VanillaTransformerMultiTask(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, hidden_dim,
                 num_shape, num_size, num_stiff, max_len=32, dropout=0.1):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=0) # what is padding_idx?
        # This module is often used to store word embeddings and retrieve them using indices. 
        # The input to the module is a list of indices, and the output is the corresponding word embeddings.
        # padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i
        self.pos_enc = PositionalEncoding(d_model, max_len=max_len)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,
                                                   dim_feedforward=hidden_dim,
                                                   dropout=dropout, activation="relu")
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # pooling: use token 0 (we placed <cls> at index 0) as pooled
        self.dense = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

        # output heads
        self.head_shape = nn.Linear(d_model, num_shape)
        self.head_size = nn.Linear(d_model, num_size)
        self.head_stiff = nn.Linear(d_model, num_stiff)

        self._init_weights()

    def _init_weights(self):

        #  nn.init.xavier_uniform_ sets the weights so that the variance is the same across layers, which helps with stable training.       
        # how stable? i dont know
        nn.init.xavier_uniform_(self.token_emb.weight)
        nn.init.xavier_uniform_(self.dense.weight)
        nn.init.xavier_uniform_(self.head_shape.weight)
        nn.init.xavier_uniform_(self.head_size.weight)
        nn.init.xavier_uniform_(self.head_stiff.weight)

    def forward(self, input_ids, src_key_padding_mask=None):
        # what is src_key_padding?
        # input_ids: (batch, seq_len)
        x = self.token_emb(input_ids)  # (batch, seq_len, d_model)
        x = self.pos_enc(x)  # x: (batch, seq_len, d_model)
        # transformer expects (seq_len, batch, d_model)
        x = x.transpose(0, 1)
        # src_key_padding_mask: (batch, seq_len) boolean: True for padded positions
        enc = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)  # (seq_len, batch, d_model)
        enc = enc.transpose(0, 1)  # (batch, seq_len, d_model)
        cls_token = enc[:, 0, :]  # (batch, d_model)
        pooled = torch.tanh(self.dense(cls_token)) # what does pooling do here? 
        pooled = self.dropout(pooled)
        out_shape = self.head_shape(pooled)
        out_size = self.head_size(pooled)
        out_stiff = self.head_stiff(pooled)
        return out_shape, out_size, out_stiff



In [16]:

# -------------------------
# Training & Eval helpers
# -------------------------

# what does it do 
def collate_batch(batch):

    # batch: list of tuples: (enc, w, b, g)
    encs = torch.stack([b[0] for b in batch], dim=0)
    shp = torch.stack([b[1] for b in batch], dim=0)
    sze = torch.stack([b[2] for b in batch], dim=0)
    stiff = torch.stack([b[3] for b in batch], dim=0)
    # padding mask: True where pad token (token idx 0) exists
    pad_mask = encs == 0
    return encs, pad_mask, shp, sze, stiff



def accuracy(preds, labels):
    return (preds.argmax(dim=1) == labels).float().mean().item()    


In [44]:
df=pd.read_csv("new_shape_dataset.csv")


path = "new_shape_dataset.csv"
sample=multi_dataset=MultiTaskTextDataset(path)
sample.get_vocab_size()
data=sample.get_data()
data


[(tensor([ 2, 22, 48,  7, 23,  4, 12,  0,  0,  0]),
  tensor(2),
  tensor(1),
  tensor(1)),
 (tensor([ 2,  5, 40, 80, 81,  3, 44,  0,  0,  0]),
  tensor(2),
  tensor(0),
  tensor(0)),
 (tensor([ 2, 53,  4, 31, 25,  6, 34,  0,  0,  0]),
  tensor(0),
  tensor(1),
  tensor(0)),
 (tensor([ 2,  6, 33,  3, 15,  4, 11,  0,  0,  0]),
  tensor(0),
  tensor(1),
  tensor(1)),
 (tensor([ 2,  6,  9, 59, 17, 12,  0,  0,  0,  0]),
  tensor(2),
  tensor(0),
  tensor(1)),
 (tensor([ 2, 10,  4, 18, 25,  6, 36,  0,  0,  0]),
  tensor(0),
  tensor(0),
  tensor(1)),
 (tensor([ 2,  5, 49, 37,  3,  9,  0,  0,  0,  0]),
  tensor(1),
  tensor(0),
  tensor(0)),
 (tensor([ 2, 22, 19,  7, 44,  4, 38,  0,  0,  0]),
  tensor(1),
  tensor(0),
  tensor(0)),
 (tensor([ 2,  6, 10, 51, 19, 17, 40,  0,  0,  0]),
  tensor(1),
  tensor(0),
  tensor(0)),
 (tensor([ 2,  5, 38, 59,  3, 44,  0,  0,  0,  0]),
  tensor(2),
  tensor(0),
  tensor(0)),
 (tensor([ 2, 20, 38, 10, 29,  3, 21,  0,  0,  0]),
  tensor(0),
  tensor(0),
  

In [None]:
# test collate_batch 
path="new_shape_dataset.csv"
df=pd.read_csv(path)
k=df[:5]
print(k)
dat=multi_dataset(path, vocab=None)
c=collate_batch(k)
print(c)

                                sentence  type  size  stiffness
0         this marble is giant and stiff     3     2          2
1      a soft round object was miniature     3     1          1
2  bulky and flexible describes the tube     1     2          1
3       the canister was solid and large     1     2          2
4          the little globe looked stiff     3     1          2


TypeError: expected Tensor as element 0 in argument 0, but got str

In [17]:

def train_epoch(model, dataloader, optim, device):
    model.train()
    total_loss = 0.0
    total_acc_shape = 0.0
    total_acc_size = 0.0
    total_acc_stiff = 0.0
    criterion = nn.CrossEntropyLoss()
    for encs, pad_mask, shp, sze, stiff in dataloader:
        encs = encs.to(device)
        pad_mask = pad_mask.to(device)
        shp = shp.to(device)
        sze = sze.to(device)
        stiff = stiff.to(device)
        # Check with PyTorch documentation
        optim.zero_grad()
        out_shape, out_size, out_stiff = model(encs, src_key_padding_mask=pad_mask)
        loss_shape = criterion(out_shape, shp)
        loss_size = criterion(out_size, sze)
        loss_stiff = criterion(out_stiff, stiff)
        loss = loss_shape + loss_size + loss_stiff
        loss.backward()
        optim.step()
        total_loss += loss.item() * encs.size(0)
        total_acc_shape += accuracy(out_shape.detach().cpu(), shp.detach().cpu()) * encs.size(0)
        total_acc_size += accuracy(out_size.detach().cpu(), sze.detach().cpu()) * encs.size(0)
        total_acc_stiff += accuracy(out_stiff.detach().cpu(), stiff.detach().cpu()) * encs.size(0)

    n = len(dataloader.dataset)
    return total_loss / n, total_acc_shape / n, total_acc_size / n, total_acc_stiff / n

In [18]:
#hi

def eval_epoch(model, dataloader, device):
    model.eval()
    total_loss = 0.0
    total_acc_shape = 0.0
    total_acc_size = 0.0
    total_acc_stiff = 0.0
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad():
        for encs, pad_mask, shp, size, stiff in dataloader:
            encs = encs.to(device)
            pad_mask = pad_mask.to(device)
            shp = shp.to(device)
            size = size.to(device)
            stiff = stiff.to(device)
            out_shape, out_size, out_stiff = model(encs, src_key_padding_mask=pad_mask)
            loss_shape = criterion(out_shape, shp)
            loss_size = criterion(out_size, size)
            loss_stiff = criterion(out_stiff, stiff)
            loss = loss_shape + loss_size + loss_stiff
            total_loss += loss.item() * encs.size(0)
            total_acc_shape += accuracy(out_shape.cpu(), shp.cpu()) * encs.size(0)
            total_acc_size += accuracy(out_size.cpu(), size.cpu()) * encs.size(0)
            total_acc_stiff += accuracy(out_stiff.cpu(), stiff.cpu()) * encs.size(0)
    n = len(dataloader.dataset)
    return total_loss / n, total_acc_shape / n, total_acc_size / n, total_acc_stiff / n

In [46]:

# -------------------------
# Main: parse args and run
# -------------------------
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Build vocab from training file
    vocab = SimpleVocab(min_freq=1)
    # first pass build vocab
    # with open(args.data_path, "r", encoding="utf-8") as f:
    #     reader = csv.DictReader(f)
    #     for r in reader:
    #         toks = simple_tokenize(r["text"].strip())
    #         vocab.add_sentence(toks)
    # vocab.build_vocab()
    # print("Vocab size:", len(vocab))

    dataset = MultiTaskTextDataset(args.data_path, vocab=None, max_len=args.max_len)
    # split
    train_size = int(len(dataset) * 0.8)
    val_size = len(dataset) - train_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))

    train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_batch)

    # what does this do ?
    # create model
    # num_shape = len(dataset.dataset.label_maps["shape_status"]) if isinstance(dataset, torch.utils.data.Subset) else len(dataset.label_maps["shape_status"])
    # num_size = len(dataset.dataset.label_maps["size_size"]) if isinstance(dataset, torch.utils.data.Subset) else len(dataset.label_maps["size_size"])
    # num_stiffness = len(dataset.dataset.label_maps["stiffness"]) if isinstance(dataset, torch.utils.data.Subset) else len(dataset.label_maps["stiffness"])
    # above logic handles Subset wrappers

    model = VanillaTransformerMultiTask(
        vocab_size=dataset.get_vocab_size(),
        d_model=args.d_model,
        nhead=args.nhead,
        num_layers=args.num_layers,
        hidden_dim=args.hidden_dim,
        num_shape=3, # no of output heads 
        num_size=2,
        num_stiff=2,
        max_len=args.max_len,
        dropout=0.1
    ).to(device)

    optim = torch.optim.Adam(model.parameters(), lr=args.lr)

    # training loop
    best_val_loss = float("inf") 
    # why is this
    for epoch in range(1, args.epochs + 1):
        print(f"Epoch {epoch}/{args.epochs}")
        train_loss, train_acc_w, train_acc_b, train_acc_g = train_epoch(model, train_loader, optim, device)
        val_loss, val_acc_w, val_acc_b, val_acc_g = eval_epoch(model, val_loader, device)
        print(f"Epoch {epoch} | Train loss {train_loss:.4f} | Val loss {val_loss:.4f}")
        print(f"  Train acc (shape/size/stiffness): {train_acc_w:.3f}/{train_acc_b:.3f}/{train_acc_g:.3f}")
        print(f"  Val   acc (shape/size/stiffness): {val_acc_w:.3f}/{val_acc_b:.3f}/{val_acc_g:.3f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                "model_state": model.state_dict(),
                "vocab": dataset.vocab.token_to_idx,
                # "label_maps": {
                #     "shape_status": dataset.dataset.label_maps["shape_status"] if isinstance(dataset, torch.utils.data.Subset) else dataset.label_maps["shape_status"],
                #     "size_size": dataset.dataset.label_maps["size_size"] if isinstance(dataset, torch.utils.data.Subset) else dataset.label_maps["size_size"],
                #     "stiffness": dataset.dataset.label_maps["stiffness"] if isinstance(dataset, torch.utils.data.Subset) else dataset.label_maps["stiffness"],
                # }
            }, args.save_path)
            print("Saved best model.")

    # sample inference
    model.eval()
    # load maps for convenience
    # label_maps = {
    #     "shape_status": (dataset.dataset.label_maps["shape_status"] if isinstance(dataset, torch.utils.data.Subset) else dataset.label_maps["shape_status"]),
    #     "size_size": (dataset.dataset.label_maps["size_size"] if isinstance(dataset, torch.utils.data.Subset) else dataset.label_maps["size_size"]),
    #     "stiffness": (dataset.dataset.label_maps["stiffness"] if isinstance(dataset, torch.utils.data.Subset) else dataset.label_maps["stiffness"]),
    # }
    # inv_maps = {k: {v: kk for kk, v in label_maps[k].items()} for k in label_maps}

    # def predict_text(text):
    #     toks = simple_tokenize(text)
    #     enc = vocab.encode(toks)
    #     cls_idx = vocab.token_to_idx["<cls>"]
    #     enc = [cls_idx] + enc
    #     if len(enc) < args.max_len:
    #         enc = enc + [vocab.token_to_idx["<pad>"]] * (args.max_len - len(enc))
    #     else:
    #         enc = enc[: args.max_len]
    #     enc_t = torch.tensor([enc], dtype=torch.long).to(device)
    #     pad_mask = (enc_t == vocab.token_to_idx["<pad>"])
    #     with torch.no_grad():
    #         out_w, out_b, out_g = model(enc_t, src_key_padding_mask=pad_mask)
    #         pw = out_w.argmax(dim=1).item()
    #         pb = out_b.argmax(dim=1).item()
    #         pg = out_g.argmax(dim=1).item()
    #     return inv_maps["shape_status"].get(pw, "UNK"), inv_maps["size_size"].get(pb, "UNK"), inv_maps["stiffness"].get(pg, "UNK")

    # quick demos 
    # samples = [
    #     "a tall girl who is poor was going to the market",
    #     "fat man is rich",
    #     "thin woman is poor"
    # ]
    # print("\nSample predictions:")
    # for s in samples:
    #     print(s, "->", predict_text(s))

from types import SimpleNamespace

# Manually set all args as attributes
args = SimpleNamespace(
    data_path="new_shape_dataset.csv",   # path to your CSV
    max_len=32,
    d_model=128,
    nhead=4,
    num_layers=2,
    hidden_dim=256,
    batch_size=16,
    epochs=10,
    lr=1e-3,
    save_path="best_vanilla_transformer.pt"
)

main(args)




Epoch 1/10
Epoch 1 | Train loss 2.7208 | Val loss 2.5230
  Train acc (shape/size/stiffness): 0.348/0.516/0.540
  Val   acc (shape/size/stiffness): 0.333/0.532/0.577
Saved best model.
Epoch 2/10
Epoch 2 | Train loss 2.4789 | Val loss 1.9142
  Train acc (shape/size/stiffness): 0.411/0.600/0.567
  Val   acc (shape/size/stiffness): 0.514/0.797/0.824
Saved best model.
Epoch 3/10
Epoch 3 | Train loss 1.6932 | Val loss 1.3416
  Train acc (shape/size/stiffness): 0.536/0.845/0.828
  Val   acc (shape/size/stiffness): 0.595/0.865/0.838
Saved best model.
Epoch 4/10
Epoch 4 | Train loss 0.9964 | Val loss 0.5172
  Train acc (shape/size/stiffness): 0.719/0.947/0.921
  Val   acc (shape/size/stiffness): 0.851/0.991/0.977
Saved best model.
Epoch 5/10
Epoch 5 | Train loss 0.4293 | Val loss 0.2182
  Train acc (shape/size/stiffness): 0.901/0.985/0.971
  Val   acc (shape/size/stiffness): 0.932/1.000/0.995
Saved best model.
Epoch 6/10
Epoch 6 | Train loss 0.1889 | Val loss 0.1073
  Train acc (shape/size/stif

In [None]:
# make predict text function

In [None]:
output = torch.randn(5, 3)  # batch_size=5, num_classes=3
target = torch.tensor([0, 1, 2, 3, 1])  # <-- 3 is invalid!
# loss = nn.CrossEntropyLoss()(output, target)  # IndexError!