In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset, random_split, ConcatDataset, Dataset
from tqdm import tqdm

from torch.nn.utils.rnn import pad_sequence

from transformers import BertTokenizer, BertModel

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [3]:
# Load dataset
dataset = load_dataset('dair-ai/emotion',trust_remote_code=True)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
dataset['train'][1]

{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'label': 0}

Architecture

d_model or d_emb is embeding dimention or vector dimention

d_k dimention of key or quary or value (all three are same)

In [6]:
class selfAttention(nn.Module):
    def __init__(self, emb_size):
        super(selfAttention,self).__init__()
        self.emb_size=emb_size
        self.w_k=nn.Linear(emb_size,emb_size)
        self.w_q=nn.Linear(emb_size,emb_size)
        self.w_v=nn.Linear(emb_size,emb_size)
        self.out=nn.Linear(emb_size,emb_size)


    def forward(self,k,q,v,mask):
        K=self.w_k(k)
        Q=self.w_q(q)
        V=self.w_v(v)
        attention=(torch.matmul(Q,K.transpose(-2,-1)))/torch.tensor(self.emb_size**0.5)

        if mask is not None:
            attention.masked_fill_(mask==0, -1e6)

        attention_scores=F.softmax(attention, dim=-1)
        output=torch.matmul(attention_scores,V)
        output=self.out(output)

        return output

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size, heads):
        super(MultiHeadAttention,self).__init__()
        self.heads=heads
        self.emb_size=emb_size
        self.head_dim=self.emb_size//self.heads
        self.w_k=nn.Linear(emb_size,emb_size)
        self.w_q=nn.Linear(emb_size,emb_size)
        self.w_v=nn.Linear(emb_size,emb_size)
        self.out=nn.Linear(emb_size,emb_size)

        assert(self.head_dim * heads == emb_size),"embeding size is not divisible by number of heads"


    def forward(self,k,q,v,mask=None):
        N=q.shape[0]  # batch size
        K=self.w_k(k)
        Q=self.w_q(q)
        V=self.w_v(v)

        K=K.view(N,K.shape[1],self.heads,self.head_dim).transpose(1,2)    # (batch size, sequence len, heads, head dimention)
        Q=Q.view(N,Q.shape[1],self.heads,self.head_dim).transpose(1,2)    # transposed to give(batch size, heads, sequence len, head dimention)
        V=V.view(N,V.shape[1],self.heads,self.head_dim).transpose(1,2)

        attention=(torch.matmul(Q,K.transpose(-2,-1)))/torch.tensor(self.head_dim**0.5)
        
        if mask is not None:
            mask=mask.reshape(-1,1,1,128)
            attention.masked_fill_(mask==0, -1e9)

        attention_scores=F.softmax(attention, dim=-1)
        output=torch.matmul(attention_scores,V)
        output = output.transpose(1, 2).reshape(N, -1, self.emb_size)
        output=self.out(output)

        return output

In [8]:
def pos_embedding(seq_len, emb_size, n=10000):
    P = np.zeros((seq_len, emb_size))
    for pos in range(seq_len):
        for i in range(emb_size // 2):
            denominator = np.power(n, 2 * i / emb_size)
            P[pos, 2 * i] = np.sin(pos / denominator)
            P[pos, 2 * i + 1] = np.cos(pos / denominator)
    return torch.tensor(P, dtype=torch.float32)

In [9]:
class Encoder(nn.Module):
    def __init__(self, heads, emb_size):
        super(Encoder, self).__init__()
        self.mha=MultiHeadAttention(emb_size, heads)
        self.ff1=nn.Linear(emb_size,2*emb_size)
        self.ff2=nn.Linear(2*emb_size, emb_size)
        self.norm1=nn.LayerNorm(emb_size)
        self.norm2=nn.LayerNorm(emb_size)
        self.dropout=nn.Dropout(p=0.2)

    def forward(self, x, mask=None):
        attention_out=self.mha(x,x,x,mask)
        attention_out = self.dropout(attention_out)
        out1=self.norm1(x+attention_out)

        ff_out=F.relu(self.ff1(out1))
        ff_out=self.ff2(ff_out)
        out2=self.dropout(ff_out)
        encoder_out=self.norm2(out1+out2)
        return encoder_out

In [10]:
class Decoder(nn.Module):
    def __init__(self, heads, emb_size):
        super(Decoder, self).__init__()
        self.mmha=MultiHeadAttention(emb_size, heads)
        self.mha=MultiHeadAttention(emb_size, heads)
        self.ff1=nn.Linear(emb_size,2*emb_size)
        self.ff2=nn.Linear(2*emb_size, emb_size)
        self.norm1=nn.LayerNorm(emb_size)
        self.norm2=nn.LayerNorm(emb_size)
        self.norm3=nn.LayerNorm(emb_size)
        self.dropout=nn.Dropout(p=0.2)

    def forward(self, x, encoder_out, source_mask, target_mask):
        mask_attention_out=self.mmha(x,x,x,target_mask)
        mask_attention_out=self.dropout(mask_attention_out)
        out1=self.norm1(x+mask_attention_out)

        enc_dec_attention_out=self.mha(encoder_out,out1,encoder_out)
        enc_dec_attention_out=self.dropout(enc_dec_attention_out)
        out2=self.norm2(out1+enc_dec_attention_out)

        ff_output=F.relu(self.ff1(out2))
        ff_output=self.ff2(ff_output)
        ff_output=self.dropout(ff_output)
        out3=self.norm3(out2+ff_output)

        return out3

In [11]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, input_dim, emb_size, num_encoder_layers, num_decoder_layers, heads, seq_len, out_class):
        super(Transformer, self).__init__()
        
        self.embedding=nn.Embedding(vocab_size, emb_size)
        self.encoder_layers = nn.ModuleList([Encoder(heads, emb_size) for _ in range(num_encoder_layers)])
        self.decoder_layers = nn.ModuleList([Decoder(heads, emb_size) for _ in range(num_decoder_layers)])
        self.position_encodings = pos_embedding(seq_len, emb_size)
        self.linear = nn.Linear(emb_size, out_class)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):

        src=self.embedding(src)   # need to do *sqrt(d_model)
        tgt=self.embedding(tgt)   # for transulation task give different embedding

        for encoder in self.encoder_layers:
            src = encoder(src, src_mask)
        
        for decoder in self.decoder_layers:
            tgt = decoder(tgt, src, src_mask, tgt_mask)

        output = self.linear(tgt[:, -1, :])
        output = F.softmax(output, dim=-1)

        return output
    


input_dim = 1000
emb_size = 512
heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
num_classes = 6


In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
'''# Preprocess data
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)

# Split dataset
train_data = dataset['train'].map(preprocess_function, batched=True)
test_data = dataset['test'].map(preprocess_function, batched=True)

# Create data loaders
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=16)
'''


"# Preprocess data\ndef preprocess_function(examples):\n    return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)\n\n# Split dataset\ntrain_data = dataset['train'].map(preprocess_function, batched=True)\ntest_data = dataset['test'].map(preprocess_function, batched=True)\n\n# Create data loaders\ntrain_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)\ntest_dataloader = DataLoader(test_data, batch_size=16)\n"

In [14]:
'''class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = self.tokenizer(
            item['text'], 
            padding='max_length', 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)  # Remove batch dimension
        attention_mask = encoding['attention_mask'].squeeze(0)  # Remove batch dimension
        label = torch.tensor(item['label'], dtype=torch.long)
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

def collate_fn(batch):
    input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True, padding_value=0)
    attention_mask = pad_sequence([item['attention_mask'] for item in batch], batch_first=True, padding_value=0)
    labels = torch.tensor([item['label'] for item in batch], dtype=torch.long)
    
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': labels}'''



"class CustomDataset(Dataset):\n    def __init__(self, data, tokenizer, max_length):\n        self.data = data\n        self.tokenizer = tokenizer\n        self.max_length = max_length\n\n    def __len__(self):\n        return len(self.data)\n\n    def __getitem__(self, idx):\n        item = self.data[idx]\n        encoding = self.tokenizer(\n            item['text'], \n            padding='max_length', \n            truncation=True, \n            max_length=self.max_length, \n            return_tensors='pt'\n        )\n        input_ids = encoding['input_ids'].squeeze(0)  # Remove batch dimension\n        attention_mask = encoding['attention_mask'].squeeze(0)  # Remove batch dimension\n        label = torch.tensor(item['label'], dtype=torch.long)\n        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}\n\ndef collate_fn(batch):\n    input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True, padding_value=0)\n    attention_mask =

In [15]:
'''tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
max_length = 128  # Or any other desired max length

train_dataset = CustomDataset(train_data, tokenizer, max_length)
test_dataset = CustomDataset(test_data, tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)'''


"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\nmax_length = 128  # Or any other desired max length\n\ntrain_dataset = CustomDataset(train_data, tokenizer, max_length)\ntest_dataset = CustomDataset(test_data, tokenizer, max_length)\n\ntrain_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn)\ntest_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)"

In [16]:
'''model = Transformer(input_dim, emb_size, num_encoder_layers, num_decoder_layers, heads, seq_len=128, out_class=num_classes)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)'''

'model = Transformer(input_dim, emb_size, num_encoder_layers, num_decoder_layers, heads, seq_len=128, out_class=num_classes)\nloss_fn = nn.CrossEntropyLoss()\noptimizer = optim.Adam(model.parameters(), lr=3e-5)'

In [17]:
'''from prettytable import PrettyTable
def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params+=params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
count_parameters(model)'''

'from prettytable import PrettyTable\ndef count_parameters(model):\n    table = PrettyTable(["Modules", "Parameters"])\n    total_params = 0\n    for name, parameter in model.named_parameters():\n        if not parameter.requires_grad: continue\n        params = parameter.numel()\n        table.add_row([name, params])\n        total_params+=params\n    print(table)\n    print(f"Total Trainable Params: {total_params}")\n    return total_params\ncount_parameters(model)'

In [18]:
'''# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        outputs = model(batch['input_ids'], batch['attention_mask'])
        loss = loss_fn(outputs, batch['label'])
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for batch in test_dataloader:
            outputs = model(batch['input_ids'], batch['attention_mask'])
            _, predicted = torch.max(outputs, dim=1)
            total += batch['label'].size(0)
            correct += (predicted == batch['label']).sum().item()

    accuracy = correct / total
    print(f'Epoch {epoch + 1}, Accuracy: {accuracy:.4f}')'''


"# Training loop\nnum_epochs = 3\nfor epoch in range(num_epochs):\n    model.train()\n    for batch in train_dataloader:\n        optimizer.zero_grad()\n        outputs = model(batch['input_ids'], batch['attention_mask'])\n        loss = loss_fn(outputs, batch['label'])\n        loss.backward()\n        optimizer.step()\n\n    # Evaluation\n    model.eval()\n    total, correct = 0, 0\n    with torch.no_grad():\n        for batch in test_dataloader:\n            outputs = model(batch['input_ids'], batch['attention_mask'])\n            _, predicted = torch.max(outputs, dim=1)\n            total += batch['label'].size(0)\n            correct += (predicted == batch['label']).sum().item()\n\n    accuracy = correct / total\n    print(f'Epoch {epoch + 1}, Accuracy: {accuracy:.4f}')"

In [27]:
dataset['train'][1]

{'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'label': 0}

In [19]:
# Preprocess data
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

# Split dataset
train_data = dataset['train'].map(preprocess_function, batched=True) 
test_data = dataset['test'].map(preprocess_function, batched=True)


from torch.utils.data import DataLoader, Dataset

class EmotionDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.data[idx]['input_ids']),
            'attention_mask': torch.tensor(self.data[idx]['attention_mask']),
            'label': torch.tensor(self.data[idx]['label'])
        }

def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['label'] for item in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': labels}

train_dataset = EmotionDataset(train_data)
test_dataset = EmotionDataset(test_data)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)#, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16)#, collate_fn=collate_fn)

model = Transformer(tokenizer.vocab_size, input_dim, emb_size, num_encoder_layers, num_decoder_layers, heads, seq_len=128, out_class=num_classes)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)

if torch.cuda.is_available(): 
    model = model.to('cuda')    #transfer model to GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model

Transformer(
  (embedding): Embedding(30522, 512)
  (encoder_layers): ModuleList(
    (0-5): 6 x Encoder(
      (mha): MultiHeadAttention(
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (out): Linear(in_features=512, out_features=512, bias=True)
      )
      (ff1): Linear(in_features=512, out_features=1024, bias=True)
      (ff2): Linear(in_features=1024, out_features=512, bias=True)
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x Decoder(
      (mmha): MultiHeadAttention(
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_f

In [20]:
tokenizer.vocab_size

30522

In [43]:
example={'text': 'im sad'}
ex=preprocess_function(example)
ex=EmotionDataset(example)
exa=DataLoader(test_dataset,batch_size=1)
first_item1=next(iter(exa))
if torch.cuda.is_available():
    first_item1['input_ids'], first_item1['input_ids'], first_item1['attention_mask']=first_item1['input_ids'].to(device), first_item1['input_ids'].to(device), first_item1['attention_mask'].to(device)
model(first_item1['input_ids'], first_item1['input_ids'], first_item1['attention_mask'])

tensor([[9.9999e-01, 2.3023e-06, 3.4305e-06, 3.9553e-06, 2.5020e-07, 3.8796e-06]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [39]:
first_item = next(iter(train_dataloader))
if torch.cuda.is_available():
    first_item['input_ids'], first_item['input_ids'], first_item['attention_mask']=first_item['input_ids'].to(device), first_item['input_ids'].to(device), first_item['attention_mask'].to(device)
model.eval()
model(first_item['input_ids'], first_item['input_ids'], first_item['attention_mask'])

tensor([[5.6345e-02, 1.1440e-03, 1.7689e-04, 5.7809e-01, 3.6413e-01, 1.1578e-04],
        [9.9998e-01, 2.9373e-06, 3.7329e-06, 3.9547e-06, 2.2876e-07, 4.2140e-06],
        [9.9998e-01, 1.0697e-06, 2.6520e-06, 1.0821e-05, 4.1099e-07, 2.8447e-06],
        [9.9998e-01, 4.6575e-06, 3.9846e-06, 2.8538e-06, 2.1702e-07, 4.4364e-06],
        [5.7908e-01, 2.4149e-05, 7.7338e-05, 4.1753e-01, 3.2243e-03, 6.4226e-05],
        [1.4327e-05, 9.9998e-01, 2.7647e-06, 3.0323e-07, 3.1619e-06, 1.8826e-06],
        [9.9999e-01, 1.9794e-06, 3.3971e-06, 5.2014e-06, 2.4658e-07, 3.9246e-06],
        [1.6480e-05, 9.9998e-01, 2.8080e-06, 3.2522e-07, 2.7632e-06, 1.8699e-06],
        [1.7811e-05, 9.9997e-01, 3.3069e-06, 3.4454e-07, 2.3666e-06, 2.3356e-06],
        [7.5162e-06, 9.9998e-01, 1.6360e-06, 3.7525e-07, 4.4812e-06, 1.1284e-06],
        [9.9998e-01, 1.5245e-06, 3.2256e-06, 6.8325e-06, 2.7111e-07, 3.4376e-06],
        [9.9999e-01, 2.5351e-06, 3.3165e-06, 4.8161e-06, 2.3028e-07, 3.7619e-06],
        [9.9998e

In [22]:
# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        if torch.cuda.is_available():
            batch['input_ids'], batch['input_ids'], batch['attention_mask'] , batch['label'] = batch['input_ids'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)
        outputs = model(batch['input_ids'], tgt=batch['input_ids'], src_mask=batch['attention_mask'])
        loss = loss_fn(outputs, batch['label'])
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            if torch.cuda.is_available():
                batch['input_ids'], batch['input_ids'], batch['attention_mask'] , batch['label'] = batch['input_ids'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)
            outputs = model(batch['input_ids'], tgt=batch['input_ids'], src_mask=batch['attention_mask'])
            _, predicted = torch.max(outputs, dim=1)
            total += batch['label'].size(0)
            correct += (predicted == batch['label']).sum().item()

    accuracy = correct / total
    print(f'Epoch {epoch + 1}, Accuracy: {accuracy:.4f}')

100%|██████████| 1000/1000 [05:58<00:00,  2.79it/s]
100%|██████████| 125/125 [00:18<00:00,  6.94it/s]


Epoch 1, Accuracy: 0.5295


100%|██████████| 1000/1000 [06:07<00:00,  2.72it/s]
100%|██████████| 125/125 [00:16<00:00,  7.70it/s]


Epoch 2, Accuracy: 0.5960


100%|██████████| 1000/1000 [05:42<00:00,  2.92it/s]
100%|██████████| 125/125 [00:18<00:00,  6.77it/s]

Epoch 3, Accuracy: 0.6810





In [24]:
val_data = dataset['validation'].map(preprocess_function, batched=True)
val_dataset = EmotionDataset(val_data)
val_dataloader = DataLoader(val_dataset, batch_size=16)#, collate_fn=collate_fn)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [25]:
model.eval()
total, correct = 0, 0
with torch.no_grad():
    for batch in tqdm(val_dataloader):
        if torch.cuda.is_available():
            batch['input_ids'], batch['input_ids'], batch['attention_mask'] , batch['label'] = batch['input_ids'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)
        outputs = model(batch['input_ids'], tgt=batch['input_ids'], src_mask=batch['attention_mask'])
        _, predicted = torch.max(outputs, dim=1)
        total += batch['label'].size(0)
        correct += (predicted == batch['label']).sum().item()

accuracy = correct / total
print(f'Epoch {epoch + 1}, Accuracy: {accuracy:.4f}')

100%|██████████| 125/125 [00:17<00:00,  7.14it/s]

Epoch 3, Accuracy: 0.6500





In [23]:
# Display a sample before and after preprocessing
sample = dataset['train'][2]
print("Original Sample:", sample)

# Apply preprocessing
processed_sample = preprocess_function(sample)
print("Processed Sample:", processed_sample)

Original Sample: {'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3}
Processed Sample: {'input_ids': [101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505, 3308, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

final test
im aging very fast   fear or sadness