In [33]:
import pandas as pd
df = pd.read_csv('train.tsv', sep='\t', header=0, quoting=3)
device='cuda'
#device='cpu'

In [34]:
df.head(5)

Unnamed: 0,id,text,label
0,eng_train0,I supported Barack Obama. I thought it was abs...,0
1,eng_train1,what to hell with that!,1
2,eng_train2,"and the stupidity of the haters continues, thi...",1
3,eng_train3,Alberta has been in debt under the Conservativ...,0
4,eng_train4,"The TV is in Channel Search mode, and I have p...",0


In [35]:
import nltk
from nltk.tokenize import wordpunct_tokenize
nltk.download('punkt', quiet=True)

df['tokens'] = df['text'].apply(lambda x: [token.lower() for token in wordpunct_tokenize(x)])

In [36]:
df.head()
print(df['label'].value_counts())

label
0    62530
1    36470
Name: count, dtype: int64


In [37]:
from collections import Counter

# Step 1: Build a vocabulary
all_tokens = [token for tokens in df['tokens'] for token in tokens]
vocab = {token: idx+2 for idx, (token, _) in enumerate(Counter(all_tokens).items())}  # idx+2 to reserve 0 for padding and 1 for unknown
vocab['<PAD>'] = 0  # Padding token
vocab['<UNK>'] = 1  # Unknown token

# Step 2: Convert tokens into indices
def tokens_to_indices(tokens):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

df['token_indices'] = df['tokens'].apply(tokens_to_indices)

In [38]:
df.head(5)

Unnamed: 0,id,text,label,tokens,token_indices
0,eng_train0,I supported Barack Obama. I thought it was abs...,0,"[i, supported, barack, obama, ., i, thought, i...","[2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 11, 12, 13, 14..."
1,eng_train1,what to hell with that!,1,"[what, to, hell, with, that, !]","[84, 38, 85, 86, 13, 43]"
2,eng_train2,"and the stupidity of the haters continues, thi...",1,"[and, the, stupidity, of, the, haters, continu...","[11, 87, 88, 35, 87, 89, 90, 21, 91, 92, 93, 9..."
3,eng_train3,Alberta has been in debt under the Conservativ...,0,"[alberta, has, been, in, debt, under, the, con...","[112, 113, 114, 115, 116, 117, 87, 14, 21, 11,..."
4,eng_train4,"The TV is in Channel Search mode, and I have p...",0,"[the, tv, is, in, channel, search, mode, ,, an...","[87, 129, 94, 115, 130, 131, 132, 21, 11, 2, 1..."


## Transformer

In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
class TramsformerDataset(Dataset):
    def __init__(self, df, vocab):
        self.labels = torch.tensor(df["label"].values, dtype=torch.long)
        self.token_indices = [torch.tensor(indices, dtype=torch.long) for indices in df["token_indices"].values]
        self.max_length = max(len(indices) for indices in self.token_indices) if self.token_indices else 0
        self.vocab = vocab

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        indices = self.token_indices[idx]
        if len(indices) < self.max_length:
            padding = torch.zeros(self.max_length - len(indices), dtype=torch.long)
            padded_indices = torch.cat([indices, padding])
        else:
            padded_indices = indices[:self.max_length]
        return padded_indices, self.labels[idx]

def collate_fn_transformer(batch):
    indices, labels = zip(*batch)
    padded_indices = torch.stack(indices)
    labels = torch.stack(labels)
    mask = (padded_indices != vocab['<PAD>']).long()
    return padded_indices, labels,mask

class_counts = df['label'].value_counts()
total_samples = len(df)
class_weights = torch.tensor([total_samples / (len(class_counts) * count) for count in class_counts])
sample_weights = df['label'].map(lambda label: class_weights[label]).tolist()
sampler = torch.utils.data.WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)


dataset = TramsformerDataset(df, vocab)
dataloader = DataLoader(dataset, batch_size=64, collate_fn=collate_fn_transformer, sampler=sampler)

# Iterate through the DataLoader to check the output
for padded_embeddings, labels,mask in dataloader:
    print(padded_embeddings.shape)
    print("Mask Shape:", mask.shape)
    print(labels)
    break

torch.Size([64, 745])
Mask Shape: torch.Size([64, 745])
tensor([0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
        1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1])


In [48]:
class Encoder(nn.Module):
    def __init__(self, embedding_dim, n_heads=10, hidden_size=64, dropout=0.1):
        super().__init__()
        if embedding_dim % n_heads != 0:
            raise ValueError("embedding_dim must be divisible by n_heads")
        self.attention=nn.MultiheadAttention(embedding_dim,n_heads,batch_first=True)
        self.feedforward=nn.Sequential(
            nn.Linear(embedding_dim,hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size,hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size,embedding_dim)
        )
        self.norm=nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self,x,mask):
        key_padding_mask = (mask == 0)
        attention,_=self.attention(x,x,x,key_padding_mask=key_padding_mask )
        attention_skip=self.dropout(attention)+x
        x=self.norm(attention_skip)
        out=self.feedforward(x)
        out=self.norm(out+x)
        return(out)

class Encoder_Only_Transformer(nn.Module):
    def __init__(self,vocab_size, embedding_dim, n_heads=10, hidden_size=64, dropout=0.1,num_classes=2,num_blocks=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])
        self.feedforward_classification=nn.Sequential(
                  nn.Linear(embedding_dim,hidden_size),
                  nn.ReLU(),
                  nn.Linear(hidden_size,num_classes)
              )
        self.blocks = nn.ModuleList([
            Encoder(embedding_dim, n_heads, hidden_size, dropout) for _ in range(num_blocks)
        ])
        self.feedforward_classification = nn.Sequential(
            nn.Linear(embedding_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes)
        )
    def forward(self, x, mask):
        x=self.embedding(x)
        for block in self.blocks:
            x=block(x, mask)
        x=x.mean(dim=1)
        out = self.feedforward_classification(x)
        return out

In [41]:
!pip install tqdm



In [49]:
from tqdm import tqdm
vocab_size = len(vocab)
embedding_dim = 100
encoder = Encoder_Only_Transformer(vocab_size, embedding_dim).to(device)
EPOCHS = 50
num_batches=100
optimizer = optim.Adam(encoder.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
for epoch in range(EPOCHS):
    avg_loss = 0
    for i, (embeddings, labels, mask) in tqdm(enumerate(dataloader), total=num_batches):
        if i >= num_batches:
          break
        optimizer.zero_grad()
        embeddings=embeddings.to(device)
        labels=labels.to(device)
        mask=mask.to(device)
        logits = encoder(embeddings, mask.bool())
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()
    print(f"Epoch: {epoch+1}, Loss: {avg_loss / num_batches}")
torch.save(encoder.state_dict(), 'encoder_transformer_model.pth')

100%|██████████| 100/100 [00:35<00:00,  2.78it/s]


Epoch: 1, Loss: 0.6494885486364365


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 2, Loss: 0.5682186102867126


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 3, Loss: 0.4375973942875862


100%|██████████| 100/100 [00:35<00:00,  2.79it/s]


Epoch: 4, Loss: 0.370524537563324


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 5, Loss: 0.32430401638150214


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 6, Loss: 0.2962942272424698


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 7, Loss: 0.29092088252305986


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 8, Loss: 0.2625061845034361


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 9, Loss: 0.2489636876434088


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 10, Loss: 0.23579737462103367


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 11, Loss: 0.24580685257911683


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 12, Loss: 0.23206547804176808


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 13, Loss: 0.22170166790485382


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 14, Loss: 0.2127931197732687


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 15, Loss: 0.19177718430757523


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 16, Loss: 0.20085165455937384


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 17, Loss: 0.20336362637579442


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 18, Loss: 0.18822205655276775


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 19, Loss: 0.17815873440355062


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 20, Loss: 0.19025542959570885


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 21, Loss: 0.1817887732759118


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 22, Loss: 0.17660594649612904


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 23, Loss: 0.167880953643471


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 24, Loss: 0.16866142474114895


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 25, Loss: 0.16087500609457492


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 26, Loss: 0.15115550868213176


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 27, Loss: 0.15782377623021604


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 28, Loss: 0.15137574166059495


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 29, Loss: 0.15493139423429966


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 30, Loss: 0.1536564962938428


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 31, Loss: 0.15634785424917935


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 32, Loss: 0.14032429784536363


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 33, Loss: 0.14023678414523602


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 34, Loss: 0.13685581875965


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 35, Loss: 0.14741635143756868


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 36, Loss: 0.136643044129014


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 37, Loss: 0.13332249145954847


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 38, Loss: 0.12722919568419455


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 39, Loss: 0.12977641815319657


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 40, Loss: 0.13570523481816055


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 41, Loss: 0.13227462463080883


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 42, Loss: 0.13632904279977084


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 43, Loss: 0.12491402413696051


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 44, Loss: 0.12304112985730171


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 45, Loss: 0.1295125909894705


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 46, Loss: 0.11279180373996496


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 47, Loss: 0.1232259907014668


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 48, Loss: 0.11898355409502984


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Epoch: 49, Loss: 0.10881261564791203


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]

Epoch: 50, Loss: 0.1104614345356822





In [50]:
dev_df = pd.read_csv('dev.tsv', sep='\t', header=0, quoting=3)
dev_df['tokens'] = dev_df['text'].apply(lambda x: [token.lower() for token in wordpunct_tokenize(x)])
dev_df['token_indices'] = dev_df['tokens'].apply(tokens_to_indices)
dataset=TramsformerDataset(dev_df, vocab)
dev_dataloader = DataLoader(dataset, batch_size=64, collate_fn=collate_fn_transformer)
test_model=Encoder_Only_Transformer(vocab_size, embedding_dim).to(device)
#test_model.load_state_dict(torch.load('encoder_transformer_model.pth', map_location=torch.device('cpu')))
test_model.load_state_dict(torch.load('encoder_transformer_model.pth'))
test_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for embeddings, labels,mask in tqdm(dev_dataloader):
        embeddings=embeddings.to(device)
        labels=labels.to(device)
        mask=mask.to(device)
        logits = test_model(embeddings, mask)
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = correct / total
print(f"Test Accuracy: {accuracy}")

  test_model.load_state_dict(torch.load('encoder_transformer_model.pth'))
100%|██████████| 207/207 [00:13<00:00, 15.55it/s]

Test Accuracy: 0.8765151515151515



