In [1]:
import pandas as pd
df = pd.read_csv('train.tsv', sep='\t', header=0, quoting=3)
device='cpu'
#device='cpu'

In [2]:
df.head(5)

Unnamed: 0,id,text,label
0,eng_train0,I supported Barack Obama. I thought it was abs...,0
1,eng_train1,what to hell with that!,1
2,eng_train2,"and the stupidity of the haters continues, thi...",1
3,eng_train3,Alberta has been in debt under the Conservativ...,0
4,eng_train4,"The TV is in Channel Search mode, and I have p...",0


In [3]:
import nltk
from nltk.tokenize import wordpunct_tokenize
nltk.download('punkt', quiet=True)

df['tokens'] = df['text'].apply(lambda x: [token.lower() for token in wordpunct_tokenize(x)])

In [4]:
df.head()
print(df['label'].value_counts())

label
0    62530
1    36470
Name: count, dtype: int64


In [5]:
from collections import Counter

# Step 1: Build a vocabulary
all_tokens = [token for tokens in df['tokens'] for token in tokens]
vocab = {token: idx+2 for idx, (token, _) in enumerate(Counter(all_tokens).items())}  # idx+2 to reserve 0 for padding and 1 for unknown
vocab['<PAD>'] = 0  # Padding token
vocab['<UNK>'] = 1  # Unknown token

# Step 2: Convert tokens into indices
def tokens_to_indices(tokens):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

df['token_indices'] = df['tokens'].apply(tokens_to_indices)

In [6]:
df.head(5)

Unnamed: 0,id,text,label,tokens,token_indices
0,eng_train0,I supported Barack Obama. I thought it was abs...,0,"[i, supported, barack, obama, ., i, thought, i...","[2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 11, 12, 13, 14..."
1,eng_train1,what to hell with that!,1,"[what, to, hell, with, that, !]","[84, 38, 85, 86, 13, 43]"
2,eng_train2,"and the stupidity of the haters continues, thi...",1,"[and, the, stupidity, of, the, haters, continu...","[11, 87, 88, 35, 87, 89, 90, 21, 91, 92, 93, 9..."
3,eng_train3,Alberta has been in debt under the Conservativ...,0,"[alberta, has, been, in, debt, under, the, con...","[112, 113, 114, 115, 116, 117, 87, 14, 21, 11,..."
4,eng_train4,"The TV is in Channel Search mode, and I have p...",0,"[the, tv, is, in, channel, search, mode, ,, an...","[87, 129, 94, 115, 130, 131, 132, 21, 11, 2, 1..."


## Transformer

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
class TramsformerDataset(Dataset):
    def __init__(self, df, vocab):
        self.labels = torch.tensor(df["label"].values, dtype=torch.long)
        self.token_indices = [torch.tensor(indices, dtype=torch.long) for indices in df["token_indices"].values]
        self.max_length = max(len(indices) for indices in self.token_indices) if self.token_indices else 0
        self.vocab = vocab

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        indices = self.token_indices[idx]
        if len(indices) < self.max_length:
            padding = torch.zeros(self.max_length - len(indices), dtype=torch.long)
            padded_indices = torch.cat([indices, padding])
        else:
            padded_indices = indices[:self.max_length]
        return padded_indices, self.labels[idx]

def collate_fn_transformer(batch):
    indices, labels = zip(*batch)
    padded_indices = torch.stack(indices)
    labels = torch.stack(labels)
    mask = (padded_indices != vocab['<PAD>']).long()
    return padded_indices, labels,mask

class_counts = df['label'].value_counts()
total_samples = len(df)
class_weights = torch.tensor([total_samples / (len(class_counts) * count) for count in class_counts])
sample_weights = df['label'].map(lambda label: class_weights[label]).tolist()
sampler = torch.utils.data.WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)


dataset = TramsformerDataset(df, vocab)
dataloader = DataLoader(dataset, batch_size=64, collate_fn=collate_fn_transformer, sampler=sampler)

# Iterate through the DataLoader to check the output
for padded_embeddings, labels,mask in dataloader:
    print(padded_embeddings.shape)
    print("Mask Shape:", mask.shape)
    print(labels)
    break

torch.Size([64, 745])
Mask Shape: torch.Size([64, 745])
tensor([0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
        0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1])


In [8]:
class Encoder(nn.Module):
    def __init__(self, embedding_dim, n_heads=10, hidden_size=64, dropout=0.1):
        super().__init__()
        if embedding_dim % n_heads != 0:
            raise ValueError("embedding_dim must be divisible by n_heads")
        self.attention=nn.MultiheadAttention(embedding_dim,n_heads,batch_first=True)
        self.feedforward=nn.Sequential(
            nn.Linear(embedding_dim,hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size,hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size,embedding_dim)
        )
        self.norm=nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self,x,mask):
        key_padding_mask = (mask == 0)
        attention,_=self.attention(x,x,x,key_padding_mask=key_padding_mask )
        attention_skip=self.dropout(attention)+x
        x=self.norm(attention_skip)
        out=self.feedforward(x)
        out=self.norm(out+x)
        return(out)

class Encoder_Only_Transformer(nn.Module):
    def __init__(self,vocab_size, embedding_dim, n_heads=10, hidden_size=64, dropout=0.1,num_classes=2,num_blocks=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])
        self.feedforward_classification=nn.Sequential(
                  nn.Linear(embedding_dim,hidden_size),
                  nn.ReLU(),
                  nn.Linear(hidden_size,num_classes)
              )
        self.blocks = nn.ModuleList([
            Encoder(embedding_dim, n_heads, hidden_size, dropout) for _ in range(num_blocks)
        ])
        self.feedforward_classification = nn.Sequential(
            nn.Linear(embedding_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes)
        )
    def forward(self, x, mask):
        x=self.embedding(x)
        for block in self.blocks:
            x=block(x, mask)
        x=x.mean(dim=1)
        out = self.feedforward_classification(x)
        return out

In [9]:
!pip install tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
from tqdm import tqdm
vocab_size = len(vocab)
embedding_dim = 100
encoder = Encoder_Only_Transformer(vocab_size, embedding_dim).to(device)
EPOCHS = 50
num_batches=100
optimizer = optim.Adam(encoder.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
skip_training = False

if not skip_training:
  for epoch in range(EPOCHS):
      avg_loss = 0
      for i, (embeddings, labels, mask) in tqdm(enumerate(dataloader), total=num_batches):
          if i >= num_batches:
            break
          optimizer.zero_grad()
          embeddings=embeddings.to(device)
          labels=labels.to(device)
          mask=mask.to(device)
          logits = encoder(embeddings, mask.bool())
          loss = criterion(logits, labels)
          loss.backward()
          optimizer.step()
          avg_loss += loss.item()
      print(f"Epoch: {epoch+1}, Loss: {avg_loss / num_batches}")
  torch.save(encoder.state_dict(), 'encoder_transformer_model.pth')

  2%|▏         | 2/100 [00:26<21:29, 13.16s/it]


KeyboardInterrupt: 

In [11]:
dev_df = pd.read_csv('dev.tsv', sep='\t', header=0, quoting=3)
dev_df['tokens'] = dev_df['text'].apply(lambda x: [token.lower() for token in wordpunct_tokenize(x)])
dev_df['token_indices'] = dev_df['tokens'].apply(tokens_to_indices)
dataset=TramsformerDataset(dev_df, vocab)
dev_dataloader = DataLoader(dataset, batch_size=64, collate_fn=collate_fn_transformer)
test_model=Encoder_Only_Transformer(vocab_size, embedding_dim).to(device)
test_model.load_state_dict(torch.load('encoder_transformer_model.pth', map_location=torch.device('cpu')))
#test_model.load_state_dict(torch.load('encoder_transformer_model.pth'))
test_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for embeddings, labels,mask in tqdm(dev_dataloader):
        embeddings=embeddings.to(device)
        labels=labels.to(device)
        mask=mask.to(device)
        logits = test_model(embeddings, mask)
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = correct / total
print(f"Test Accuracy: {accuracy}")

  test_model.load_state_dict(torch.load('encoder_transformer_model.pth', map_location=torch.device('cpu')))
  3%|▎         | 7/207 [00:15<07:21,  2.21s/it]


KeyboardInterrupt: 

In [None]:
import torch
from nltk.tokenize import wordpunct_tokenize

def predict_sentiment(model, sentence, vocab, max_length):
    model.eval()
    
    if not sentence.strip():  # Skip empty sentences
        return -1  # Return a default value for empty sentences, or handle as needed

    # Tokenize and convert tokens to indices
    tokens = [token.lower() for token in wordpunct_tokenize(sentence)]
    token_indices = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    
    # Padding or truncating the sequence
    if len(token_indices) < max_length:
        token_indices += [vocab['<PAD>']] * (max_length - len(token_indices))
    else:
        token_indices = token_indices[:max_length]

    # Create tensor from token indices
    input_tensor = torch.tensor(token_indices, dtype=torch.long).unsqueeze(0)
    mask = (input_tensor != vocab['<PAD>']).long()  # Create attention mask

    with torch.no_grad():
        output = model(input_tensor, mask)  # Model output
        prediction = torch.argmax(output, dim=1).item()  # Get predicted class

    return prediction

# Example usage
test_sentence = "this is fucking gross"
prediction = predict_sentiment(test_model, test_sentence, vocab, max_length=745) 
print("Predicted Label:", prediction)

Predicted Label: 1


In [None]:
import torch
import pandas as pd
from nltk.tokenize import wordpunct_tokenize

# Reading the test data
test_file = 'test.tsv'
test_data = pd.read_csv(test_file, sep='\t', header=0, quoting=3)

# Open the output file to write predictions
output_file = 'predictions.tsv'
print(len(test_data))

12791


In [None]:
with open(output_file, 'w') as f:
    f.write('id\tpredicted\n')  # Write header

    for idx, row in test_data.iterrows():
        sentence = row['text']  # Assuming the sentence column is named 'text'
        
        try:
            prediction = predict_sentiment(test_model, sentence, vocab, max_length=745)
            if prediction == -1:  # Handle cases where there is an empty sentence or invalid data
                prediction = 'error'  # Or some default value
            f.write(f"{row['id']}\t{prediction}\n")  # Write to file
        except Exception as e:
            # If an error occurs during prediction, log the error and continue
            print(f"Error for row {row['id']}: {e}")
            f.write(f"{row['id']}\terror\n")  # Log as error in output file