In [1]:
import pandas as pd
df = pd.read_csv('train.tsv', sep='\t', header=0, quoting=3)

In [2]:
df.head(5)

Unnamed: 0,id,text,label
0,eng_train0,I supported Barack Obama. I thought it was abs...,0
1,eng_train1,what to hell with that!,1
2,eng_train2,"and the stupidity of the haters continues, thi...",1
3,eng_train3,Alberta has been in debt under the Conservativ...,0
4,eng_train4,"The TV is in Channel Search mode, and I have p...",0


In [3]:
import nltk
from nltk.tokenize import wordpunct_tokenize
nltk.download('punkt', quiet=True)

df['tokens'] = df['text'].apply(lambda x: [token.lower() for token in wordpunct_tokenize(x)])

In [4]:
df.head()
print(df['label'].value_counts())

label
0    62530
1    36470
Name: count, dtype: int64


In [5]:
# for the word embeddings we will manually create a TF-IDF vectorizer
# import math

# doc_freq = {}
# for tokens in df['tokens']:
#     unique_tokens = set(tokens)
#     for token in unique_tokens:
#         doc_freq[token] = doc_freq.get(token, 0) + 1
    
# total_docs = len(df)
# idf = {}
# for token in doc_freq:
#     idf[token] = math.log((total_docs + 1) / (doc_freq[token] + 1)) + 1

# def compute_tfidf_for_document(tokens):
#     if not tokens:
#         return []
#     tf = {}
#     doc_length = len(tokens)
#     for token in tokens:
#         tf[token] = tf.get(token, 0) + 1
#     return [(tf[token]/doc_length) * idf.get(token, 0) for token in tokens]

# df['tfidf_embedding'] = df['tokens'].apply(compute_tfidf_for_document)


In [6]:
from collections import Counter

# Step 1: Build a vocabulary
all_tokens = [token for tokens in df['tokens'] for token in tokens]
vocab = {token: idx+2 for idx, (token, _) in enumerate(Counter(all_tokens).items())}  # idx+2 to reserve 0 for padding and 1 for unknown
vocab['<PAD>'] = 0  # Padding token
vocab['<UNK>'] = 1  # Unknown token

# Step 2: Convert tokens into indices
def tokens_to_indices(tokens):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

df['token_indices'] = df['tokens'].apply(tokens_to_indices)

In [7]:
df.head(5)

Unnamed: 0,id,text,label,tokens,token_indices
0,eng_train0,I supported Barack Obama. I thought it was abs...,0,"[i, supported, barack, obama, ., i, thought, i...","[2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 11, 12, 13, 14..."
1,eng_train1,what to hell with that!,1,"[what, to, hell, with, that, !]","[84, 38, 85, 86, 13, 43]"
2,eng_train2,"and the stupidity of the haters continues, thi...",1,"[and, the, stupidity, of, the, haters, continu...","[11, 87, 88, 35, 87, 89, 90, 21, 91, 92, 93, 9..."
3,eng_train3,Alberta has been in debt under the Conservativ...,0,"[alberta, has, been, in, debt, under, the, con...","[112, 113, 114, 115, 116, 117, 87, 14, 21, 11,..."
4,eng_train4,"The TV is in Channel Search mode, and I have p...",0,"[the, tv, is, in, channel, search, mode, ,, an...","[87, 129, 94, 115, 130, 131, 132, 21, 11, 2, 1..."


In [19]:
import torch
from torch.utils.data import Dataset, DataLoader

# Update MyDataset
class MyDataset(Dataset):
    def __init__(self, df, vocab):
        self.labels = torch.tensor(df["label"].values, dtype=torch.long)
        self.token_indices = [torch.tensor(indices, dtype=torch.long) for indices in df["token_indices"].values]
        self.max_length = max(len(indices) for indices in self.token_indices) if self.token_indices else 0
        self.vocab = vocab

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        indices = self.token_indices[idx]
        if len(indices) < self.max_length:
            padding = torch.zeros(self.max_length - len(indices), dtype=torch.long)
            padded_indices = torch.cat([indices, padding])
        else:
            padded_indices = indices[:self.max_length]
        return padded_indices, self.labels[idx]

def collate_fn(batch):
    indices, labels = zip(*batch)
    padded_indices = torch.stack(indices)
    labels = torch.stack(labels)
    return padded_indices, labels

class_counts = df['label'].value_counts()
total_samples = len(df)
class_weights = torch.tensor([total_samples / (len(class_counts) * count) for count in class_counts])

# Define the sampler
sampler = torch.utils.data.WeightedRandomSampler(class_weights, len(class_weights))

dataset = MyDataset(df, vocab)
dataloader = DataLoader(dataset, batch_size=64, collate_fn=collate_fn, sampler=sampler)

# Iterate through the DataLoader to check the output
for padded_embeddings, labels in dataloader:
    print(padded_embeddings.shape)
    print(labels)
    break

torch.Size([2, 745])
tensor([0, 1])


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

class Encoder_Only_RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])  # Embedding layer
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        embedded = self.embedding(input)  # Convert indices to embeddings
        output, hidden = self.rnn(embedded, hidden)
        output = self.dropout(output)
        last_output = output[:, -1, :]  # Get last timestep output
        return self.fc(last_output)

    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size)

In [24]:
# Initialize the model
vocab_size = len(vocab)
embedding_dim = 100  # You can adjust this dimension
encoder = Encoder_Only_RNN(vocab_size, embedding_dim, hidden_size=128, num_classes=2)

# Hyperparameters
EPOCHS = 100
optimizer = optim.Adam(encoder.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(EPOCHS):
    avg_loss = 0
    for embeddings, labels in dataloader:
        optimizer.zero_grad()

        # Initialize hidden state for each batch
        batch_size = embeddings.size(0)
        hidden = encoder.init_hidden(batch_size)

        # Forward pass
        logits = encoder(embeddings, hidden)

        # Calculate loss
        loss = criterion(logits, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()

    print(f"Epoch: {epoch+1}, Loss: {avg_loss / len(dataloader)}")

torch.save(encoder.state_dict(), 'encoder_model.pth')

Epoch: 1, Loss: 0.6902402639389038
Epoch: 2, Loss: 0.6481857895851135
Epoch: 3, Loss: 0.5908253192901611
Epoch: 4, Loss: 0.5325659513473511
Epoch: 5, Loss: 0.48368948698043823
Epoch: 6, Loss: 0.41308027505874634
Epoch: 7, Loss: 0.777518093585968
Epoch: 8, Loss: 0.7674357891082764
Epoch: 9, Loss: 0.30194804072380066
Epoch: 10, Loss: 0.26162683963775635
Epoch: 11, Loss: 0.23390445113182068
Epoch: 12, Loss: 1.0104020833969116
Epoch: 13, Loss: 0.16679181158542633
Epoch: 14, Loss: 2.0059618949890137
Epoch: 15, Loss: 0.19270765781402588
Epoch: 16, Loss: 0.20669353008270264
Epoch: 17, Loss: 0.9078084230422974
Epoch: 18, Loss: 0.8973689675331116
Epoch: 19, Loss: 1.4234565496444702
Epoch: 20, Loss: 0.8388950824737549
Epoch: 21, Loss: 0.8127148151397705
Epoch: 22, Loss: 0.7647733688354492
Epoch: 23, Loss: 0.762736976146698
Epoch: 24, Loss: 0.7702496647834778
Epoch: 25, Loss: 0.4343477487564087
Epoch: 26, Loss: 1.0435521602630615
Epoch: 27, Loss: 0.7235577702522278
Epoch: 28, Loss: 0.686477243900

In [25]:
dev_df = pd.read_csv('dev.tsv', sep='\t', header=0, quoting=3)
dev_df['tokens'] = dev_df['text'].apply(lambda x: [token.lower() for token in wordpunct_tokenize(x)])
dev_df['token_indices'] = dev_df['tokens'].apply(tokens_to_indices)

dev_dataset = MyDataset(dev_df, vocab)
dev_dataloader = DataLoader(dev_dataset, batch_size=64, collate_fn=collate_fn)

# Testing loop
encoder.eval()  # Set the model to evaluation mode
correct = 0
total = 0

with torch.no_grad():
    for embeddings, labels in dev_dataloader:
        batch_size = embeddings.size(0)
        hidden = encoder.init_hidden(batch_size)

        logits = encoder(embeddings, hidden)
        _, predicted = torch.max(logits.data, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.3431060606060606
