In [1]:
!pip install torch==2.3.0 torchtext==0.18.0 torchdata==0.8.0



In [2]:
!pip install tiktoken



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tiktoken
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import math

In [4]:
data=pd.read_csv('/content/drive/MyDrive/kaggle api/cleaned_text.csv')
data.head()

Unnamed: 0,Text,Label
0,feel really helpless heavy hearted,4
1,ive enjoyed able slouch relax unwind frankly n...,0
2,gave internship dmrg feeling distraught,4
3,dont know feel lost,0
4,kindergarten teacher thoroughly weary job take...,4


In [5]:
X=data['Text']
y=data['Label']
X=X.astype(str)
X=X.to_list()
y=y.to_list()
data.dropna()

Unnamed: 0,Text,Label
0,feel really helpless heavy hearted,4
1,ive enjoyed able slouch relax unwind frankly n...,0
2,gave internship dmrg feeling distraught,4
3,dont know feel lost,0
4,kindergarten teacher thoroughly weary job take...,4
...,...,...
416804,feel like telling horny devils find site suite...,2
416805,began realize feeling agitated restless would ...,3
416806,feel curious previous early dawn time seek tro...,5
416807,feel becuase tyranical nature government el sa...,3


In [6]:
data.value_counts("Label")

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,141067
0,121187
3,57317
4,47712
2,34554
5,14972


In [7]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode(text)
        input_ids = torch.tensor(encoding, dtype=torch.long)
        return input_ids, label

In [8]:
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    max_length = max(len(ids) for ids in input_ids)
    input_ids = torch.stack([torch.cat([ids, torch.zeros(max_length - len(ids), dtype=torch.long)]) for ids in input_ids])
    labels = torch.tensor(labels, dtype=torch.long)
    return input_ids, labels

In [9]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        # Linear layers for Q, K, V matrices
        self.wq = nn.Linear(d_model, d_model)  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
        self.wk = nn.Linear(d_model, d_model)  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
        self.wv = nn.Linear(d_model, d_model)  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)

        # Output linear transformation
        self.dense = nn.Linear(d_model, d_model)  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)   self.

        # Feed-forward network
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_ff)
            nn.ReLU(),
            nn.Linear(d_ff, d_model)  # (batch_size, seq_len, d_ff) -> (batch_size, seq_len, d_model)
        )

        # Layer normalization and dropout
        self.layernorm1 = nn.LayerNorm(d_model)  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
        self.layernorm2 = nn.LayerNorm(d_model)  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x, batch_size):
        # Split the last dimension into (num_heads, depth)
        x = x.view(batch_size, -1, self.num_heads, self.depth)  # (batch_size, seq_len, d_model) -> (batch_size, seq_len, num_heads, depth)
        # Transpose the result to shape (batch_size, num_heads, seq_len, depth)
        return x.transpose(1, 2)  # (batch_size, seq_len, num_heads, depth) -> (batch_size, num_heads, seq_len, depth)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        matmul_qk = torch.matmul(q, k.transpose(-2, -1))  # (batch_size, num_heads, seq_len_q, seq_len_k)
        dk = torch.tensor(k.size(-1), dtype=torch.float32)  # scalar
        scaled_attention_logits = matmul_qk / torch.sqrt(dk)  # (batch_size, num_heads, seq_len_q, seq_len_k)

        if mask is not None:
            scaled_attention_logits = scaled_attention_logits.masked_fill(mask == 0, -1e9)

        attention_weights = torch.nn.functional.softmax(scaled_attention_logits, dim=-1)  # (batch_size, num_heads, seq_len_q, seq_len_k)
        output = torch.matmul(attention_weights, v)  # (batch_size, num_heads, seq_len_q, depth_v)

        return output, attention_weights  # (batch_size, num_heads, seq_len_q, depth_v), (batch_size, num_heads, seq_len_q, seq_len_k)

    def forward(self, x, mask=None):
        batch_size = x.size(0)  # (batch_size, seq_len, d_model)

        # Apply linear layers and split into heads
        q = self.split_heads(self.wq(x), batch_size)  # (batch_size, num_heads, seq_len, depth)
        k = self.split_heads(self.wk(x), batch_size)  # (batch_size, num_heads, seq_len, depth)
        v = self.split_heads(self.wv(x), batch_size)  # (batch_size, num_heads, seq_len, depth)

        # Apply the custom scaled dot-product attention
        scaled_attention, _ = self.scaled_dot_product_attention(q, k, v, mask)  # (batch_size, num_heads, seq_len_q, depth_v)

        # Transpose and reshape back to (batch_size, seq_len, d_model)
        scaled_attention = scaled_attention.transpose(1, 2).contiguous()  # (batch_size, seq_len, num_heads, depth)
        concat_attention = scaled_attention.view(batch_size, -1, self.d_model)  # (batch_size, seq_len, d_model)

        # Apply the final linear layer to combine the heads
        attn_output = self.dense(concat_attention)  # (batch_size, seq_len, d_model)

        # Add & Norm
        x = self.layernorm1(x + self.dropout(attn_output))  # (batch_size, seq_len, d_model)

        # Feed-forward
        ff_output = self.feed_forward(x)  # (batch_size, seq_len, d_model)

        # Add & Norm
        x = self.layernorm2(x + self.dropout(ff_output))  # (batch_size, seq_len, d_model)

        return x  # (batch_size, seq_len, d_model)

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))  # (d_model/2,)

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, d_model)
        Returns:
            x: Tensor with positional encodings added (batch_size, seq_len, d_model)
        """
        x = x + self.pe[:, :x.size(1), :].to(x.device)  # Add positional encoding to input
        return x

In [11]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        x = self.embedding(x)  # (batch_size, seq_len, embed_size)
        x = self.positional_encoding(x)  # (batch_size, seq_len, d_model)
        for layer in self.encoder_layers:
            x = layer(x, mask)  # (batch_size, seq_len, d_model)
        x = x.mean(dim=1)  # (batch_size, d_model)
        x = self.fc(self.dropout(x))  # (batch_size, output_size)
        return x


In [12]:
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)
# Extract texts and labels from dataframes
train_texts = train_df['Text'].astype(str).tolist()
train_labels = train_df['Label'].tolist()
val_texts = val_df['Text'].astype(str).tolist()
val_labels = val_df['Label'].tolist()

In [13]:
tokenizer = tiktoken.get_encoding('p50k_base')
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

In [14]:
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") # Print the device being used

Using device: cuda


In [16]:
vocab_size = tokenizer.n_vocab
embed_size = 256
d_model = 256
num_heads = 8
d_ff = 512
output_size = len(train_df['Label'].unique())
print(output_size)
num_layers = 4
dropout = 0.1

model = TransformerModel(vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout)
model.to(device)
# Training loop
num_epochs = 20
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

6


In [18]:
from sklearn.metrics import classification_report
for epoch in range(num_epochs):
    model.train()
    for input_ids, labels in train_dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # Validation step
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, labels in val_dataloader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            outputs = model(input_ids)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Validation Accuracy after Epoch {epoch+1}: {accuracy:.2f}%")
    if epoch + 1 == num_epochs or (epoch + 1) % 5 == 0:  # Print report every 5 epochs or at the end
    # Collect all predictions and true labels
      all_preds = []
      all_labels = []
      model.eval()
      with torch.no_grad():
          for input_ids, labels in val_dataloader:
              input_ids = input_ids.to(device)
              labels = labels.to(device)
              outputs = model(input_ids)
              _, predicted = torch.max(outputs, 1)

              all_preds.extend(predicted.cpu().numpy())
              all_labels.extend(labels.cpu().numpy())

      # Generate and print classification report
      print("\nClassification Report:")
      report = classification_report(all_labels, all_preds)
      print(report)

Epoch 1/20, Loss: 0.132680743932724
Validation Accuracy after Epoch 1: 90.47%
Epoch 2/20, Loss: 0.06132703647017479
Validation Accuracy after Epoch 2: 91.32%
Epoch 3/20, Loss: 0.040304411202669144
Validation Accuracy after Epoch 3: 91.05%
Epoch 4/20, Loss: 0.16025032103061676
Validation Accuracy after Epoch 4: 91.60%
Epoch 5/20, Loss: 0.0808694139122963
Validation Accuracy after Epoch 5: 91.71%

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95     24201
           1       0.97      0.90      0.93     28164
           2       0.73      0.98      0.84      6929
           3       0.88      0.96      0.92     11441
           4       0.93      0.83      0.87      9594
           5       0.82      0.76      0.79      3033

    accuracy                           0.92     83362
   macro avg       0.88      0.90      0.89     83362
weighted avg       0.92      0.92      0.92     83362

Epoch 6/20, Loss: 0.069652155041694

In [20]:
# Save the complete model (architecture + weights)
torch.save(model, 'complete_transformer_model.pt')

# Alternatively, save just the state dictionary (recommended approach)
torch.save(model.state_dict(), 'transformer_model_weights.pt')

print("Model saved successfully!")

Model saved successfully!
