In [1]:
!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.3.0
  Downloading https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp311-cp311-linux_x86_64.whl (839.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m839.7/839.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.18.0
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.18.0%2Bcu118-cp311-cp311-linux_x86_64.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.3.0
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.3.0%2Bcu118-cp311-cp311-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch==2.3.0)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11

In [3]:
!pip install torchtext==0.18.0 torchdata==0.8.0



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch.nn.functional as F



In [5]:
data=pd.read_csv('cleaned_text.csv')
data.head()

Unnamed: 0,Text,Label
0,feel really helpless heavy hearted,4
1,ive enjoyed able slouch relax unwind frankly n...,0
2,gave internship dmrg feeling distraught,4
3,dont know feel lost,0
4,kindergarten teacher thoroughly weary job take...,4


In [6]:
X=data['Text']
y=data['Label']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
print("X_train Shape: ",X_train.shape)
print("X_test Shape: ",X_test.shape)
print("y_train Shape: ",y_train.shape)
print("y_test Shape: ",y_test.shape)

X_train Shape:  (333447,)
X_test Shape:  (83362,)
y_train Shape:  (333447,)
y_test Shape:  (83362,)


In [7]:
X_train = X_train.astype(str)
X_test = X_test.astype(str)

In [8]:
tokenizer=get_tokenizer('basic_english')
def yield_tokens(data):
    for text in data:
        yield tokenizer(text)
vocab=build_vocab_from_iterator(yield_tokens(X_train),specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [9]:
def text_to_indices(texts, vocab, tokenizer):
    return [torch.tensor([vocab[token] for token in tokenizer(text)], dtype=torch.long) for text in texts]

In [10]:
X_train_sequences = text_to_indices(X_train, vocab, tokenizer)
X_test_sequences = text_to_indices(X_test, vocab, tokenizer)

In [11]:
max_len=max(len(seq) for seq in X_train_sequences)
print(max_len)

79


In [12]:
def pad_sequence(sequences, maxlen, padding_value=0):
    padded = []
    for seq in sequences:
        if len(seq) > maxlen:
            padded.append(seq[:maxlen])  # Truncate if longer than maxlen
        else:
            padded.append(F.pad(seq, (0, maxlen - len(seq)), value=padding_value))  # Pad with 0s
    return torch.stack(padded)

In [13]:
X_train_padded = pad_sequence(X_train_sequences, max_len)
X_test_padded = pad_sequence(X_test_sequences, max_len)

In [14]:
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [15]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [16]:
train_dataset = TextDataset(X_train_padded, y_train_tensor)
test_dataset = TextDataset(X_test_padded, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [17]:
class MyTextCNN(nn.Module):
    def __init__(self, vocab_size=50000, embedding_dim=50, max_len=50, num_classes=6):
        super(MyTextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=64, kernel_size=3)
        self.relu = nn.ReLU()
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.global_max_pool(x)
        x = x.squeeze(-1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [18]:
model = MyTextCNN(vocab_size=len(vocab), embedding_dim=50, max_len=max_len, num_classes=6)

In [19]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MyTextCNN(
  (embedding): Embedding(67796, 50)
  (conv1d): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
  (relu): ReLU()
  (global_max_pool): AdaptiveMaxPool1d(output_size=1)
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=6, bias=True)
)

In [25]:
import os
from tqdm import tqdm
from sklearn.metrics import  classification_report,accuracy_score, precision_score, recall_score, f1_score
from torch.utils.tensorboard import SummaryWriter
num_epochs = 10
writer = SummaryWriter(log_dir='train_logs_1')
for epoch in range(num_epochs):
    model.train()
    for iteration_, (texts, labels) in enumerate(tqdm(train_loader, total=len(train_loader))):
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()

        pred = model(texts)
        loss_value = criterion(pred, labels)

        loss_value.backward()
        optimizer.step()

        global_iteration = epoch * len(train_loader) + iteration_
        writer.add_scalar('train_loss_iter', loss_value.item(), global_iteration)

    print(f'Epoch={epoch+1}/{num_epochs} | Training Loss={loss_value.item():.4f}')
    writer.add_scalar('train_loss_epoch', loss_value.item(), epoch)


    model.eval()
    with torch.no_grad():
        loss_sum = 0
        pred_list, label_list = [], []

        for texts, labels in tqdm(test_loader, total=len(test_loader)):
            texts, labels = texts.to(device), labels.to(device)
            pred = model(texts)

            loss_value = criterion(pred, labels)
            loss_sum += loss_value.item()

            pred_list.extend(torch.argmax(pred, dim=1).tolist())
            label_list.extend(labels.tolist())

        avg_test_loss = loss_sum / len(test_loader)
        print(f'Test Loss={avg_test_loss:.4f}')
        writer.add_scalar('test_loss_epoch', avg_test_loss, epoch)

        final_pred = torch.tensor(pred_list)
        final_label = torch.tensor(label_list)

        epoch_accuracy = accuracy_score(final_label, final_pred)
        epoch_precision = precision_score(final_label, final_pred, average='weighted')
        epoch_recall = recall_score(final_label, final_pred, average='weighted')
        epoch_f1 = f1_score(final_label, final_pred, average='weighted')

        writer.add_scalar('test_accuracy_epoch', epoch_accuracy, epoch)
        writer.add_scalar('test_precision_epoch', epoch_precision, epoch)
        writer.add_scalar('test_recall_epoch', epoch_recall, epoch)
        writer.add_scalar('test_f1_epoch', epoch_f1, epoch)

        print(classification_report(final_label, final_pred))

    os.makedirs('ckpt', exist_ok=True)
    torch.save(model.state_dict(), os.path.join('ckpt', f'ckpt_{epoch+1}.pth'))

100%|██████████| 10421/10421 [00:35<00:00, 294.63it/s]


Epoch=1/10 | Training Loss=0.1458


100%|██████████| 2606/2606 [00:02<00:00, 1138.63it/s]


Test Loss=0.1249
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     24201
           1       0.91      0.99      0.95     28164
           2       0.99      0.70      0.82      6929
           3       0.91      0.95      0.93     11441
           4       0.89      0.90      0.89      9594
           5       0.99      0.62      0.76      3033

    accuracy                           0.93     83362
   macro avg       0.94      0.85      0.89     83362
weighted avg       0.93      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 295.55it/s]


Epoch=2/10 | Training Loss=0.1074


100%|██████████| 2606/2606 [00:02<00:00, 1174.33it/s]


Test Loss=0.1176
              precision    recall  f1-score   support

           0       0.98      0.95      0.97     24201
           1       0.93      0.98      0.95     28164
           2       0.97      0.71      0.82      6929
           3       0.91      0.96      0.94     11441
           4       0.85      0.94      0.90      9594
           5       0.87      0.71      0.78      3033

    accuracy                           0.93     83362
   macro avg       0.92      0.88      0.89     83362
weighted avg       0.93      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 295.64it/s]


Epoch=3/10 | Training Loss=0.0006


100%|██████████| 2606/2606 [00:02<00:00, 1147.04it/s]


Test Loss=0.1202
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     24201
           1       0.97      0.92      0.94     28164
           2       0.78      0.91      0.84      6929
           3       0.90      0.98      0.94     11441
           4       0.97      0.81      0.88      9594
           5       0.74      0.94      0.83      3033

    accuracy                           0.93     83362
   macro avg       0.89      0.92      0.90     83362
weighted avg       0.93      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 294.74it/s]


Epoch=4/10 | Training Loss=0.3395


100%|██████████| 2606/2606 [00:02<00:00, 1163.39it/s]


Test Loss=0.1292
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     24201
           1       0.91      0.99      0.95     28164
           2       0.99      0.68      0.81      6929
           3       0.96      0.90      0.93     11441
           4       0.89      0.87      0.88      9594
           5       0.77      0.87      0.82      3033

    accuracy                           0.93     83362
   macro avg       0.92      0.88      0.89     83362
weighted avg       0.93      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 297.12it/s]


Epoch=5/10 | Training Loss=0.0000


100%|██████████| 2606/2606 [00:02<00:00, 1189.35it/s]


Test Loss=0.1329
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     24201
           1       0.91      0.99      0.95     28164
           2       0.94      0.72      0.81      6929
           3       0.95      0.92      0.93     11441
           4       0.86      0.93      0.89      9594
           5       0.91      0.66      0.77      3033

    accuracy                           0.93     83362
   macro avg       0.93      0.86      0.89     83362
weighted avg       0.93      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 296.69it/s]


Epoch=6/10 | Training Loss=0.0006


100%|██████████| 2606/2606 [00:02<00:00, 1134.89it/s]


Test Loss=0.1354
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     24201
           1       0.98      0.91      0.94     28164
           2       0.76      0.99      0.86      6929
           3       0.92      0.95      0.93     11441
           4       0.97      0.81      0.88      9594
           5       0.73      0.96      0.83      3033

    accuracy                           0.93     83362
   macro avg       0.89      0.93      0.90     83362
weighted avg       0.94      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 297.40it/s]


Epoch=7/10 | Training Loss=0.1884


100%|██████████| 2606/2606 [00:02<00:00, 941.63it/s]


Test Loss=0.1330
              precision    recall  f1-score   support

           0       0.98      0.95      0.97     24201
           1       0.96      0.92      0.94     28164
           2       0.77      0.94      0.85      6929
           3       0.92      0.96      0.94     11441
           4       0.91      0.86      0.88      9594
           5       0.76      0.88      0.82      3033

    accuracy                           0.93     83362
   macro avg       0.88      0.92      0.90     83362
weighted avg       0.93      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 297.14it/s]


Epoch=8/10 | Training Loss=0.1008


100%|██████████| 2606/2606 [00:02<00:00, 1090.04it/s]


Test Loss=0.1387
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     24201
           1       0.97      0.93      0.95     28164
           2       0.79      0.92      0.85      6929
           3       0.95      0.91      0.93     11441
           4       0.86      0.93      0.89      9594
           5       0.79      0.77      0.78      3033

    accuracy                           0.93     83362
   macro avg       0.89      0.90      0.90     83362
weighted avg       0.93      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 296.91it/s]


Epoch=9/10 | Training Loss=0.0010


100%|██████████| 2606/2606 [00:02<00:00, 1146.73it/s]


Test Loss=0.1443
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     24201
           1       0.92      0.99      0.95     28164
           2       0.98      0.70      0.82      6929
           3       0.97      0.90      0.93     11441
           4       0.84      0.96      0.89      9594
           5       0.92      0.64      0.76      3033

    accuracy                           0.93     83362
   macro avg       0.93      0.86      0.89     83362
weighted avg       0.93      0.93      0.93     83362



100%|██████████| 10421/10421 [00:35<00:00, 293.72it/s]


Epoch=10/10 | Training Loss=0.0095


100%|██████████| 2606/2606 [00:02<00:00, 1187.14it/s]


Test Loss=0.1517
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     24201
           1       0.98      0.91      0.94     28164
           2       0.76      0.98      0.86      6929
           3       0.95      0.91      0.93     11441
           4       0.86      0.93      0.89      9594
           5       0.87      0.69      0.77      3033

    accuracy                           0.93     83362
   macro avg       0.90      0.90      0.89     83362
weighted avg       0.93      0.93      0.93     83362



In [27]:
final_model_path = "saved_model.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'vocab_size': len(vocab),
    'num_classes': 6 }, final_model_path)
print(f"Model saved to {final_model_path}")

Model saved to saved_model.pth
