# **Text Classification Using CNN**

In [11]:
%load_ext watermark
%watermark -a 'Navin Kumar M 20BAI1094' -v -p torch,pandas

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Author: Navin Kumar M 20BAI1094

Python implementation: CPython
Python version       : 3.8.10
IPython version      : 7.34.0

torch : 2.1.0a0+fe05266
pandas: 1.5.2



In [2]:
import torch
import torch.nn as nn
import torchtext as tt

import pandas as pd
import chardet
import re

from torchtext.vocab import Vectors, GloVe
from torch.utils.data import Dataset
from dataclasses import dataclass

from torch.utils.data import DataLoader
from torch.optim import Adam
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

SEED = 42
torch.manual_seed(seed=SEED)
torch.cuda.manual_seed_all(seed=SEED)
torch.backends.cudnn.deterministic = True

In [3]:
@dataclass
class Config:
    vocab_size: int = 400000
    embedding_dim: int = 100
    max_seq_len: int = 30
    pretrain_emb: bool = False
    epochs: int = 10
    batch_size: int = 32
    lr: float = 0.00005

In [4]:
neg_dataset_path = 'data/rt-polarity.neg.txt'
pos_dataset_path = 'data/rt-polarity.pos.txt'

# read and return the dataset as pandas dataframe
def dataset_return(path: str) -> pd.DataFrame:
    # Check the encoding of the file
    with open(neg_dataset_path, 'rb') as f:
        result = chardet.detect(f.read(3500))
    
    return pd.read_csv(neg_dataset_path, encoding=result['encoding'], delimiter='\0', header=None, names=['text']) 

neg_dataset = dataset_return(path=neg_dataset_path)
pos_dataset = dataset_return(path=pos_dataset_path)


In [5]:
# combine the two datasets
neg_dataset['label'] = 0
pos_dataset['label'] = 1

dataset = pd.concat(
    [neg_dataset, pos_dataset], ignore_index=True
    ).sample(frac=1).reset_index(drop=True)

dataset.head()

Unnamed: 0,text,label
0,a momentary escape from the summer heat and th...,1
1,the picture emerges as a surprisingly anemic d...,1
2,. . . pays tribute to heroes the way julia ro...,0
3,the sweetest thing leaves an awful sour taste .,1
4,it doesn't do the original any particular dish...,1


In [6]:
# clean each string :
def clean_str(string: str):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

dataset['text'] = dataset['text'].apply(lambda x: clean_str(x))
dataset.head()

Unnamed: 0,text,label
0,a momentary escape from the summer heat and th...,1
1,the picture emerges as a surprisingly anemic d...,1
2,pays tribute to heroes the way julia roberts h...,0
3,the sweetest thing leaves an awful sour taste,1
4,it does n't do the original any particular dis...,1


In [7]:
token_emb = GloVe(name='6B', dim=100, cache='./.vector_cache')
token_emb.vectors[token_emb.stoi["the"]] # get the vector for the word 'the'

tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  0.8278,  0.27

## **PyTorch Dataset** 

In [8]:
import torch

class TextClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.pretrained_emb = Config.pretrain_emb

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text, label = self.dataset['text'][idx], self.dataset['label'][idx] 
        sentence = []
        if self.pretrained_emb:
            for word in text.split(' '):
                if word in self.tokenizer.stoi:
                    sentence.append(
                        self.tokenizer.vectors[self.tokenizer.stoi[word]]
                    )
                else:
                    # Handle OOV word 
                    sentence.append(torch.zeros(self.tokenizer.dim))
            sentence = torch.stack(sentence)

            # Pad the sentence if it is shorter than max_seq_len else truncate it
            if sentence.shape[0] < Config.max_seq_len:
                # pad with zeros
                pad_tensor = torch.zeros(Config.max_seq_len - sentence.shape[0], self.tokenizer.dim)
                sentence = torch.cat((sentence, pad_tensor), dim=0)
            else:
                sentence = sentence[:Config.max_seq_len]

        else:
            for word in text.split(' '):
                if word in self.tokenizer.stoi:
                    sentence.append(self.tokenizer.stoi[word])
                else:
                    # Handle OOV word
                    sentence.append(Config.vocab_size)
            sentence = torch.tensor(sentence, dtype=torch.long)
        
            # Pad the sentence if it is shorter than max_seq_len else truncate it
            if sentence.shape[0] < Config.max_seq_len:
                # pad with pading token id => vocab_size-1
                pad_tensor = torch.ones(Config.max_seq_len - sentence.shape[0], dtype=torch.long) * (Config.vocab_size)
                sentence = torch.cat((sentence, pad_tensor), dim=0)
            else:
                sentence = sentence[:Config.max_seq_len]



        return sentence, torch.tensor(label)


## **PyTorch Model**

In [9]:
class TextCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrain_emb = Config.pretrain_emb
        self.word_embeddings = nn.Embedding(
            Config.vocab_size+1, Config.embedding_dim)

        self.conv1 = nn.Conv2d(
            in_channels=1, out_channels=100, kernel_size=(3, Config.embedding_dim))
        self.conv2 = nn.Conv2d(
            in_channels=1, out_channels=100, kernel_size=(4, Config.embedding_dim))
        self.conv3 = nn.Conv2d(
            in_channels=1, out_channels=100, kernel_size=(5, Config.embedding_dim))
        
        self.fc = nn.Linear(300, 2)
        

    def forward(self, x):
        if not self.pretrain_emb:
            x = self.word_embeddings(x)
    
        x = x.unsqueeze(1)
        x1 = torch.relu(self.conv1(x)).squeeze(3)
        x1 = torch.max_pool1d(x1, x1.shape[2]).squeeze(2)
        x2 = torch.relu(self.conv2(x)).squeeze(3)
        x2 = torch.max_pool1d(x2, x2.shape[2]).squeeze(2)
        x3 = torch.relu(self.conv3(x)).squeeze(3)
        x3 = torch.max_pool1d(x3, x3.shape[2]).squeeze(2)

        x = torch.cat((x1, x2, x3), dim=1)
        x = self.fc(x)
        
        return x            

In [10]:
device = 'cuda'

data = TextClassificationDataset(dataset=dataset, tokenizer=token_emb)
model = TextCNN().to(device) 

train_size = int(0.8 * len(data))
train_data, valid_data = torch.utils.data.random_split(data, [train_size, len(data) - train_size])

train_loader = DataLoader(train_data, batch_size=Config.batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=Config.batch_size, shuffle=True)
optimizer = Adam(model.parameters(), lr=Config.lr)
loss_fn = nn.CrossEntropyLoss()

# Train and validate the model
for epoch in range(10):
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        x, y = batch
        x, y = x.to(device), y.to(device)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch} | Train Loss: {loss.item()}')

    with torch.no_grad():
        model.eval()
        for batch in tqdm(valid_loader):
            x, y = batch
            x, y = x.to(device), y.to(device)  
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
        
        print(f'Epoch: {epoch} | Valid Loss: {loss.item()}')


100%|██████████| 267/267 [00:30<00:00,  8.72it/s]


Epoch: 0 | Train Loss: 0.6730575561523438


100%|██████████| 67/67 [00:00<00:00, 275.62it/s]


Epoch: 0 | Valid Loss: 0.7269708514213562


100%|██████████| 267/267 [00:29<00:00,  9.08it/s]


Epoch: 1 | Train Loss: 0.7026622295379639


100%|██████████| 67/67 [00:00<00:00, 320.95it/s]


Epoch: 1 | Valid Loss: 0.6884169578552246


100%|██████████| 267/267 [00:30<00:00,  8.67it/s]


Epoch: 2 | Train Loss: 0.6836161613464355


100%|██████████| 67/67 [00:00<00:00, 236.17it/s]


Epoch: 2 | Valid Loss: 0.7523573637008667


100%|██████████| 267/267 [00:30<00:00,  8.67it/s]


Epoch: 3 | Train Loss: 0.6575700640678406


100%|██████████| 67/67 [00:00<00:00, 169.43it/s]


Epoch: 3 | Valid Loss: 0.7961407899856567


100%|██████████| 267/267 [00:32<00:00,  8.31it/s]


Epoch: 4 | Train Loss: 0.6806485056877136


100%|██████████| 67/67 [00:00<00:00, 239.29it/s]


Epoch: 4 | Valid Loss: 0.7665050029754639


100%|██████████| 267/267 [00:31<00:00,  8.47it/s]


Epoch: 5 | Train Loss: 0.7241405844688416


100%|██████████| 67/67 [00:00<00:00, 195.37it/s]


Epoch: 5 | Valid Loss: 0.7670677900314331


100%|██████████| 267/267 [00:31<00:00,  8.46it/s]


Epoch: 6 | Train Loss: 0.664065957069397


100%|██████████| 67/67 [00:00<00:00, 210.85it/s]


Epoch: 6 | Valid Loss: 0.7629090547561646


100%|██████████| 267/267 [00:31<00:00,  8.37it/s]


Epoch: 7 | Train Loss: 0.6928980350494385


100%|██████████| 67/67 [00:00<00:00, 268.82it/s]


Epoch: 7 | Valid Loss: 0.8330962061882019


 94%|█████████▍| 252/267 [00:30<00:01,  8.28it/s]


KeyboardInterrupt: 

In [None]:
# get total number of model parameters in the model in Millions
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)/1000000

print(f'The model has {count_parameters(model):.2f}M trainable parameters')

The model has 40.12M trainable parameters
