[https://aclanthology.org/D14-1181.pdf] Convolutional Neural Networks for Sentence Classification by Yoon Kim

In [1]:
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torchmetrics.functional import accuracy
import torch.nn.functional as F
from torch.utils.data import TensorDataset, random_split, DataLoader
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from gensim.models import Word2Vec

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running:", device)

df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
print(f'Number of Train Headlines: {len(df_train)}')
print(f'Number of Test Headlines: {len(df_test)}')
df_train.iloc[:,1:].head()

Device available for running: cuda
Number of Train Headlines: 19200
Number of Test Headlines: 4800


Unnamed: 0,headline,label
0,A 65-Year-Old Man's Typewriter Was <strong>Des...,1
1,Can You Identify These 5 UNITED STATES Leaders...,1
2,Index of Economic Activity Declined in March\n,0
3,2015's Best News Bloopers Are Here And They're...,1
4,18 Pictures Everyone Who Loves Spilling The Te...,1


headline = "A 65-Year-Old Man's Typewriter Was <strong>Destroyed</strong> By An Angry Cop, And The Internet Got Him A New One &nbsp;\n"

simple_preprocess(headline) = ['year', 'old', 'man', 'typewriter', 'was', 'strong', 'destroyed', 'strong', 'by', 'an', 'angry', 'cop', 'and', 'the', 'internet', 'got', 'him', 'new', 'one', 'nbsp']

In [2]:
def make_word2vec_vector(w2v_model, sentence):
    padded_X = [padding_idx for i in range(max_sen_len)]
    i = 0
    for word in sentence:
        try:
            index = w2v_model.wv.key_to_index[word]
        except KeyError as e:
            index = 0
        padded_X[i] = index
        i += 1
    return torch.tensor(padded_X, dtype=torch.long, device=device)#.view(1, -1)


def create_dataloader(dataset, shuffle=True, batch_size=128, num_workers=1):
    return DataLoader(dataset=dataset,
                        batch_size=batch_size, 
                        shuffle=shuffle, 
                        num_workers=num_workers,
                        pin_memory=False) # Was throwing errors with True


preprocessed_x_train = list(df_train['headline'].apply(simple_preprocess, deacc=True))
preprocessed_x_test = list(df_test['headline'].apply(simple_preprocess, deacc=True))
w2v_model = Word2Vec(preprocessed_x_train, vector_size=500, min_count=1, workers=3, window=3, sg=1)
max_sen_len = max(map(len, preprocessed_x_train))
padding_idx = w2v_model.wv.key_to_index['pad']

# Make tensor datasets
x_train_tensor = torch.stack([make_word2vec_vector(w2v_model, sentence) for sentence in preprocessed_x_train])
x_test_tensor = torch.stack([make_word2vec_vector(w2v_model, sentence) for sentence in preprocessed_x_test])
y_train_tensor = torch.tensor(df_train['label'], dtype=torch.long, device=device)
y_test_tensor = torch.tensor(df_test['label'], dtype=torch.long, device=device)

dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_size = int(0.8 * len(dataset))
validate_size = int(0.2 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, validate_size])
train_dl = create_dataloader(train_dataset)
val_dl = create_dataloader(val_dataset, shuffle=False)
test_dl = create_dataloader(TensorDataset(x_test_tensor, y_test_tensor), shuffle=False)

In [168]:
class CCNClassifier(pl.LightningModule):
    def __init__(self, w2vmodel, num_classes=2, window_sizes=(1,2,3,5)):
        super().__init__()
        self.lr = 0.0005
        self.num_classes = num_classes
        weights = w2vmodel.wv # used to initialize the embedding layer
        EMBEDDIN_GSIZE = 500  # Use this to set the embedding_dim in embedding layer
        NUM_FILTERS = 10      # Number of filters in CNN

        weights = []
        for key in w2vmodel.wv.index_to_key:
            weights.append(w2vmodel.wv.get_vector(key))
        weights = torch.FloatTensor(weights)
        self.emb = nn.Embedding.from_pretrained(weights)

        conv_list = []
        for window in window_sizes:
            conv_list.append(nn.Conv2d(1, NUM_FILTERS, (window, EMBEDDING_SIZE), padding=(window - 1, 0)))
        self.convs = nn.ModuleList(conv_list)
        
        self.fc = nn.Linear(NUM_FILTERS * len(window_sizes), self.num_classes)

    def forward(self, x):
        x = self.emb(x)
        x = [F.tanh(conv(x).squeeze(2)) for conv in self.convs]  
        x = [F.max_pool1d(i, i.size(1)) for i in x] 
        x = torch.cat(x).squeeze(1)
        logits = self.fc(x)
        probs = F.softmax(logits, dim=0)
        # return logits
        return probs.unsqueeze(0)

    def _common_step(self, batch, type):
        x, y = batch
        logits = self(x)
        # print("logits shape: ", logits.shape)
        # print("labels shape: ", y.squeeze().shape)
        loss = nn.CrossEntropyLoss()(logits, y)
        acc = accuracy(torch.argmax(logits, dim=1), 
                        y,
                        task='multiclass',
                        num_classes=self.num_classes)
        self.log(f'{type}_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log(f'{type}_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._common_step(batch, "train")
    
    def validation_step(self, batch, batch_idx):
        return self._common_step(batch, "validation")
    
    def test_step(self, batch, batch_idx):
        return self._common_step(batch, "test")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=0.0001)
        return optimizer
    

model = CCNClassifier(w2v_model) 
model_name = 'CCNClassifier'
max_epochs = 5
# import os
# from pathlib import Path
# print("print wd ", os.getcwd())
torch.manual_seed(42) 
torch.set_float32_matmul_precision("high")

# wandb.init(project='RF-Fingerprinting')
# wandb_logger = WandbLogger(name=f"{model_name}", save_dir=f"../Data/Logs/{model_name}")
# csv_logger = CSVLogger(save_dir=f"../Data/Logs/{model_name}")
trainer = pl.Trainer(
    max_epochs=max_epochs,
    # logger=[wandb_logger, csv_logger],
    # enable_checkpointing=False
    # log_every_n_steps=10000
)

trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)
# wandb.finish()

In [272]:
# trainer.test(model, dataloaders=val_dl)
# trainer.test(model, dataloaders=test_dl)

In [3]:
class CCNClassifier(pl.LightningModule):
    def __init__(self, w2vmodel, num_classes=2, cnn_filters=10, window_sizes=(1,2,3,5), emb_size=500):
        super().__init__()
        self.lr = 0.0005
        self.num_classes = num_classes
        weights = torch.FloatTensor([w2vmodel.wv.get_vector(key) for key in w2vmodel.wv.index_to_key])
        self.emb = nn.Embedding.from_pretrained(weights)
        conv_list = [nn.Conv2d(1, cnn_filters, (window, emb_size), padding=(window - 1, 0)) for window in window_sizes]
        self.convs = nn.ModuleList(conv_list)
        self.fc = nn.Linear(cnn_filters * len(window_sizes), self.num_classes)

    def forward(self, x):
        x = self.emb(x).unsqueeze(1) # add a channel dimension
        x = [F.relu(conv(x).squeeze(3)) for conv in self.convs]  
        x = [F.max_pool1d(i, i.size(2)) for i in x] 
        x = torch.cat(x, dim=2).squeeze(1)
        logits = self.fc(x)
        return logits

    def _common_step(self, batch, type):
        x, y = batch
        logits = self(x)
        loss = nn.CrossEntropyLoss()(logits, y.squeeze())
        acc = accuracy(torch.argmax(logits).view(-1), 
                        y,
                        task='binary',
                        num_classes=self.num_classes)
        self.log(f'{type}_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log(f'{type}_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._common_step(batch, "train")
    
    def validation_step(self, batch, batch_idx):
        return self._common_step(batch, "validation")
    
    def test_step(self, batch, batch_idx):
        return self._common_step(batch, "test")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=0.0001)
        return optimizer

In [4]:
model = CCNClassifier(w2v_model)
max_epochs = 1
trainer = pl.Trainer(max_epochs=max_epochs)
trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)

  weights = torch.FloatTensor([w2vmodel.wv.get_vector(key) for key in w2vmodel.wv.index_to_key])
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | emb   | Embedding  | 8.5 M 
1 | convs | ModuleList | 55.0 K
2 | fc    | Linear     | 82    
-------------------------------------
55.1 K    Trainable params
8.5 M     Non-trainable params
8.6 M     Total params
34.418    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1280x4 and 40x2)

In [None]:
trainer.test(model, dataloaders=val_dl)

In [242]:
trainer.test(model, dataloaders=test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.7622038125991821, 'test_accuracy': 0.50020831823349}]

In [314]:
lr = 0.0005
window_sizes = (1,2,3,5)
num_classes = 2
cnn_filters = 10
emb_size = 500
weights = torch.FloatTensor(np.array([w2v_model.wv.get_vector(key) for key in w2v_model.wv.index_to_key]))
emb = nn.Embedding.from_pretrained(weights).to(device)
conv_list = [nn.Conv2d(1, cnn_filters, (window, emb_size), padding=(window - 1, 0)) for window in window_sizes]
convs = nn.ModuleList(conv_list).to(device)
fc = nn.Linear(cnn_filters * len(window_sizes), num_classes).to(device)

In [291]:
torch.FloatTensor(np.array([w2v_model.wv.get_vector(key) for key in w2v_model.wv.index_to_key])).shape

torch.Size([17099, 500])

In [316]:
x, y = next(iter(train_dl))
x.shape, y.shape # (torch.Size([batch_size, 22]), torch.Size([batch_size]))

x = emb(x)         # torch.Size([batch_size, 22, 500])
x = x.unsqueeze(1) # torch.Size([batch_size, 1, 22, 500])
x = [F.relu(conv(x).squeeze(3)) for conv in convs]
x = [F.max_pool1d(i, i.size(2)) for i in x] 
# x = torch.cat(x, dim=1).squeeze(1) # torch.Size([batch_size, 10, 4])

In [317]:
torch.cat(x, dim=1).shape

torch.Size([128, 40, 1])

In [312]:
x = torch.cat(x, dim=2).squeeze(1)

In [318]:
fc(torch.cat(x, dim=1))

RuntimeError: mat1 and mat2 shapes cannot be multiplied (5120x1 and 40x2)

In [None]:
x = self.emb(x).unsqueeze(1) # add a channel dimension
x = [F.relu(conv(x).squeeze(3)) for conv in self.convs] 
x = [F.max_pool1d(i, i.size(2)) for i in x] 
x = torch.cat(x, dim=2).squeeze(1)
logits = self.fc(x)
return logits

In [298]:
x.shape

torch.Size([128, 1, 22, 500])

In [307]:
convs[0](x).shape, convs[1](x).shape

(torch.Size([128, 10, 22, 1]), torch.Size([128, 10, 23, 1]))

torch.Size([32, 22, 500])

In [None]:
# logits = self(x)
# print("logits shape: ", logits.shape)
# print("x shape", x.shape)
# print("y shape", y.shape)
# print("logits", logits)
# print("y", y)
# print("labels shape: ", y.squeeze().shape)
loss = nn.CrossEntropyLoss()(logits, y.squeeze())
acc = accuracy(torch.argmax(logits).view(-1), 
                y,
                task='binary',
                num_classes=self.num_classes)
self.log(f'{type}_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log(f'{type}_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)

In [None]:
emb(x)

In [27]:
# x = self.
x = [F.tanh(conv(x).squeeze(2)) for conv in self.convs]  
x = [F.max_pool1d(i, i.size(1)) for i in x] 
x = torch.cat(x).squeeze(1)
logits = self.fc(x)
return logits

[tensor([[ 156, 6381,    0,   76,  487, 1284, 1627,    2,   78, 3766, 3766, 3766,
          3766, 3766, 3766, 3766, 3766, 3766, 3766, 3766, 3766, 3766]]),
 tensor([0])]