[https://aclanthology.org/D14-1181.pdf] Convolutional Neural Networks for Sentence Classification by Yoon Kim

In [13]:
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torchmetrics.functional import accuracy
import torch.nn.functional as F
from torch.utils.data import TensorDataset, random_split, DataLoader
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from gensim.models import Word2Vec

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running:", device)

Device available for running: cpu


In [21]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

# Separate dataframes into train and test lists
x_train, y_train = list(df_train['headline']), list(df_train['label'])
x_test, y_test = list(df_test['headline']), list(df_test['label'])

print(f'Number of Train Headlines: {len(x_train)}')
print(f'Number of Test Headlines: {len(x_test)}')
df_train.iloc[:,1:].head()

Number of Train Headlines: 19200
Number of Test Headlines: 4800


Unnamed: 0,headline,label
0,A 65-Year-Old Man's Typewriter Was <strong>Des...,1
1,Can You Identify These 5 UNITED STATES Leaders...,1
2,Index of Economic Activity Declined in March\r\n,0
3,2015's Best News Bloopers Are Here And They're...,1
4,18 Pictures Everyone Who Loves Spilling The Te...,1


In [23]:
def make_word2vec_vector(w2v_model, sentence):
    padded_X = [padding_idx for i in range(max_sen_len)]
    i = 0
    for word in sentence:
        try:
            index = w2v_model.wv.key_to_index[word]
        except KeyError as e:
            index = 0
        padded_X[i] = index
        i += 1
    return torch.tensor(padded_X, dtype=torch.long, device=device)#.view(1, -1)


def preprocess(data):
  preprocessed_data = []
  for text in data:
    tokens = simple_preprocess(text, deacc=True)
    preprocessed_data.append(tokens)
  return preprocessed_data


def create_dataloader(dataset, shuffle=True, batch_size=1, num_workers=1):
    return DataLoader(dataset=dataset,
                        batch_size=batch_size, 
                        shuffle=shuffle, 
                        num_workers=num_workers,
                        pin_memory=False) # Was throwing errors with True

# Train Word2vec model
preprocessed_x_train = preprocess(x_train)
preprocessed_x_test = preprocess(x_test)
w2v_model = Word2Vec(preprocessed_x_train, vector_size=500, min_count=1, workers=3, window=3, sg=1)
max_sen_len = max(map(len, preprocessed_x_train))
padding_idx = w2v_model.wv.key_to_index['pad']

# Make tensor datasets
x_train_tensor = torch.stack([make_word2vec_vector(w2v_model, sentence) for sentence in preprocessed_x_train])
x_test_tensor = torch.stack([make_word2vec_vector(w2v_model, sentence) for sentence in preprocessed_x_test])
y_train_tensor = torch.tensor(y_train, dtype=torch.long, device=device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long, device=device)

dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_size = int(0.8 * len(dataset))
validate_size = int(0.2 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, validate_size])
train_dl = create_dataloader(train_dataset)
val_dl = create_dataloader(val_dataset, shuffle=False)
test_dl = create_dataloader(TensorDataset(x_test_tensor, y_test_tensor), shuffle=False)

In [168]:
class CCNClassifier(pl.LightningModule):
    def __init__(self, w2vmodel, num_classes=2, window_sizes=(1,2,3,5)):
        super().__init__()
        self.lr = 0.0005
        self.num_classes = num_classes
        weights = w2vmodel.wv # used to initialize the embedding layer
        EMBEDDING_SIZE = 500  # Use this to set the embedding_dim in embedding layer
        NUM_FILTERS = 10      # Number of filters in CNN

        weights = []
        for key in w2vmodel.wv.index_to_key:
            weights.append(w2vmodel.wv.get_vector('to'))
        weights = torch.FloatTensor(weights)
        self.emb = nn.Embedding.from_pretrained(weights)

        conv_list = []
        for window in window_sizes:
            conv_list.append(nn.Conv2d(1, NUM_FILTERS, (window, EMBEDDING_SIZE), padding=(window - 1, 0)))
        self.convs = nn.ModuleList(conv_list)
        
        self.fc = nn.Linear(NUM_FILTERS * len(window_sizes), self.num_classes)

    def forward(self, x):
        x = self.emb(x)
        x = [F.tanh(conv(x).squeeze(2)) for conv in self.convs]  
        x = [F.max_pool1d(i, i.size(1)) for i in x] 
        x = torch.cat(x).squeeze(1)
        logits = self.fc(x)
        probs = F.softmax(logits, dim=0)
        # return logits
        return probs.unsqueeze(0)

    def _common_step(self, batch, type):
        x, y = batch
        logits = self(x)
        # print("logits shape: ", logits.shape)
        # print("labels shape: ", y.squeeze().shape)
        loss = nn.CrossEntropyLoss()(logits, y)
        acc = accuracy(torch.argmax(logits, dim=1), 
                        y,
                        task='multiclass',
                        num_classes=self.num_classes)
        self.log(f'{type}_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log(f'{type}_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._common_step(batch, "train")
    
    def validation_step(self, batch, batch_idx):
        return self._common_step(batch, "validation")
    
    def test_step(self, batch, batch_idx):
        return self._common_step(batch, "test")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=0.0001)
        return optimizer

In [169]:
model = CCNClassifier(w2vmodel) 
model_name = 'CCNClassifier'
max_epochs = 5
# import os
# from pathlib import Path
# print("print wd ", os.getcwd())
torch.manual_seed(42) 
torch.set_float32_matmul_precision("high")
print("Using device: %s" % device)


# wandb.init(project='RF-Fingerprinting')
# wandb_logger = WandbLogger(name=f"{model_name}", save_dir=f"../Data/Logs/{model_name}")
# csv_logger = CSVLogger(save_dir=f"../Data/Logs/{model_name}")
trainer = pl.Trainer(
    max_epochs=max_epochs,
    # logger=[wandb_logger, csv_logger],
    # enable_checkpointing=False
    # log_every_n_steps=10000
)

trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)
# wandb.finish()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | emb   | Embedding  | 8.5 M 
1 | convs | ModuleList | 55.0 K
2 | fc    | Linear     | 82    
-------------------------------------
55.1 K    Trainable params
8.5 M     Non-trainable params
8.6 M     Total params
34.418    Total estimated model params size (MB)


Using device: cuda


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [173]:
trainer.test(model, dataloaders=test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.813026487827301, 'test_accuracy': 0.50020831823349}]

In [174]:
trainer.test(model, dataloaders=val_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.8153160810470581, 'test_accuracy': 0.49791666865348816}]

In [241]:
class CCNClassifier(pl.LightningModule):
    def __init__(self, w2vmodel, num_classes=2, cnn_filters=10, window_sizes=(1,2,3,5), emb_size=500):
        super().__init__()
        self.lr = 0.0005
        self.num_classes = num_classes
        weights = torch.FloatTensor([w2vmodel.wv.get_vector(key) for key in w2vmodel.wv.index_to_key])
        self.emb = nn.Embedding.from_pretrained(weights)

        conv_list = [nn.Conv2d(1, cnn_filters, (window, emb_size), padding=(window - 1, 0)) for window in window_sizes]
        self.convs = nn.ModuleList(conv_list)
        
        self.fc = nn.Linear(cnn_filters * len(window_sizes), self.num_classes)

    def forward(self, x):
        x = self.emb(x)
        x = [F.tanh(conv(x).squeeze(2)) for conv in self.convs]  
        x = [F.max_pool1d(i, i.size(1)) for i in x] 
        x = torch.cat(x).squeeze(1)
        logits = self.fc(x)
        return logits
        # probs = F.softmax(logits, dim=0)
        # return probs.unsqueeze(0)

    def _common_step(self, batch, type):
        x, y = batch
        logits = self(x)
        # print("logits shape: ", logits.shape)
        # print("x shape", x.shape)
        # print("y shape", y.shape)
        # print("logits", logits)
        # print("y", y)
        # print("labels shape: ", y.squeeze().shape)
        loss = nn.CrossEntropyLoss()(logits, y.squeeze())
        acc = accuracy(torch.argmax(logits).view(-1), 
                        y,
                        task='binary',
                        num_classes=self.num_classes)
        self.log(f'{type}_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log(f'{type}_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._common_step(batch, "train")
    
    def validation_step(self, batch, batch_idx):
        return self._common_step(batch, "validation")
    
    def test_step(self, batch, batch_idx):
        return self._common_step(batch, "test")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=0.0001)
        return optimizer
    

model = CCNClassifier(w2vmodel)
max_epochs = 1
trainer = pl.Trainer(max_epochs=max_epochs)
trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | emb   | Embedding  | 8.5 M 
1 | convs | ModuleList | 55.0 K
2 | fc    | Linear     | 82    
-------------------------------------
55.1 K    Trainable params
8.5 M     Non-trainable params
8.6 M     Total params
34.418    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [242]:
trainer.test(model, dataloaders=test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.7622038125991821, 'test_accuracy': 0.50020831823349}]

In [33]:
lr = 0.0005
window_sizes = (1,2,3,5)
num_classes = 2
cnn_filters = 10
emb_size = 500
weights = torch.FloatTensor(np.array([w2vmodel.wv.get_vector(key) for key in w2vmodel.wv.index_to_key]))
emb = nn.Embedding.from_pretrained(weights)

conv_list = [nn.Conv2d(1, cnn_filters, (window, emb_size), padding=(window - 1, 0)) for window in window_sizes]
convs = nn.ModuleList(conv_list)
fc = nn.Linear(cnn_filters * len(window_sizes), num_classes)

In [39]:
def create_dataloader(dataset, shuffle=True, batch_size=32, num_workers=1):
    return DataLoader(dataset=dataset,
                        batch_size=batch_size, 
                        shuffle=shuffle, 
                        num_workers=num_workers,
                        pin_memory=False) # Was throwing errors with True

dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_dl = create_dataloader(dataset)

In [40]:
x, y = next(iter(train_dl))
x.shape, y.shape # (torch.Size([1, 22]), torch.Size([1]))

(torch.Size([32, 22]), torch.Size([32]))

(torch.Size([1, 22]), torch.Size([1]))

In [35]:
emb(x).shape

torch.Size([1, 22, 500])

In [None]:
# logits = self(x)
# print("logits shape: ", logits.shape)
# print("x shape", x.shape)
# print("y shape", y.shape)
# print("logits", logits)
# print("y", y)
# print("labels shape: ", y.squeeze().shape)
loss = nn.CrossEntropyLoss()(logits, y.squeeze())
acc = accuracy(torch.argmax(logits).view(-1), 
                y,
                task='binary',
                num_classes=self.num_classes)
self.log(f'{type}_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log(f'{type}_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)

In [None]:
emb(x)

In [27]:
x = self.
x = [F.tanh(conv(x).squeeze(2)) for conv in self.convs]  
x = [F.max_pool1d(i, i.size(1)) for i in x] 
x = torch.cat(x).squeeze(1)
logits = self.fc(x)
return logits

[tensor([[ 156, 6381,    0,   76,  487, 1284, 1627,    2,   78, 3766, 3766, 3766,
          3766, 3766, 3766, 3766, 3766, 3766, 3766, 3766, 3766, 3766]]),
 tensor([0])]

array([[ 0.12618816,  0.12382559,  0.2715929 , ..., -0.0252528 ,
        -0.03258467, -0.0512457 ],
       [ 0.04130232,  0.03092547,  0.03194333, ...,  0.11940952,
        -0.08433206,  0.0188638 ],
       [ 0.2184581 ,  0.11401746,  0.25613508, ..., -0.0413998 ,
        -0.13421682, -0.00919781],
       ...,
       [ 0.00786052,  0.00455297,  0.00892375, ..., -0.00130982,
        -0.00713709,  0.00149295],
       [ 0.01807955,  0.0137573 ,  0.02255729, ...,  0.00122181,
        -0.01906485, -0.0035723 ],
       [ 0.01621657,  0.01486751,  0.02367726, ..., -0.0023468 ,
        -0.01647809, -0.0009782 ]], dtype=float32)