Reference: Yoon Kim, "Convolutional Neural Networks for Sentence Classification" (EMNLP 2014) — https://aclanthology.org/D14-1181.pdf

In [4]:
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torchmetrics.functional import accuracy
import torch.nn.functional as F

from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from gensim.models import Word2Vec

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device available for running: ")
print(device)

Device available for running: 
cpu


Unnamed: 0,label,headline
0,1,A 65-Year-Old Man's Typewriter Was <strong>Des...
1,1,Can You Identify These 5 UNITED STATES Leaders...
2,0,Index of Economic Activity Declined in March\r\n
3,1,2015's Best News Bloopers Are Here And They're...
4,1,18 Pictures Everyone Who Loves Spilling The Te...


In [6]:
# Read the headline train/test splits from disk.
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

# Pull the text column and the label column of each split out as plain lists.
x_train = list(df_train['headline'])
y_train = list(df_train['label'])
x_test = list(df_test['headline'])
y_test = list(df_test['label'])

print(f'Number of Train Headlines: {len(x_train)}')
print(f'Number of Test Headlines: {len(x_test)}')

# Last expression of the cell: preview the first training rows.
df_train[['label','headline']].head()

Number of Train Headlines: 19200
Number of Test Headlines: 4800


Sample Label and Headlines:
1: 27 Breathtaking Alternatives To A Traditional Wedding Bouquet <br>

1: 22 Pictures People Who Aren't Grad Students Will <strong>Never</strong> Understand

0: PepsiCo Profit Falls 43 Percent

0: Website of Bill O'Reilly, FOX News commentator, hacked in retribution

1: The Green Toy Soldiers From Your Childhood Now Come In Baller Yoga Poses A


Output of Sample Headlines without Print Statement:


['27 Breathtaking Alternatives To A Traditional Wedding Bouquet <br>\r\n',
 "22 Pictures People Who Aren't Grad Students Will <strong>Never</strong> Understand\r\n",
 'PepsiCo Profit Falls 43 Percent\r\n',
 "Website of Bill O'Reilly, FOX News commentator, hacked in retribution\r\n",
 'The Green Toy Soldiers From Your Childhood Now Come In Baller Yoga Poses A\r\n']

In [9]:
# Read the WOS article train/test splits from disk.
df_train_wos = pd.read_csv('./data/train_wos.csv')
df_test_wos = pd.read_csv('./data/test_wos.csv')

# Pull the text column and the label column of each split out as plain lists.
x_train_wos = list(df_train_wos['article'])
y_train_wos = list(df_train_wos['label'])
x_test_wos = list(df_test_wos['article'])
y_test_wos = list(df_test_wos['label'])

# Contiguous class id -> domain name (used for display).
wos_label = {0:'CS', 1:'ECE', 2:'Civil', 3:'Medical'}
# Label value stored in the CSV -> contiguous class id (0..3).
# A label missing from this table raises KeyError below.
label_mapping = {0:0, 1:1, 4:2, 5:3}

# Rewrite both label lists in place, train split first and test split second.
for split_labels in (y_train_wos, y_test_wos):
    for i, label in enumerate(split_labels):
        split_labels[i] = label_mapping[label]

print(f'Number of Train Articles: {len(x_train_wos)}')
print(f'Number of Test Articles: {len(x_test_wos)}')

print('\nLabel Key:', wos_label)

# Last expression of the cell: preview the first training rows (labels as stored in the CSV).
df_train_wos[['label','article']].head()

Number of Train Articles: 1600
Number of Test Articles: 400

Label Key: {0: 'CS', 1: 'ECE', 2: 'Civil', 3: 'Medical'}


Unnamed: 0,label,article
0,1,A hardware accelerator is presented to compute...
1,0,An automatized procedure for the parameterizat...
2,0,A review and comparative analyses of methods f...
3,0,A parallel time integration method for nonline...
4,4,A redundant system of collocated geodetic sens...


In [10]:
def preprocess(data):
  """Tokenize each text in `data` with gensim's simple_preprocess.

  Args:
    data: iterable of raw text strings.

  Returns:
    A list with one entry per input text, each entry being whatever
    simple_preprocess(text, deacc=True) returns for that text.
  """
  return [simple_preprocess(document, deacc=True) for document in data]

# Tokenize every split of both corpora with the shared preprocess helper.
preprocessed_x_train, preprocessed_x_test = preprocess(x_train), preprocess(x_test)
preprocessed_x_train_wos, preprocessed_x_test_wos = preprocess(x_train_wos), preprocess(x_test_wos)


# Stemmer instance (not used by any code visible in this part of the notebook).
porter_stemmer = PorterStemmer()

# Word2Vec hyperparameters, handed to make_word2vec_model when the model is trained.
size, window = 500, 3        # embedding dimensionality, context window
min_count, workers = 1, 3    # minimum token frequency, worker threads
sg = 1                       # forwarded as Word2Vec's `sg` argument

# Function to train word2vec model
def make_word2vec_model(data, padding=True, sg=1, min_count=1, vector_size=500, workers=3, window=3):
    """Train a gensim Word2Vec model on tokenized sentences.

    Args:
        data: iterable of token lists (one list per sentence). It is NOT
            modified; the model is trained on a shallow copy.
        padding: when True (default), an extra sentence made only of the
            token 'pad' is added to the training corpus so that 'pad' gets
            a vocabulary entry that callers can use as the padding index.
        sg, min_count, vector_size, workers, window: forwarded unchanged to
            the Word2Vec constructor.

    Returns:
        The Word2Vec model built from the (optionally padded) corpus.
    """
    # Copy first: appending to the caller's list would leave a stray ['pad']
    # "sentence" in their data and add one more on every call.
    sentences = list(data)
    if padding:
        # Repeat the token so it still reaches min_count when min_count > 1;
        # with the default min_count=1 this is exactly ['pad'].
        sentences.append(['pad'] * max(1, min_count))
    return Word2Vec(
        sentences,
        min_count=min_count,
        vector_size=vector_size,
        workers=workers,
        window=window,
        sg=sg,
    )

def make_word2vec_vector(sentence):
    """Convert a tokenized sentence into a fixed-length tensor of vocabulary indices.

    Reads the module-level names `w2vmodel`, `max_sen_len`, `padding_idx`
    and `device`, which must be defined before this function is called.

    Args:
        sentence: iterable of tokens.

    Returns:
        A torch.long tensor of shape (1, max_sen_len) on `device`. Tokens
        found in the Word2Vec vocabulary map to their index; unknown tokens
        map to index 0; positions past the end of the sentence hold
        `padding_idx`. Sentences longer than `max_sen_len` are truncated.
    """
    vocab = w2vmodel.wv.key_to_index
    # Truncate instead of overflowing the fixed-size buffer: a sentence longer
    # than the longest training sentence used to raise IndexError here.
    tokens = list(sentence)[:max_sen_len]
    # NOTE: unknown words share index 0 with whichever vocabulary entry sits
    # at position 0; kept as in the original behavior.
    indices = [vocab.get(word, 0) for word in tokens]
    indices.extend([padding_idx] * (max_sen_len - len(indices)))
    return torch.tensor(indices, dtype=torch.long, device=device).view(1, -1)

def make_target(label):
  """Wrap a single class label in a one-element torch.long tensor on `device`."""
  wrapped = [label]
  return torch.tensor(wrapped, device=device, dtype=torch.long)

In [11]:
# Fit Word2Vec on the tokenized training headlines.
w2vmodel = make_word2vec_model(
    preprocessed_x_train,
    padding=True,
    sg=sg,
    min_count=min_count,
    vector_size=size,
    workers=workers,
    window=window,
)
# Token count of the longest training sentence; make_word2vec_vector pads to this length.
max_sen_len = len(max(preprocessed_x_train, key=len))
# Vocabulary index of the 'pad' token, used to fill unused positions.
padding_idx = w2vmodel.wv.key_to_index['pad']

In [None]:
class CCNClassifier(pl.LightningModule):
    """Convolutional sentence classifier over pretrained Word2Vec embeddings.

    Token indices are embedded, passed through one Conv2d per window size
    (each spanning the full embedding width), tanh-activated, max-pooled over
    the sequence axis, concatenated and fed to a linear layer.

    Args:
        w2vmodel: trained gensim Word2Vec model; its `wv.vectors` matrix
            initializes the embedding layer.
        num_classes: number of output classes.
        window_sizes: heights (in tokens) of the convolution kernels.
        lr: learning rate for the Adam optimizer.
    """

    def __init__(self, w2vmodel, num_classes, window_sizes=(1,2,3,5), lr=1e-3):
        super().__init__()
        NUM_FILTERS = 10      # Number of filters per window size

        # Read later by _common_step (accuracy) and configure_optimizers.
        self.num_classes = num_classes
        self.lr = lr

        # Row i of wv.vectors is the vector of wv.index_to_key[i], so the
        # embedding row index matches the indices in wv.key_to_index.
        weights = torch.tensor(w2vmodel.wv.vectors, dtype=torch.float)
        embedding_size = weights.size(1)
        # from_pretrained keeps the embedding weights frozen by default.
        self.emb = nn.Embedding.from_pretrained(weights)

        # One convolution per window size; padding of (window - 1) along the
        # sequence axis lets a kernel slide over sentences shorter than it.
        self.convs = nn.ModuleList(
            nn.Conv2d(1, NUM_FILTERS, (window, embedding_size), padding=(window - 1, 0))
            for window in window_sizes
        )

        self.fc = nn.Linear(NUM_FILTERS * len(window_sizes), num_classes)

    def _logits(self, x):
        """Return unnormalized class scores of shape (batch, num_classes).

        `x` holds token indices; any leading dimensions are flattened into
        the batch axis, so (seq,), (1, seq), (batch, seq) and (batch, 1, seq)
        inputs are all accepted.
        """
        x = x.reshape(-1, x.size(-1))                    # (batch, seq)
        embedded = self.emb(x).unsqueeze(1)              # (batch, 1, seq, emb)
        pooled = []
        for conv in self.convs:
            feature_map = torch.tanh(conv(embedded)).squeeze(3)   # (batch, filters, positions)
            # Max over every position: one value per filter.
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        return self.fc(torch.cat(pooled, dim=1))

    def forward(self, x):
        """Return class probabilities of shape (batch, num_classes)."""
        return F.softmax(self._logits(x), dim=1)

    def _common_step(self, batch, type):
        """Compute and log loss and accuracy for one batch.

        `type` is the metric-name prefix ("train", "validation" or "test").
        Returns the cross-entropy loss.
        """
        x, y = batch
        # cross_entropy applies log-softmax itself, so it must receive raw
        # logits rather than the probabilities returned by forward().
        logits = self._logits(x)
        # reshape(-1) keeps a 1-D target even when the batch holds one sample.
        targets = y.reshape(-1)
        loss = F.cross_entropy(logits, targets)
        acc = accuracy(torch.argmax(logits, dim=1),
                        targets,
                        task='multiclass',
                        num_classes=self.num_classes)
        self.log(f'{type}_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log(f'{type}_accuracy', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._common_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        return self._common_step(batch, "validation")

    def test_step(self, batch, batch_idx):
        return self._common_step(batch, "test")

    def configure_optimizers(self):
        """Adam over the module's parameters with the configured learning rate."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=0.0001)
        return optimizer