Recreation of https://www.kaggle.com/arutaki/news-classification-lstm-65-accuracy with preprocessing.

Additional sources:


*  https://galhever.medium.com/sentiment-analysis-with-pytorch-part-1-data-preprocessing-a51c80cc15fb
* https://towardsdatascience.com/nlp-in-python-data-cleaning-6313a404a470
* https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794



In [1]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [2]:
%cd /content/drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [3]:
from losses import FocalLoss, reweight
def tokenizer(text):
    return [token.text for token in spacy_en.tokenizer(text)]

import torch.nn as nn
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim).to(device)
        self.lstm_head = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                                 dropout=dropout).to(device)
        self.lstm_desc = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                                 dropout=dropout).to(device)
        self.fc_head = nn.Linear(hidden_dim * 2, 100).to(device)
        self.fc_desc = nn.Linear(hidden_dim * 2, 100).to(device)
        self.fc_total = nn.Linear(200, output_dim).to(device)
        self.dropout = nn.Dropout(dropout).to(device)
    def forward(self, headline, description):
        embedded_head = self.dropout(self.embedding(headline))
        embedded_desc = self.dropout(self.embedding(description))
        output_head, (hidden_head, cell_head) = self.lstm_head(embedded_head)
        output_desc, (hidden_desc, cell_desc) = self.lstm_desc(embedded_desc)
        hidden_head = self.dropout(torch.cat((hidden_head[-2, :, :], hidden_head[-1, :, :]), dim=1))
        hidden_desc = self.dropout(torch.cat((hidden_desc[-2, :, :], hidden_desc[-1, :, :]), dim=1))
        full_head = self.fc_head(hidden_head)
        full_desc = self.fc_desc(hidden_desc)
        hidden_total = torch.cat((full_head, full_desc), 1)
        return self.fc_total(hidden_total)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad) 

def categorical_accuracy(preds, y):
    max_preds = preds.argmax(dim = 1, keepdim = True).to(device)
    correct = max_preds.squeeze(1).eq(y).to(device)
    return correct.sum() / torch.FloatTensor([y.shape[0]]).to(device)

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()  
    for batch in iterator:
        optimizer.zero_grad()           
        predictions = model(batch.headline, batch.desc).squeeze(1)
        loss = criterion(predictions, batch.category)
        acc = categorical_accuracy(predictions, batch.category)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.headline, batch.desc).squeeze(1)
            loss = criterion(predictions, batch.category)
            acc = categorical_accuracy(predictions, batch.category)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)        

**VERSION ONE - INCLUDES REMOVING PUNCTUATION / STOPWORDS AND STEMMING AND LEMMATISATION**

In [4]:
import string
import re
st = string.punctuation
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
stopword = nltk.corpus.stopwords.words('english')
import itertools

def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
      # remove punctuation
      no_punct=[words for words in text if words not in st]
      words_wo_punct=''.join(no_punct)
      #convert string to list
      split=re.split("\W+",words_wo_punct)
      # remove stopwords
      text=[word for word in split if word not in stopword]
      ## Stemming (remove -ing, -ly, ...) ## Lemmatisation (convert the word into root word)
      ps = nltk.stem.porter.PorterStemmer()
      lst_text = [ps.stem(word) for word in text]
      lem = nltk.stem.wordnet.WordNetLemmatizer()
      lst_text = [lem.lemmatize(word) for word in lst_text]
      #append if not None or blank
      if lst_text != '' and len(lst_text)>0 and lst_text is not None:
        cleaned_text.append(lst_text)
    return list(itertools.chain.from_iterable(cleaned_text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
import torch
from torchtext import data
from torchtext.legacy import data
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

TEXT = data.Field(preprocessing=cleanup_text, tokenize = 'spacy', lower = True)
LABEL = data.LabelField()

news = data.TabularDataset(
    path='News_Category_Dataset_v2.json', format='json',
    fields={'headline': ('headline', TEXT),
            'short_description' : ('desc', TEXT),
             'category': ('category', LABEL)})

In [6]:
import random
SEED = 1234

trn, vld, tst = news.split(split_ratio=[0.7, 0.2, 0.1], random_state = random.seed(SEED))

In [7]:
vars(trn[0])

{'category': 'HEALTHY LIVING',
 'desc': ['runner',
  'appreci',
  'spro',
  'trainer',
  'design',
  'mind',
  'way',
  'build',
  'endur',
  'strength',
  'without',
  'pain',
  'come',
  'pound',
  'pavement',
  ''],
 'headline': ['', 'train', 'hard', '', 'land', 'soft', '']}

In [8]:
TEXT.build_vocab(trn,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(trn)

BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (trn, vld, tst),
    batch_size = BATCH_SIZE,
    device = device,
    sort_key= lambda x: len(x.headline),
    sort_within_batch= False
    )

In [9]:
print(len(TEXT.vocab))
print(len(LABEL.vocab))

53826
41


In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
print(f'The model has {count_parameters(model):,} trainable parameters')

pretrained_embeddings = TEXT.vocab.vectors.to(device)
model.embedding.weight.data.copy_(pretrained_embeddings)
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

The model has 10,113,729 trainable parameters


In [11]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'news_classification_model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')    

Epoch: 01 | Epoch Time: 52m 25s
	Train Loss: 1.891 | Train Acc: 49.44%
	 Val. Loss: 1.498 |  Val. Acc: 58.95%


In [12]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.420 | Test Acc: 63.05%
