In [1]:
# go into requirements.txt 
# !pip -q install pandas
# !pip -q install nltk
# !pip -q install torchtext
# !pip install -q torch 
# !pip install -q pytorch_lightning
# !pip install -q torchvision
# !pip -q install torch
#!pip -q install sklearn

In [2]:
import pandas as pd
import random 
import zipfile
import numpy as np
import torch 
# with zipfile.ZipFile('./data/ag-news-classification-dataset.zip') as zf:
#     zf.extractall('./data/')

In [3]:
# datasets.py
trainpath = '../data/train.csv'
testpath = '../data/test.csv'
train = pd.read_csv(trainpath)
test = pd.read_csv(testpath)
train.head(5)

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [4]:
# concatenate Title and Description
# clean and tokenize 
from torchtext.data import get_tokenizer
from nltk.corpus import stopwords
import nltk
import re
try:
    stopwords = set(stopwords.words('english'))
except LookupError as e:
    print(f"{e}, {e.__class__}")
    nltk.download('stopwords')
    stopwords = set(stopwords.words('english'))
    

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/P76114511/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def preproc(title, descrip):
    text = title + '.' + descrip
    text = re.sub(r'[~`\-!@#$%^&*():;"{}_/?><\|.,`0-9\\]',' ',text)
    # text = nltk.word_tokenize(text)     # as suggested by senpai 
    text = text.split()
    text = [tok.lower() for tok in text if tok not in stopwords]
    return text
from torchtext.vocab import build_vocab_from_iterator
def build_vocab(datasets):
      for dataset in datasets:
        for rid, data in dataset.iterrows():
            text = data['preproc_text']
            yield text
tokenizer = get_tokenizer("basic_english")
train['preproc_text'] = train.apply(lambda x:preproc(x['Title'], x['Description']), axis=1)
test['preproc_text'] = test.apply(lambda x:preproc(x['Title'], x['Description']), axis=1)
train['label'] = train['Class Index'].apply(lambda x:x-1)
test['label'] = test['Class Index'].apply(lambda x:x-1)
# train['Class Index'].unique() # array([3, 4, 2, 1])

# class IterableTexts:
#   def __iter__(self):
#     for dataset in [trainset, testset]:
#         for data in dataset:
#             text = data['text']
#             yield list(x.lower() for x in tokenizer(text) if x.lower() not in stopwords)

In [7]:
train['label'].value_counts()

2    30000
3    30000
1    30000
0    30000
Name: label, dtype: int64

In [9]:
vocabs = build_vocab_from_iterator(build_vocab([train, test]), 
                                   specials=["[PAD]", "[UNK]"], 
                                   max_tokens =None)
vocabs.set_default_index(vocabs["[UNK]"]) # let padding be index 0 
print(len(vocabs))

69203


## Config

In [10]:
Config = {
    'IS_BIDIRECTIONAL':False,
    'SEED':42,
    'NUM_WORKER':12,
    'BATCHSIZE': 200,
    'LR':5e-4,
    'NUM_LAYERS':2,
    'MAXLEN':50,
    'DATASET':'AGNews', 
    'EPOCHS':20,
    'WEIGHT_DECAY':1e-5,  #default:0
    'DROPOUT':0.3, 
    'CLIP_GRAD':0.5,
    'NUMCHOICE':4,
    'LSTM_HDIM':256, 
    'EMBDIM':32,
    'HIDDIM':32
}
LabelMapping = {0:"World",
                1:"Sports",
                2:"Business",
                3:"Sci/Tech"}

In [11]:
def tokenize(x):
    maxlen = Config['MAXLEN']            # nltk.word_tokenize, spacy (better tokenize)
    x = x[:maxlen]                      # truncation
    tokenized_x = np.zeros(maxlen)      # padding: np.ones since [PAD]: 0
    tokenized_x[:len(x)] = np.array([vocabs[v] for v in x])
    return tokenized_x
trainX = train['preproc_text'].apply(lambda x:tokenize(x))
trainy = train['label']
from collections import Counter
Counter(trainy)

Counter({2: 30000, 3: 30000, 1: 30000, 0: 30000})

In [12]:
trainX = torch.tensor(np.vstack(trainX), dtype=torch.long)
trainy = torch.tensor(trainy, dtype=torch.long)

In [13]:
testX = test['preproc_text'].apply(lambda x:tokenize(x))
testy = test['label']
testX = torch.tensor(np.vstack(testX), dtype=torch.long)
testy = torch.tensor(testy, dtype=torch.long)

In [14]:
print(trainX.shape)
print(trainy.shape)
print(testX.shape)
print(testy.shape)

torch.Size([120000, 50])
torch.Size([120000])
torch.Size([7600, 50])
torch.Size([7600])


In [15]:
# wrap into dataloaders, collator and shuffling 
# Dataloader 
from torch.utils.data import TensorDataset, DataLoader 
trainset = TensorDataset(trainX, trainy)
testset = TensorDataset(testX, testy)
trainloader = DataLoader(trainset, shuffle = True,
                         num_workers = Config['NUM_WORKER'],
                         batch_size=Config['BATCHSIZE'])
testloader = DataLoader(testset, shuffle = False, 
                        num_workers = Config['NUM_WORKER'],
                        batch_size=Config['BATCHSIZE'])

In [16]:
r = random.randint(0, trainX.shape[0])
detokenizer = vocabs.get_itos()
print(f'Vocab Size(including specials): {len(vocabs.get_itos())}')
print(f'random index: {r}')

# “World”: 0,“Sports”: 1,“Business”: 2,“Sci/Tech”: 3

x = trainX[r,:]
print(f'tokenized example:\n {x}')
print(f'label: {LabelMapping[trainy[r].item()]}')
print(f'detokenized example:')
print([detokenizer[tok] for tok in x])

Vocab Size(including specials): 69203
random index: 1972
tokenized example:
 tensor([  992,    19,   992, 16533,  1930,   741,   594,   377,  1704,  1755,
          121,   450,   741,  8596,  6398,  7143, 19106, 10014, 16533,  1192,
         1087,    84,   594,    24,   784,   501, 10904,   992,    19,   449,
          377,    22,     2,   815,   472,  2303,   815,  2883,  2192,   885,
         3473,   725,  1231,  3686,  3161,    28,   106,     4,     0,     0])
label: Sci/Tech
detokenized example:
['hp', 'world', 'hp', 'proliant', 'delays', 'continue', 'august', 'chicago', 'hewlett', 'packard', 'co', 'customers', 'continue', 'difficulties', 'ordering', 'custom', 'configurations', "hp's", 'proliant', 'servers', 'though', 'end', 'august', 'company', 'executives', 'told', 'attendees', 'hp', 'world', 'conference', 'chicago', 'tuesday', 'the', 'problems', 'due', 'continuing', 'problems', 'sap', 'ag', 'order', 'processing', 'supply', 'chain', 'deployment', 'rolled', 'last', 'month', 'said'

In [17]:
Config['VOCAB_SIZE'] = len(vocabs)

## Model Architecture

In [18]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
import pytorch_lightning as pl
import torchmetrics # Accuracy
pl.utilities.seed.seed_everything(Config['SEED'])

# from keras.models import Sequential
# from keras.layers import LSTM, Dense, Embedding, Dropout

# model = Sequential()
# model.add(Embedding(input_dim= Config['VOCAB_SIZE'], 
#                     output_dim= Config['EMBDIM'],  
#                     input_length=Config['MAXLEN'], name = 'embedding'))
# model.add(LSTM(Config['LSTM_HDIM'],name = 'lstm')) 
# model.add(Dense(Config['HIDDIM'], name = 'fc')) # 這裡不應該有softmax
# model.add(Dropout(0.3, name = 'dropout'))
# model.add(Dense(Config['NUMCHOICE'], activation="softmax", name = 'classifier'))
# model.summary()

class LSTMNet(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # variables
        self.num_directions = 2 if Config['IS_BIDIRECTIONAL'] else 1
        self.LossFn = nn.CrossEntropyLoss()
        self.lstm_hdim = Config['LSTM_HDIM']
        
        # layers 
        self.Embedding = nn.Embedding(Config['VOCAB_SIZE'], 
                                      Config['EMBDIM'], 
                                      padding_idx = vocabs['PAD'])
        # https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
        self.Embedding.weight.data.uniform_(-1, 1)

        self.LSTM = nn.LSTM(
            input_size = Config['EMBDIM'],
            hidden_size = Config['LSTM_HDIM'],
            num_layers = Config['NUM_LAYERS'],
            bidirectional = Config['IS_BIDIRECTIONAL'],
            batch_first = True,
        )
        self.FC = nn.Linear(Config['LSTM_HDIM']*self.num_directions, 
                            Config['HIDDIM'])
        self.Dropout = nn.Dropout(Config['DROPOUT'])  
        self.Out  = nn.Linear(Config['HIDDIM'], Config['NUMCHOICE'])
        
        # metrics
        self.accuracy = torchmetrics.Accuracy()
    
    def forward(self, x):
        # https://zhenglungwu.medium.com/pytorch%E5%AF%A6%E4%BD%9Clstm%E5%9F%B7%E8%A1%8C%E8%A8%8A%E8%99%9F%E9%A0%90%E6%B8%AC-d1d3f17549e7
        x = self.Embedding(x)
        x, (h, c) = self.LSTM(x)
        x = self.FC(x)
        x = self.Dropout(x)
        x = self.Out(x)
        x = x[:, -1]
        return x
    
    def training_step(self, batch, batch_idx):
        # training_step defines the train loop. It is independent of forward
        X, labels = batch 
        preds = self(X)
        loss = self.LossFn(preds, labels)
        accuracy = self.accuracy(torch.argmax(preds, axis=1), 
                                 labels)
        self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_acc', accuracy, prog_bar=True, on_step=False, on_epoch=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        # training_step defines the train loop. It is independent of forward
        X, labels = batch 
        preds = self(X)
        loss = self.LossFn(preds, labels)
        accuracy = self.accuracy(torch.argmax(preds, axis=1), labels)
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('val_acc', accuracy, prog_bar=True, on_step=False, on_epoch=True)
        return loss
    
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), 
                                     lr=Config['LR'])
        return optimizer

Global seed set to 42


In [19]:
import torch
from torchmetrics import Accuracy
target = torch.tensor([0, 3])
preds = torch.tensor([[0, 0.5, 0.8, 0.2],
                      [0.2,0.1,0.1,0.3]])
preds = torch.argmax(preds, axis=1)
print(preds)
accuracy = Accuracy()
accuracy(preds, target)

tensor([2, 3])


tensor(0.5000)

In [20]:
# set up Lightning model architecture 
model = LSTMNet()
print(model)

LSTMNet(
  (LossFn): CrossEntropyLoss()
  (Embedding): Embedding(69203, 32, padding_idx=1)
  (LSTM): LSTM(32, 256, num_layers=2, batch_first=True)
  (FC): Linear(in_features=256, out_features=32, bias=True)
  (Dropout): Dropout(p=0.3, inplace=False)
  (Out): Linear(in_features=32, out_features=4, bias=True)
  (accuracy): Accuracy()
)


In [21]:
# Trainer.fit 

pl.utilities.seed.seed_everything(Config['SEED'])
trainer = pl.Trainer(max_epochs = Config['EPOCHS'], 
                     gradient_clip_val=Config['CLIP_GRAD'],
                     accelerator="gpu", 
                     devices=1)
trainer.fit(model, trainloader, testloader)
# ignore_index不要亂設了！整個class 1都學不到！
# debugging credit to 英嘉學長

Global seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name      | Type             | Params
-----------------------------------------------
0 | LossFn    | CrossEntropyLoss | 0     
1 | Embedding | Embedding        | 2.2 M 
2 | LSTM      | LSTM             | 823 K 
3 | FC        | Linear           | 8.2 K 
4 | Dropout   | Dropout          | 0     
5 | Out       | Linear           | 132   
6 | accuracy  | Accuracy         | 0     
-----------------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
12.185    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [22]:
trainer.validate(model, dataloaders=testloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.9014473557472229
        val_loss            0.44274821877479553
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 0.44274821877479553, 'val_acc': 0.9014473557472229}]

## Check

In [24]:
preds = []
golds = []
model.eval()

for X, y in testloader:
    with torch.no_grad():
        y_hat = model(X)
        y_hat = torch.argmax(y_hat, axis = 1)
    golds.extend(y.numpy())
    preds.extend(y_hat.numpy())

### Confusion Matrix

In [25]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(golds, preds)
#???? 完全沒有預測 Sports 
cm

array([[1722,   65,   51,   62],
       [  33, 1848,   11,    8],
       [ 105,   33, 1600,  162],
       [  76,   17,  126, 1681]])

In [26]:
from sklearn.metrics import accuracy_score as SKACC
SKACC(golds, preds)

0.9014473684210527