# Prereq

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import pandas as pd
import random
import nltk

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 4.2 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.5 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 32.9 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 41.3 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 43.2 MB/s 
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (192 kB)
[K     |████

In [3]:
if(torch.cuda.is_available()):
    device = torch.device("cuda")
    print('Using GPU')
    print('GPU count:', torch.cuda.device_count())
    print('GPU device:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')
    device = torch.device("cpu")

Using GPU
GPU count: 1
GPU device: Tesla K80


# Getting the dataset ready

## Download dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("dbpedia_14")

Downloading:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading and preparing dataset d_bpedia14/dbpedia_14 (download: 65.18 MiB, generated: 191.44 MiB, post-processed: Unknown size, total: 256.62 MiB) to /root/.cache/huggingface/datasets/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e...


Downloading:   0%|          | 0.00/68.3M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset d_bpedia14 downloaded and prepared to /root/.cache/huggingface/datasets/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
train_total = dataset['train']
# test_total = dataset['test']

train_data_x = []
train_data_y = []

for i in random.sample(range(train_total.shape[0]), 20000):
    tmp = list(train_total[i].values())
    train_data_x.append(tmp[2])
    train_data_y.append(tmp[0])

In [6]:
df = pd.DataFrame({"sentence": train_data_x, "class": train_data_y})
df.sample(10)

Unnamed: 0,sentence,class
9424,The Groundsman is a BAFTA winning short gradu...,12
17818,010 is the debut album from ulysses. The albu...,11
10627,The Missouri Fur Company (also known as the S...,0
10042,Jean Catherine Coulter (born December 26 1942...,2
14709,Pavaresia University of Vlora is a university...,1
5435,Bulnesia is a genus of flowering plants in th...,10
3410,Antioch University Los Angeles (AULA) is a sm...,1
7657,Al-Bayan Academy or Islamic School of Trenton...,1
9473,The false trevally Lactarius lactarius is spe...,9
7690,Heteropsis bicristata is a butterfly in the N...,9


## Preprocessing dataset

In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

text_embedder = api.load("glove-twitter-25")

# for gensim info
# https://github.com/kavgan/nlp-in-practice/blob/master/pre-trained-embeddings/Pre-trained%20embeddings.ipynb

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
import re
# regex = r"[^a-zA-Z0-9\-:;,.!?() ]+"
regex = r"[^a-zA-Z0-9,. ]+"
max_word_count = 100

def cleanSentence(sen):
    sen = sen.replace(".", ". ")
    sen = sen.replace(",", ", ")
    result = re.sub(regex, " ", sen).lower()
    result = word_tokenize(result)
    tmp = []
    for word in result:
        if(word in text_embedder and len(tmp) < max_word_count):
            tmp.append(text_embedder.vocab[word].index)
    tmp = tmp + [-1] * (max_word_count - len(tmp))
    return tmp

In [9]:
def classVector(label, count):
    tmp = [0] * count
    tmp[label] = 1
    return tmp

In [10]:
print(train_data_x[16737])
print(cleanSentence(train_data_x[16737]))
print(classVector(train_data_y[16737], 14))

 Terminalia rerei is a species of plant in the Combretaceae family. It is endemic to the Solomon Islands. It is threatened by habitat loss.
[32, 11, 22048, 39, 10746, 35, 13, 619, 1, 33, 32, 399090, 16, 13, 58882, 19278, 1, 33, 32, 28931, 152, 43300, 3954, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]


In [11]:
sentences = []
for i in range(len(train_data_x)):
    sentences.append(cleanSentence(train_data_x[i]))

labels = []
labelcount = max(train_data_y) + 1
for i in range(len(train_data_y)):
    labels.append(classVector(train_data_y[i], labelcount))

sentences = torch.tensor(sentences)
labels = torch.tensor(labels)

In [12]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pack_sequence

def collate_fn(data):
    feat, label = zip(*data)
    batch_size = len(feat)
    tok_size = len(feat[0])
    vec_size = 25
    features = []
    for index, fex in enumerate(feat):
        tmp = []
        for wordind in range(tok_size):
            if(fex[wordind] == -1):
                break
            else:
                tmp.append(torch.tensor(text_embedder[text_embedder.index2word[fex[wordind]]]))
        features.append(torch.stack(tmp))
    label = torch.stack(label)
    return pack_sequence(features, enforce_sorted=False), label

batch_size = 32

In [13]:
train_dataset = TensorDataset(sentences, labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Model

In [14]:
class LSTMClassifier(nn.Module):
    def __init__(self, feature_dim, hidden_dim, num_layers, num_classes, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size=feature_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        h_0 = Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(device))
        c_0 = Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(device))
        output, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        h_n = h_n.view(-1, self.hidden_size)
        out = self.fc(h_n)
        out = self.softmax(out)
        return out

# Training



In [15]:
n_epochs = 10
lr = 0.01
feature_dim = 25
hidden_dim = 50
num_layers = 1

lstm = LSTMClassifier(feature_dim=feature_dim, hidden_dim=hidden_dim, num_layers=1, num_classes=labelcount, batch_size=batch_size)
lstm.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)

In [16]:
import time

for epoch in range(n_epochs):
    avg_loss = 0
    st = time.time()
    for step, batch in enumerate(train_dataloader):
        x, y = batch
        x = x.to(device)
        y = y.float().to(device)
        y_pred = lstm(x)
        optimizer.zero_grad()
        loss = criterion(y_pred, y)
        avg_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss /= len(train_dataloader)
    if(epoch % 1 == 0):
      print(f"Epoch {epoch}: loss = {avg_loss:.6f}, time taken = {time.time() - st:.3f}")

Epoch 0: loss = 2.188714, time taken = 31.842
Epoch 1: loss = 1.945928, time taken = 31.551
Epoch 2: loss = 1.880374, time taken = 31.372
Epoch 3: loss = 1.842847, time taken = 31.460
Epoch 4: loss = 1.835754, time taken = 31.573
Epoch 5: loss = 1.823533, time taken = 31.423
Epoch 6: loss = 1.816496, time taken = 31.609
Epoch 7: loss = 1.821305, time taken = 31.698
Epoch 8: loss = 1.815940, time taken = 31.746
Epoch 9: loss = 1.812547, time taken = 31.598


# Testing

In [17]:
test_total = dataset['test']

test_data_x = []
test_data_y = []

for i in random.sample(range(test_total.shape[0]), 1000):
    tmp = list(test_total[i].values())
    test_data_x.append(tmp[2])
    test_data_y.append(tmp[0])

sentences = []
for i in range(len(test_data_x)):
    sentences.append(cleanSentence(test_data_x[i]))

sentences = torch.tensor(sentences)
labels = torch.tensor(test_data_y)

test_dataset = TensorDataset(sentences, labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [18]:
test_pred_labels = []

with torch.no_grad():
    count = 0
    for step, batch in enumerate(test_dataloader):
        x, y = batch
        x = x.to(device)
        y = y.to(device)
        y_pred = lstm(x)
        y_label = torch.argmax(y_pred, dim=1)
        test_pred_labels.append(y_label)
        count += torch.sum(y == y_label).item()

print(f'Accuracy of the LSTM model: {count * 100 / sentences.shape[0]:.2f} %')

Accuracy of the LSTM model: 93.00 %


In [19]:
from sklearn.metrics import classification_report

test_true_labels = test_data_y
test_pred_labels = torch.cat(test_pred_labels).tolist()
target_names = [f"class {i}" for i in range(labelcount)]

print(classification_report(test_true_labels, test_pred_labels, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.96      0.91      0.94        79
     class 1       0.93      0.95      0.94        75
     class 2       0.89      0.91      0.90        70
     class 3       0.99      0.95      0.97        73
     class 4       0.86      1.00      0.92        74
     class 5       0.85      0.96      0.90        74
     class 6       0.97      0.83      0.90        72
     class 7       0.97      0.99      0.98        69
     class 8       1.00      0.97      0.99        69
     class 9       0.80      0.98      0.88        60
    class 10       0.95      0.81      0.88        75
    class 11       0.95      0.98      0.96        54
    class 12       0.99      0.95      0.97        77
    class 13       0.97      0.86      0.91        79

    accuracy                           0.93      1000
   macro avg       0.93      0.93      0.93      1000
weighted avg       0.94      0.93      0.93      1000

