# Prereq

In [23]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import pandas as pd
import random
import nltk

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 5.1 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 51.2 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 55.0 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.6 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 66.4 MB/s 
Collecting asynctest==0.13.0
  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.ma

In [3]:
if(torch.cuda.is_available()):
    device = torch.device("cuda")
    print('Using GPU')
    print('GPU count:', torch.cuda.device_count())
    print('GPU device:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')
    device = torch.device("cpu")

Using CPU


# Getting the dataset ready

## Download dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("dbpedia_14")

Downloading:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading and preparing dataset d_bpedia14/dbpedia_14 (download: 65.18 MiB, generated: 191.44 MiB, post-processed: Unknown size, total: 256.62 MiB) to /root/.cache/huggingface/datasets/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e...


Downloading:   0%|          | 0.00/68.3M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset d_bpedia14 downloaded and prepared to /root/.cache/huggingface/datasets/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
train_total = dataset['train']
# test_total = dataset['test']

train_data_x = []
train_data_y = []

for i in random.sample(range(train_total.shape[0]), 20000):
    tmp = list(train_total[i].values())
    train_data_x.append(tmp[2])
    train_data_y.append(tmp[0])

In [6]:
df = pd.DataFrame({"sentence": train_data_x, "class": train_data_y})
df.sample(10)

Unnamed: 0,sentence,class
1410,Rampid Interactive is a game development and ...,0
19456,The endangered flower Delphinium luteum the y...,10
7080,The Oracle and the Mountains is a short story...,13
4003,Deh Now-e Allah Morad (Persian: دهنواله مراد‎...,8
6364,The Black Mamo (Drepanis funerea) is an extin...,9
15098,The Suru River is a headwater of the Boia Mic...,7
11543,George F. Scully Jr. (born February 28 1952) ...,4
5232,The Broadway Winter Hill Congregational Churc...,6
11461,Daniil Ratnikov (born 10 February 1988) is an...,3
9629,The Cleveland School near Clayton North Carol...,6


## Preprocessing dataset

In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

text_embedder = api.load("glove-twitter-25")

# for gensim info
# https://github.com/kavgan/nlp-in-practice/blob/master/pre-trained-embeddings/Pre-trained%20embeddings.ipynb

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
import re
# regex = r"[^a-zA-Z0-9\-:;,.!?() ]+"
regex = r"[^a-zA-Z0-9,. ]+"
max_word_count = 100

def cleanSentence(sen):
    sen = sen.replace(".", ". ")
    sen = sen.replace(",", ", ")
    result = re.sub(regex, " ", sen).lower()
    result = word_tokenize(result)
    tmp = []
    for word in result:
        if(word in text_embedder and len(tmp) < max_word_count):
            tmp.append(text_embedder.vocab[word].index)
    tmp = tmp + [-1] * (max_word_count - len(tmp))
    return tmp

In [9]:
def classVector(label, count):
    tmp = [0] * count
    tmp[label] = 1
    return tmp

In [10]:
print(train_data_x[16737])
print(cleanSentence(train_data_x[16737]))
print(classVector(train_data_y[16737], 14))

 Art Thief Musical! is a 2004 American short musical film by Linus Lau. It stars Autumn Reeser Benjamin Sprunger Matt O'Toole and Sean Smith. It premiered at the Palm Springs International Festival of Short Films in September 2004. Based loosely on Jacques Demy's The Umbrellas of Cherbourg all of the dialogue throughout the entire film is sung except for the last two lines.
[1809, 20667, 5129, 32, 11, 2033, 1568, 5129, 1251, 152, 110586, 7106, 1, 33, 2476, 16352, 15143, 3794, 50, 466722, 26, 2853, 4363, 1, 33, 117642, 66, 13, 14085, 20222, 3959, 3099, 39, 1568, 6866, 35, 5019, 4613, 113873, 46, 51588, 288101, 137, 13, 95329, 39, 427078, 75, 39, 13, 34282, 11740, 13, 3154, 1251, 32, 19331, 3157, 37, 13, 288, 568, 5023, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [11]:
sentences = []
for i in range(len(train_data_x)):
    sentences.append(cleanSentence(train_data_x[i]))

labels = []
labelcount = max(train_data_y) + 1
for i in range(len(train_data_y)):
    labels.append(classVector(train_data_y[i], labelcount))

sentences = torch.tensor(sentences)
labels = torch.tensor(labels)

In [99]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pack_sequence

def collate_fn(data):
    feat, label = zip(*data)
    batch_size = len(feat)
    tok_size = len(feat[0])
    vec_size = 25
    features = []
    for index, fex in enumerate(feat):
        tmp = []
        for wordind in range(tok_size):
            if(fex[wordind] == -1):
                break
            else:
                tmp.append(torch.tensor(text_embedder[text_embedder.index2word[fex[wordind]]]))
        features.append(torch.stack(tmp))
    label = torch.stack(label)
    return pack_sequence(features, enforce_sorted=False), label

batch_size = 32

In [100]:
train_dataset = TensorDataset(sentences, labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Model

In [98]:
class LSTMClassifier(nn.Module):
    def __init__(self, feature_dim, hidden_dim, num_layers, num_classes, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size=feature_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        h_0 = Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size))
        c_0 = Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size))
        output, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        h_n = h_n.view(-1, self.hidden_size)
        out = self.fc(h_n)
        out = self.softmax(out)
        return out

# Training



In [95]:
n_epochs = 10
lr = 0.01
feature_dim = 25
hidden_dim = 50
num_layers = 1

lstm = LSTMClassifier(feature_dim=feature_dim, hidden_dim=hidden_dim, num_layers=1, num_classes=labelcount)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)

In [96]:
for epoch in range(n_epochs):
    avg_loss = 0
    for step, batch in enumerate(train_dataloader):
        x, y = batch
        y_pred = lstm(x)
        optimizer.zero_grad()
        # print(y_pred.shape, y.shape, y_pred.dtype, y.float().dtype)
        loss = criterion(y_pred, y.float())
        avg_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss /= len(train_dataloader)
    if(epoch % 1 == 0):
      print(f"Epoch {epoch}: loss = {avg_loss}")

Epoch 0: loss = 2.257883528327942
Epoch 1: loss = 1.9540866170883178
Epoch 2: loss = 1.8793147497177125
Epoch 3: loss = 1.856153832244873
Epoch 4: loss = 1.8473761486053466
Epoch 5: loss = 1.8295520067214965
Epoch 6: loss = 1.8261387882232667
Epoch 7: loss = 1.818788179206848
Epoch 8: loss = 1.8185432174682616
Epoch 9: loss = 1.8100512769699098


# Testing

In [115]:
test_total = dataset['test']

test_data_x = []
test_data_y = []

for i in random.sample(range(test_total.shape[0]), 1000):
    tmp = list(test_total[i].values())
    test_data_x.append(tmp[2])
    test_data_y.append(tmp[0])

sentences = []
for i in range(len(test_data_x)):
    sentences.append(cleanSentence(test_data_x[i]))

sentences = torch.tensor(sentences)
labels = torch.tensor(test_data_y)

test_dataset = TensorDataset(sentences, labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [124]:
with torch.no_grad():
    count = 0
    for step, batch in enumerate(test_dataloader):
        x, y = batch
        y_pred = lstm(x)
        y_label = torch.argmax(y_pred, dim=1)
        count += torch.sum(y == y_label)
    print(f'Accuracy of the LSTM model: {count * 100 / sentences.shape[0]:.2f} %')

Accuracy of the LSTM model: 92.30 %
