# Prereq

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import pandas as pd
import random
import nltk

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 12.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 502 kB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 37.5 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 37.6 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 49.4 MB/s 
Collecting asynctest==0.13.0
  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.

In [3]:
if(torch.cuda.is_available()):
    device = torch.device("cuda")
    print('Using GPU')
    print('GPU count:', torch.cuda.device_count())
    print('GPU device:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')
    device = torch.device("cpu")

Using GPU
GPU count: 1
GPU device: Tesla K80


# Getting the dataset ready

## Download dataset

In [4]:
from datasets import load_dataset

DBPedia Dataset

In [5]:
dataset = load_dataset("dbpedia_14")

train_total = dataset['train']

train_data_x = []
train_data_y = []

for i in random.sample(range(train_total.shape[0]), 20000):
    tmp = list(train_total[i].values())
    train_data_x.append(tmp[2])
    train_data_y.append(tmp[0])

Downloading:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading and preparing dataset d_bpedia14/dbpedia_14 (download: 65.18 MiB, generated: 191.44 MiB, post-processed: Unknown size, total: 256.62 MiB) to /root/.cache/huggingface/datasets/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e...


Downloading:   0%|          | 0.00/68.3M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset d_bpedia14 downloaded and prepared to /root/.cache/huggingface/datasets/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Yelp Review Dataset

In [75]:
dataset = load_dataset("yelp_review_full")

train_data_x = dataset['train']['text'][:40000]
train_data_y = dataset['train']['label'][:40000]

Reusing dataset yelp_review_full (/root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/13c31a618ba62568ec8572a222a283dfc29a6517776a3ac5945fb508877dde43)


  0%|          | 0/2 [00:00<?, ?it/s]

Amazon Polarity

In [95]:
dataset = load_dataset("amazon_polarity")

train_data_x = dataset['train']['content'][:40000]
train_data_y = dataset['train']['label'][:40000]

Downloading:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/860 [00:00<?, ?B/s]

Downloading and preparing dataset amazon_polarity/amazon_polarity (download: 656.45 MiB, generated: 1.66 GiB, post-processed: Unknown size, total: 2.30 GiB) to /root/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/56923eeb72030cb6c4ea30c8a4e1162c26b25973475ac1f44340f0ec0f2936f4...


Downloading:   0%|          | 0.00/688M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset amazon_polarity downloaded and prepared to /root/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/56923eeb72030cb6c4ea30c8a4e1162c26b25973475ac1f44340f0ec0f2936f4. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [99]:
df = pd.DataFrame({"sentence": train_data_x, "class": train_data_y})
df.sample(10)

Unnamed: 0,sentence,class
22769,"First off, I'm not a history buff. As much as ...",1
16928,This does not fit my Palm TX. I even turned th...,0
27751,"One of my favorite movies!!Unfortunately, the ...",0
5425,I wanted a reliable second battery for my came...,1
6645,The book was in great condition and it is a gr...,1
32242,I purchased this tele converter lens as an add...,0
4970,"Over the winter, i seem to gain alot of weight...",1
5433,I purchased this battery as a backup for a vac...,1
4875,Just wanted to mention a few things the other ...,1
2786,This book did give the reader some very intere...,1


## Preprocessing dataset

In [100]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

text_embedder = api.load("glove-twitter-25")

# for gensim info
# https://github.com/kavgan/nlp-in-practice/blob/master/pre-trained-embeddings/Pre-trained%20embeddings.ipynb

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [101]:
import re
# regex = r"[^a-zA-Z0-9\-:;,.!?() ]+"
regex = r"[^a-zA-Z0-9,. ]+"
max_word_count = 100

def cleanSentence(sen):
    sen = sen.replace(".", ". ")
    sen = sen.replace(",", ", ")
    result = re.sub(regex, " ", sen).lower()
    result = word_tokenize(result)
    tmp = []
    for word in result:
        if(word in text_embedder and len(tmp) < max_word_count):
            tmp.append(text_embedder.vocab[word].index)
    tmp = tmp + [-1] * (max_word_count - len(tmp))
    return tmp

In [102]:
def classVector(label, count):
    tmp = [0] * count
    tmp[label] = 1
    return tmp

In [103]:
sentences = []
for i in range(len(train_data_x)):
    sentences.append(cleanSentence(train_data_x[i]))

labels = []
labelcount = max(train_data_y) + 1
for i in range(len(train_data_y)):
    labels.append(classVector(train_data_y[i], labelcount))

sentences = torch.tensor(sentences)
labels = torch.tensor(labels)

In [104]:
print(train_data_x[16737])
print(cleanSentence(train_data_x[16737]))
print(classVector(train_data_y[16737], labelcount))

The Which of Blackbird Pond was a very boring book. I almost fell asleep reading it! The plot was stupid. The charters were lame. This book was all around terribley written.
[13, 965, 39, 118948, 24006, 93, 11, 520, 1861, 1203, 1, 10, 1145, 2456, 1712, 1742, 33, 13, 9908, 93, 861, 1, 13, 226910, 377, 3213, 1, 53, 1203, 93, 75, 576, 6209, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[1, 0]


In [105]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pack_sequence

def collate_fn(data):
    feat, label = zip(*data)
    batch_size = len(feat)
    tok_size = len(feat[0])
    vec_size = 25
    features = []
    labels = []
    for index, fex in enumerate(feat):
        tmp = []
        for wordind in range(tok_size):
            if(fex[wordind] == -1):
                break
            else:
                tmp.append(torch.tensor(text_embedder[text_embedder.index2word[fex[wordind]]]))
        if(len(tmp) > 0):
            features.append(torch.stack(tmp))
            labels.append(label[index])
    labels = torch.stack(labels)
    return pack_sequence(features, enforce_sorted=False), labels

batch_size = 32

In [106]:
train_dataset = TensorDataset(sentences, labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Model

In [107]:
class LSTMClassifier(nn.Module):
    def __init__(self, feature_dim, hidden_dim, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size=feature_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        h_0 = Variable(torch.zeros(self.num_layers, x.batch_sizes.shape[0], self.hidden_size).to(device))
        c_0 = Variable(torch.zeros(self.num_layers, x.batch_sizes.shape[0], self.hidden_size).to(device))
        output, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        h_n = h_n.view(-1, self.hidden_size)
        out = self.fc(h_n)
        out = self.softmax(out)
        return out

# Training



In [108]:
n_epochs = 10
lr = 0.01
feature_dim = 25
hidden_dim = 50
num_layers = 1

lstm = LSTMClassifier(feature_dim=feature_dim, hidden_dim=hidden_dim, num_layers=1, num_classes=labelcount)
lstm.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)

In [109]:
import time

for epoch in range(n_epochs):
    avg_loss = 0
    st = time.time()
    for step, batch in enumerate(train_dataloader):
        x, y = batch
        x = x.to(device)
        y = y.float().to(device)
        y_pred = lstm(x)
        optimizer.zero_grad()
        loss = criterion(y_pred, y)
        avg_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss /= len(train_dataloader)
    if(epoch % 1 == 0):
      print(f"Epoch {epoch}: loss = {avg_loss:.6f}, time taken = {time.time() - st:.3f}")

Epoch 0: loss = 0.598962, time taken = 91.805
Epoch 1: loss = 0.533107, time taken = 91.571
Epoch 2: loss = 0.518637, time taken = 91.571
Epoch 3: loss = 0.506581, time taken = 92.178
Epoch 4: loss = 0.499518, time taken = 92.250
Epoch 5: loss = 0.497359, time taken = 92.160
Epoch 6: loss = 0.495672, time taken = 91.604
Epoch 7: loss = 0.488276, time taken = 91.111
Epoch 8: loss = 0.488579, time taken = 92.045
Epoch 9: loss = 0.484378, time taken = 91.608


# Testing

DBPedia Dataset

In [None]:
test_total = dataset['test']

test_data_x = []
test_data_y = []

for i in random.sample(range(test_total.shape[0]), 4000):
    tmp = list(test_total[i].values())
    test_data_x.append(tmp[2])
    test_data_y.append(tmp[0])

Yelp Review Dataset

In [87]:
test_data_x = dataset['test']['text'][:8000]
test_data_y = dataset['test']['label'][:8000]

Amazon Polarity

In [110]:
test_data_x = dataset['test']['content'][:8000]
test_data_y = dataset['test']['label'][:8000]

In [111]:
sentences = []
for i in range(len(test_data_x)):
    sentences.append(cleanSentence(test_data_x[i]))

sentences = torch.tensor(sentences)
labels = torch.tensor(test_data_y)

def collate_fn(data):
    feat, label = zip(*data)
    batch_size = len(feat)
    tok_size = len(feat[0])
    vec_size = 25
    features = []
    for index, fex in enumerate(feat):
        tmp = []
        for wordind in range(tok_size):
            if(fex[wordind] == -1):
                break
            else:
                tmp.append(torch.tensor(text_embedder[text_embedder.index2word[fex[wordind]]]))
        if(len(tmp) == 0):
            tmp.append(torch.tensor(text_embedder[text_embedder.index2word[0]]))
        features.append(torch.stack(tmp))
    labels = torch.stack(label)
    return pack_sequence(features, enforce_sorted=False), labels

batch_size = 32

test_dataset = TensorDataset(sentences, labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [112]:
test_pred_labels = []

with torch.no_grad():
    count = 0
    for step, batch in enumerate(test_dataloader):
        x, y = batch
        x = x.to(device)
        y = y.to(device)
        y_pred = lstm(x)
        y_label = torch.argmax(y_pred, dim=1)
        test_pred_labels.append(y_label)
        count += torch.sum(y == y_label).item()

print(f'Accuracy of the LSTM model: {count * 100 / sentences.shape[0]:.2f} %')

Accuracy of the LSTM model: 80.11 %


In [113]:
from sklearn.metrics import classification_report

test_true_labels = test_data_y
test_pred_labels = torch.cat(test_pred_labels).tolist()
target_names = [f"class {i}" for i in range(labelcount)]

print(classification_report(test_true_labels, test_pred_labels, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.79      0.81      0.80      3920
     class 1       0.81      0.79      0.80      4080

    accuracy                           0.80      8000
   macro avg       0.80      0.80      0.80      8000
weighted avg       0.80      0.80      0.80      8000



In [114]:
from google.colab import drive
drive.mount('/content/gdrive')

with open('/content/gdrive/My Drive/report.txt', 'w') as f:
    f.write(classification_report(test_true_labels, test_pred_labels, target_names=target_names))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Demo for DBPedia14

In [None]:
def getClass(sentence):
    # import re
    # regex = r"[^a-zA-Z0-9,. ]+"
    sentence = sentence.replace(".", ". ")
    sentence = sentence.replace(",", ", ")
    result = re.sub(regex, " ", sentence).lower()

    # import nltk
    # nltk.download('punkt')
    # from nltk.tokenize import word_tokenize
    result = word_tokenize(result)

    # import torch
    # from torch.nn.utils.rnn import pack_sequence

    tmp = []
    for word in result:
        if(word in text_embedder):
            tmp.append(torch.tensor(text_embedder[word]))
    features = pack_sequence([torch.stack(tmp)], enforce_sorted=False)

    with torch.no_grad():
        x = features.to(device)
        y_pred = lstm(x)
        label = torch.argmax(y_pred, dim=1).item()
    
    names = ["Company", "Educational Institution", "Artist", "Athlete", "Office Holder", "Means Of Transportation", "Building", "Natural Place", "Village", "Animal", "Plant", "Album", "Film", "Written Work"]
    return names[label]

In [None]:
sentence = "Lionel Messi is an amazing football player"
label = getClass(sentence)
print("The sentence is about " + ("an" if label[0] in ['A', 'E', 'I', 'O', 'U'] else "a") + " " + label)

The sentence is about an Athlete
