# Prerequisites

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
import nltk

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 4.9 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 53.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.4 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 45.9 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 42.5 MB/s 
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)
[K     |███████

In [None]:
# Select the compute device once: CUDA when torch can see a GPU, CPU otherwise.
# The choice is reported so the cell output records which one is in use.
if not torch.cuda.is_available():
    print('Using CPU')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('Using GPU')
    print('GPU count:', torch.cuda.device_count())
    print('GPU device:', torch.cuda.get_device_name(0))

Using CPU


# Getting the dataset ready

## Download dataset

In [None]:
from datasets import load_dataset

# Fetch the "dbpedia_14" dataset through the Hugging Face `datasets` library.
# The result is indexed by split name in a later cell (dataset['train']).
dataset = load_dataset("dbpedia_14")

Downloading:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading and preparing dataset d_bpedia14/dbpedia_14 (download: 65.18 MiB, generated: 191.44 MiB, post-processed: Unknown size, total: 256.62 MiB) to /root/.cache/huggingface/datasets/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e...


Downloading:   0%|          | 0.00/68.3M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset d_bpedia14 downloaded and prepared to /root/.cache/huggingface/datasets/d_bpedia14/dbpedia_14/2.0.0/7f0577ea0f4397b6b89bfe5c5f2c6b1b420990a1fc5e8538c7ab4ec40e46fa3e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
train_total = dataset['train']
# test_total = dataset['test']

# Work on a fixed-size random subsample of the training split.
# A private, seeded RNG makes the subsample identical on every
# Restart & Run All without touching the global `random` state.
SAMPLE_SIZE = 20000
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

train_data_x = []  # article text ("content" field) of each sampled row
train_data_y = []  # integer class id ("label" field) of each sampled row

for i in sample_rng.sample(range(train_total.shape[0]), SAMPLE_SIZE):
    row = train_total[i]
    # Read fields by name instead of by position in row.values(), so a
    # change in column order cannot silently swap text and label.
    train_data_x.append(row["content"])
    train_data_y.append(row["label"])

In [None]:
# Pair each sampled text with its class id in a two-column frame and show
# ten random rows as a quick visual check of the sample.
df = pd.DataFrame(
    list(zip(train_data_x, train_data_y)),
    columns=["sentence", "class"],
)
df.sample(10)

Unnamed: 0,sentence,class
13554,Belfast Harlequins is a multi-sports club loc...,1
9061,Time Tripping is an album (LP Vinyl) released...,11
17116,Bulbophyllum dagamense is a species of orchid...,10
7442,After the Rain (雨あがる Ame agaru) is a 1999 Jap...,12
8143,Southern Comfort is a 1974 album by jazz-fusi...,11
14760,The Nanny Diaries is a 2002 novel by Emma McL...,13
1772,Herbert Birchby Warburton (September 21 1916 ...,4
10979,The Odd Fellows Building in Red Bluff Califor...,6
7684,Medusa is a Kurt Austin novel of the series N...,13
3292,United Concordia is a dental insurance compan...,0


## Preprocessing dataset

In [None]:
import nltk
# Download NLTK's 'punkt' package (presumably the tokenizer data that
# word_tokenize relies on; a no-op when it is already up to date).
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import gensim.downloader as api
# NOTE(review): Word2Vec is not referenced by any cell visible here.
from gensim.models.word2vec import Word2Vec

# Load the pre-trained "glove-twitter-25" vectors through gensim's downloader.
# Later cells use this object for vocabulary membership tests, word -> index
# lookups and index -> vector lookups.
text_embedder = api.load("glove-twitter-25")

# for gensim info
# https://github.com/kavgan/nlp-in-practice/blob/master/pre-trained-embeddings/Pre-trained%20embeddings.ipynb

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import re
# Earlier, more permissive pattern kept for reference:
# regex = r"[^a-zA-Z0-9\-:;,.!?() ]+"
# Matches any run of characters that are NOT ASCII letters, digits, comma,
# period or space; cleanSentence replaces each such run with a single space.
regex = r"[^a-zA-Z0-9,. ]+"
# Fixed length of every encoded sentence: cleanSentence keeps at most this
# many in-vocabulary tokens and pads shorter sentences up to it.
max_word_count = 100

def cleanSentence(sen):
    """Encode a raw sentence as a fixed-length list of embedding indices.

    The text is normalised (a space is inserted after every period and
    comma, characters matched by ``regex`` become spaces, everything is
    lower-cased), tokenised with ``word_tokenize``, and each token found in
    ``text_embedder`` is replaced by its vocabulary index. Tokens missing
    from the vocabulary are dropped.

    Args:
        sen: the sentence to encode (str).

    Returns:
        A list of exactly ``max_word_count`` ints: the indices of the first
        ``max_word_count`` in-vocabulary tokens, right-padded with -1.
    """
    # gensim >= 4 exposes the word -> index mapping as `key_to_index`;
    # gensim 3.x only has `vocab[word].index`. Support both.
    key_to_index = getattr(text_embedder, "key_to_index", None)

    spaced = sen.replace(".", ". ").replace(",", ", ")
    tokens = word_tokenize(re.sub(regex, " ", spaced).lower())

    indices = []
    for word in tokens:
        if len(indices) >= max_word_count:
            # Output is full; no need to look at the remaining tokens.
            break
        if word not in text_embedder:
            continue
        if key_to_index is not None:
            indices.append(key_to_index[word])
        else:
            indices.append(text_embedder.vocab[word].index)

    # -1 marks padding positions; collate_fn maps them to zero vectors.
    return indices + [-1] * (max_word_count - len(indices))

In [None]:
def classVector(label, count):
    """Return a one-hot list of length ``count`` with a 1 at position ``label``.

    Args:
        label: integer position to set to 1.
        count: length of the returned list.

    Returns:
        A list of ``count`` ints, all 0 except index ``label``.
    """
    one_hot = [0 for _ in range(count)]
    one_hot[label] = 1
    return one_hot

In [None]:
# Spot-check the preprocessing on one arbitrary sampled row: the raw text,
# its padded index encoding, and its one-hot label over 14 classes.
example_idx = 16737
print(train_data_x[example_idx])
print(cleanSentence(train_data_x[example_idx]))
print(classVector(train_data_y[example_idx], 14))

 The Auster J/5 Adventurer is a British-built three-seat light high-wing monoplane of the late 1940s.
[13, 200678, 677, 87185, 32, 11, 4417, 7486, 1590, 4807, 1494, 685, 11864, 39, 13, 969, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Encode every sampled text as a fixed-length row of embedding indices.
sentences = torch.tensor([cleanSentence(text) for text in train_data_x])

# The number of classes is taken from the largest class id in the sample;
# every class id is then expanded to a one-hot row of that width.
labelcount = max(train_data_y) + 1
labels = torch.tensor(
    [classVector(class_id, labelcount) for class_id in train_data_y]
)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def collate_fn(data):
    """Collate (token-index row, label) pairs into dense batch tensors.

    Each token index is replaced by the corresponding row of
    ``text_embedder.vectors``; padding positions (index -1) become
    all-zero vectors.

    Args:
        data: a sequence of ``(indices, label)`` pairs, where ``indices`` is
            a 1-D integer tensor of embedding indices padded with -1 and
            ``label`` is a tensor. All ``indices`` rows must have the same
            length.

    Returns:
        A tuple ``(features, label)``: ``features`` is a float tensor of
        shape ``(batch, tokens, vector_size)`` and ``label`` is the stacked
        label tensors.
    """
    feat, label = zip(*data)
    token_ids = torch.stack(feat)

    # Take the embedding width from the model instead of hard-coding it.
    vec_size = text_embedder.vector_size
    features = torch.zeros((*token_ids.shape, vec_size))

    # Gather all real-token vectors in one indexed read. Padding positions
    # are skipped and keep the zeros they were initialised with.
    is_word = token_ids != -1
    rows = text_embedder.vectors[token_ids[is_word].numpy()]
    features[is_word] = torch.from_numpy(rows).to(features.dtype)

    label = torch.stack(label)
    return features, label

# Wrap the encoded sentences and one-hot labels in a dataset and serve them
# in batches; collate_fn turns each batch's indices into embedding vectors.
batch_size = 32
train_dataset = TensorDataset(sentences, labels)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
)

# Model

# Training



In [None]:
# Smoke test of the input pipeline: pull only the first batch from the
# dataloader, print the shapes of its feature and label tensors, then stop.
for step, batch in enumerate(train_dataloader):
    x, y = batch
    # print(y)
    print(x.shape, y.shape)
    break