In [1]:
import torch
import torchtext
import os
import collections
# Create the download directory that AG_NEWS is pointed at below. The path
# must match `root`: an absolute '/data' would target the filesystem root
# instead of the notebook's working directory.
os.makedirs('./data', exist_ok=True)
# Load the AG_NEWS splits; the call is unpacked into a (train, test) pair.
train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data')
# Human-readable class names. A later cell subtracts 1 from the dataset
# labels, so presumably label k maps to classes[k - 1] — verify.
# NOTE(review): 'Buisness' is misspelled; the displayed text is left unchanged here.
classes = ['World', 'Sports', 'Buisness', 'Sci/Tech']

ModuleNotFoundError: No module named 'torchtext'

In [None]:
# Show the first record as a list. The dataset is only converted to a list
# in a later cell, so take the first item through the iterator protocol
# (which works for iterables and lists alike) instead of indexing.
list(next(iter(train_dataset)))

In [None]:
# Print the first five samples as "**class** -> text".
# zip(range(5), ...) caps the loop at five items without slicing, so it also
# works while the dataset is still a plain iterable.
for i, x in zip(range(5), train_dataset):
    # x[0] is the label and x[1] the text. Labels are shifted by one to index
    # `classes`, matching the `label - 1` conversion used when batching.
    print(f"**{classes[x[0] - 1]}** -> {x[1]}")

In [None]:
# Reload both splits and materialize each one as a list (train first, then
# test), so they can be indexed and iterated over more than once.
train_dataset, test_dataset = (
    list(split) for split in torchtext.datasets.AG_NEWS(root='./data')
)

In [None]:
# Build the tokenizer callable used by every later vectorization step.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# Quick demonstration: display the tokens produced for a short sentence.
tokenizer('He said: hello')

In [None]:
# Count every token occurrence across the training texts. Each training
# sample is unpacked as (label, text); only the text is tokenized.
counter = collections.Counter(
    token
    for _, line in train_dataset
    for token in tokenizer(line)
)
# Build the vocabulary object from the token counts.
vocab = torchtext.vocab.vocab(counter, min_freq=1)

In [None]:
# Token -> index lookup table exported from the vocabulary.
stoi = vocab.get_stoi()

# Number of entries in the vocabulary, reported for reference.
vocab_size = len(vocab)
print(f"Vocab size if {vocab_size}")

def encode(x):
    """Convert a text string into a list of vocabulary indices.

    The text is split with the module-level `tokenizer` and each token is
    looked up in the module-level `stoi` mapping.

    Tokens that are missing from `stoi` are skipped instead of raising
    KeyError, so text containing words never seen while building the
    vocabulary (e.g. the test split) can still be encoded.

    Args:
        x: input text.

    Returns:
        List of indices, in token order, for the tokens present in `stoi`.
    """
    return [stoi[s] for s in tokenizer(x) if s in stoi]

# Demonstration: display the vocabulary indices for a sample sentence.
encode('I love to play with my words')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy bag-of-words demo with scikit-learn: learn a vocabulary from three
# short sentences, then count those words in a new sentence.
corpus = [
    'I like hot dogs.',
    'The dog ran fast.',
    'Its hot outside.',
]
# Only the fitted vectorizer is needed; the training matrix is not used.
vectorizer = CountVectorizer().fit(corpus)
vectorizer.transform(['My dog likes hot dogs on a hot day']).toarray()


In [None]:
# Recomputed here because it is captured as the default vector length below.
vocab_size = len(vocab)

def to_bow(text, bow_vocab_size=vocab_size):
    """Turn a text into a bag-of-words count vector.

    Args:
        text: input string, converted to vocabulary indices via `encode`.
        bow_vocab_size: length of the output vector; indices at or above
            this value are ignored. The default is the value of
            `vocab_size` at the time this function was defined.

    Returns:
        A float32 tensor of length `bow_vocab_size` where position i holds
        the number of times index i occurred in the encoded text.
    """
    counts = torch.zeros(bow_vocab_size, dtype=torch.float32)
    for token_id in encode(text):
        if token_id >= bow_vocab_size:
            # Index falls outside the requested vector length.
            continue
        counts[token_id] += 1
    return counts

# Show the vector for the text of the first training sample.
print(to_bow(train_dataset[0][1]))

In [None]:
from torch.utils.data import DataLoader
import numpy as np

def bowify(b):
    """Collate a batch of (label, text) samples into tensors.

    Args:
        b: sequence of samples, each indexable as (label, text).

    Returns:
        Tuple (labels, features): a LongTensor holding `label - 1` for each
        sample, and the per-sample `to_bow` vectors stacked along a new
        first dimension (one row per sample).
    """
    return (
        # Shift labels down by one; presumably the raw labels are 1-based and
        # this turns them into 0-based class indices — verify against the data.
        torch.LongTensor([t[0]-1 for t in b]),
        # Vectorize each text individually, then stack the results. (Passing
        # the whole generator to to_bow would vectorize nothing useful.)
        torch.stack([to_bow(t[1]) for t in b])
    )

# Build the train and test loaders with identical settings: batches of 16,
# shuffled, collated into (labels, bag-of-words features) by `bowify`.
train_loader, test_loader = (
    DataLoader(split, batch_size=16, collate_fn=bowify, shuffle=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
# Single linear layer from a vocab_size-long input vector to 4 outputs,
# followed by log-softmax over dim 1.
net = torch.nn.Sequential(
    torch.nn.Linear(in_features=vocab_size, out_features=4),
    torch.nn.LogSoftmax(dim=1),
)

In [None]:
def train_epoch(net, dataloader,lr=0.01,optimizer=None,loss_fn=torch.nn.NLLLoss(),epoch_size=None, report_freq=200):
    """Train `net` for one pass over `dataloader`, printing running accuracy.

    Args:
        net: module called as `net(features)`; put into training mode here.
        dataloader: iterable yielding `(labels, features)` batches.
        lr: learning rate for the default optimizer (unused when `optimizer`
            is supplied).
        optimizer: optional pre-built optimizer. When None, an Adam optimizer
            over `net.parameters()` is created.
        loss_fn: callable `loss_fn(out, labels)` returning a scalar tensor.
        epoch_size: optional sample budget; the loop stops after the first
            batch that pushes the number of processed samples above it.
        report_freq: print the running accuracy every `report_freq` batches.

    Returns:
        Tuple of floats `(sum of batch losses / samples seen,
        correct predictions / samples seen)`, or `(0.0, 0.0)` when no
        samples were processed.
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    net.train()
    total_loss, correct, count = 0.0, 0, 0
    for step, (labels, features) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        out = net(features)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Accumulate plain Python numbers: summing the loss tensors themselves
        # would keep autograd-attached tensors alive across iterations.
        total_loss += loss.item()
        predicted = out.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        count += len(labels)
        if step % report_freq == 0:
            print(f"{count}: acc={correct/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        # Nothing was processed (e.g. empty dataloader); avoid dividing by zero.
        return 0.0, 0.0
    return total_loss / count, correct / count

In [None]:

# Train on the training loader, stopping once more than 15000 samples have
# been processed; the cell displays the returned (loss, accuracy) pair.
train_epoch(net,train_loader,epoch_size=15000)

In [None]:
# Toy n-gram demo: learn unigram and bigram features from three short
# sentences, show the learned vocabulary, then vectorize a new sentence.
corpus = [
    'I like hot dogs.',
    'The dog ran fast.',
    'Its hot outside.',
]
# Only the fitted vectorizer is needed; the training matrix is not used.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=1,
).fit(corpus)
print("Vocabulary:\n",bigram_vectorizer.vocabulary_)
bigram_vectorizer.transform(['My dog likes hot dogs on a hot day.']).toarray()

In [None]:
# Count the n-grams produced by ngrams_iterator(..., ngrams=2) over the
# tokens of every training text. Each sample is unpacked as (label, text).
counter = collections.Counter(
    ngram
    for _, line in train_dataset
    for ngram in torchtext.data.utils.ngrams_iterator(tokenizer(line), ngrams=2)
)

# Build a vocabulary object from the n-gram counts.
bi_vocab = torchtext.vocab.vocab(counter, min_freq=1)

print("Bigram vocabulary length = ",len(bi_vocab))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF demo over unigrams and bigrams, fitted on the `corpus` list defined
# in an earlier cell; only the fitted vectorizer is needed afterwards.
vectorizer = TfidfVectorizer(ngram_range=(1,2)).fit(corpus)
vectorizer.transform(['My dog likes hot dogs on a hot day.']).toarray()