<a href="https://colab.research.google.com/github/JonathanSum/TorchAudioNotes/blob/main/Representing_text_as_Tensors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
!pip install -r https://raw.githubusercontent.com/MicrosoftDocs/pytorchfundamentals/main/nlp-pytorch/requirements.txt



In [36]:
import torch
import torchtext
import os
import collections
os.makedirs('./data',exist_ok=True)
train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data')
classes = ['World', 'Sports', 'Business', 'Sci/Tech']

In [37]:
next(train_dataset)

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [38]:
for i,x in zip(range(5),train_dataset):
    print(i)
    print(f"**{classes[x[0]]}** -> {x[1]}")



0
**Sci/Tech** -> Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
1
**Sci/Tech** -> Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
2
**Sci/Tech** -> Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.
3
**Sci/Tech** -> Oil prices soar to all-time record, posing new menace to US economy (AFP) AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic m

In [39]:
train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data')
train_dataset = list(train_dataset)
test_dataset = list(test_dataset)

In [40]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
tokenizer('He said: hello')

['he', 'said', 'hello']

In [41]:
tokenizer('Happy Sugar Life (Japanese: ハッピーシュガーライフ, Hepburn: Happī Shugā Raifu) is a Japanese psychological thriller manga series written and illustrated by Tomiyaki Kagisora.')

['happy',
 'sugar',
 'life',
 '(',
 'japanese',
 'ハッピーシュガーライフ',
 ',',
 'hepburn',
 'happī',
 'shugā',
 'raifu',
 ')',
 'is',
 'a',
 'japanese',
 'psychological',
 'thriller',
 'manga',
 'series',
 'written',
 'and',
 'illustrated',
 'by',
 'tomiyaki',
 'kagisora',
 '.']

In [42]:
counter = collections.Counter()
for (label, line) in train_dataset:
    counter.update(tokenizer(line))

In [43]:
torchtext.vocab.Vocab??

In [44]:
vocab = torchtext.vocab.Vocab(counter, min_freq=1)

In [45]:
vocab_size = len(vocab)
print(f"Vocab size is {vocab_size}")

def encode(x):
    return [vocab.stoi[s] for s in tokenizer(x)]

encode('I love to play with my words')

Vocab size is 95812


[283, 2321, 5, 337, 19, 1301, 2357]

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
corpus = [
        'I like hot dogs.',
        'The dog ran fast.',
        'Its hot outside.',
    ]
vectorizer.fit_transform(corpus)
vectorizer.transform(['My dog likes hot dogs on a hot day.']).toarray()

array([[1, 1, 0, 2, 0, 0, 0, 0, 0]])

In [47]:
vectorizer.transform(['like hot. its its ran ran']).toarray()

array([[0, 0, 0, 1, 2, 1, 0, 2, 0]])

In [48]:
len(vocab)

95812

In [49]:
r1 = torch.zeros(vocab_size, dtype=torch.float32)

In [50]:
train_dataset[0]

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [51]:
vocab_size = len(vocab)

def to_bow(text,bow_vocab_size=vocab_size):
    res = torch.zeros(bow_vocab_size,dtype=torch.float32)
    for i in encode(text):
        if i<bow_vocab_size:
            res[i] += 1
    return res

print(to_bow(train_dataset[0][1]))

tensor([0., 0., 2.,  ..., 0., 0., 0.])


In [52]:
vocab_size

95812

In [53]:
from torch.utils.data import DataLoader
import numpy as np 

# this collate function gets list of batch_size tuples, and needs to 
# return a pair of label-feature tensors for the whole minibatch
def bowify(b):
    return (
            torch.LongTensor([t[0]-1 for t in b]),
            torch.stack([to_bow(t[1]) for t in b])
    )

train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=bowify, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=bowify, shuffle=True)

In [54]:
net = torch.nn.Sequential(torch.nn.Linear(vocab_size,4),torch.nn.LogSoftmax(dim=1))

In [55]:
train_dataset[0]


(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [56]:
def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.NLLLoss(),epoch_size=None, report_freq=200):
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    net.train()
    total_loss,acc,count,i = 0,0,0,0
    for labels,features in dataloader:
        optimizer.zero_grad()
        out = net(features)
        loss = loss_fn(out,labels) #cross_entropy(out,labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss
        _,predicted = torch.max(out,1)
        acc+=(predicted==labels).sum()
        count+=len(labels)
        i+=1
        if i%report_freq==0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count>epoch_size:
            break
    return total_loss.item()/count, acc.item()/count

In [57]:
train_epoch(net,train_loader,epoch_size=15000)

3200: acc=0.8125
6400: acc=0.84265625
9600: acc=0.8567708333333334


KeyboardInterrupt: ignored

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
corpus = [
        'I like hot dogs.',
        'The dog ran fast.',
        'Its hot outside.',
    ]
vectorizer.fit_transform(corpus)
print("Vocabulary:\n",vectorizer.vocabulary_)
vectorizer.transform(['My dog likes hot dogs on a hot day.']).toarray()



Vocabulary:
 {'like': 5, 'hot': 3, 'dogs': 1, 'the': 8, 'dog': 0, 'ran': 7, 'fast': 2, 'its': 4, 'outside': 6}


array([[1, 1, 0, 2, 0, 0, 0, 0, 0]])

In [61]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
corpus = [
        'I like hot dogs.',
        'The dog ran fast.',
        'Its hot outside.',
    ]
bigram_vectorizer.fit_transform(corpus)
print("Vocabulary:\n",bigram_vectorizer.vocabulary_)
bigram_vectorizer.transform(['My dog likes hot dogs on a hot day.']).toarray()


Vocabulary:
 {'i': 7, 'like': 11, 'hot': 4, 'dogs': 2, 'i like': 8, 'like hot': 12, 'hot dogs': 5, 'the': 16, 'dog': 0, 'ran': 14, 'fast': 3, 'the dog': 17, 'dog ran': 1, 'ran fast': 15, 'its': 9, 'outside': 13, 'its hot': 10, 'hot outside': 6}


array([[1, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [62]:
counter = collections.Counter()
for (label, line) in train_dataset:
    l = tokenizer(line)
    counter.update(torchtext.data.utils.ngrams_iterator(l,ngrams=2))
    
bi_vocab = torchtext.vocab.Vocab(counter, min_freq=1)

print("Bigram vocabulary length = ",len(bi_vocab))

Bigram vocabulary length =  1308844


In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))
vectorizer.fit_transform(corpus)
vectorizer.transform(['My dog likes hot dogs on a hot day.']).toarray()

array([[0.43381609, 0.        , 0.43381609, 0.        , 0.65985664,
        0.43381609, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

The 0.4381609 is for the day, dog, and like because we appeared once.
But the 0.65985664 is for hot.

In [74]:
vectorizer.transform(['day dog like hot hot']).toarray()

array([[0.43381609, 0.        , 0.        , 0.        , 0.65985664,
        0.        , 0.        , 0.        , 0.        , 0.43381609,
        0.43381609, 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [64]:
print("Vocabulary:\n",vectorizer.vocabulary_)

Vocabulary:
 {'like': 9, 'hot': 4, 'dogs': 2, 'like hot': 10, 'hot dogs': 5, 'the': 14, 'dog': 0, 'ran': 12, 'fast': 3, 'the dog': 15, 'dog ran': 1, 'ran fast': 13, 'its': 7, 'outside': 11, 'its hot': 8, 'hot outside': 6}
