<a href="https://colab.research.google.com/github/IlyaGalyukshev/colab/blob/main/NLP2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

In [2]:
torch.empty(5, 3)

tensor([[ 9.4797e-38,  1.5168e-36,  1.5404e-36],
        [ 1.5169e-36,  6.4694e-39,  3.0545e-27],
        [ 2.8422e-14,  2.5798e+21,  6.9780e+22],
        [ 7.2251e+28,  1.7812e-37, -9.0072e+15],
        [ 3.0881e+29,  4.5840e+30,  4.2039e-45]])

In [3]:
torch.tensor([1, 2, 3])[0].item()

1

In [4]:
if torch.cuda.is_available():
  device = torch.device('cuda')
  x = torch.randint(10, 100, (10, 10), device=device)
  print(x)
  print(x.to('cpu'))

In [5]:
torch.cuda.is_available()

False

In [6]:
x = torch.randn(3, requires_grad=True)
y = x * 4

while y.data.norm() < 1000:
  y *= 2

y

tensor([ 483.6919,  728.2634, -635.9540], grad_fn=<MulBackward0>)

In [7]:
print(x.requires_grad)
print((x**2).requires_grad)

with torch.no_grad():
  print((x**2).requires_grad)

True
True
False


In [8]:
print(x.requires_grad)
y = x.detach()
print(y.requires_grad)

True
False


In [9]:
!pip install torchtext



In [10]:
from torchtext import data
from torch.nn import functional as F
import torch

In [11]:
if torch.cuda.is_available():
  DEVICE = torch.device('cuda')
else:
  DEVICE = torch.device('cpu')

DEVICE

device(type='cpu')

In [12]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [13]:
import nltk

nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [14]:
import re
import os

In [15]:
POS = 'pos'
NEG = 'neg'

'/root/nltk_data/corpora/movie_reviews/neg'

In [16]:
text_sentiments = (POS, NEG)
train_data_list = []
test_data_list = []
examples = []

for sentiment in text_sentiments:
  for filename in os.listdir(os.path.join(nltk.corpus.movie_reviews.root.path, sentiment)):
    with open(os.path.join(nltk.corpus.movie_reviews.root.path, sentiment, filename), 'r', encoding='utf-8') as file:
      examples.append({'text': file.read().strip(), 'sentiment': int(sentiment == POS)})

examples[0]

{'text': '" good will hunting " is two movies in one : an independent take on the struggle of four boston pals and a traditional hollywood , " prodigy child " film complete with upbeats , downfalls , sporadically moving situations and plenty , plenty of shtick . \nunusually directed by gus van sant , " good will hunting " overcomes the banalities of its story by affirming the emergence of fresh , new talent . \nthe film stars matt damon as will hunting as a mathematical , rebellious whiz kid inadvertly discovered by a college professor ( stellan skarsgard ) , who places him under psychological supervision with robin williams . \nin a nutshell , that\'s it . \nthe core of the " good will hunting " is damon , who infuses the script ( co- written by " chasing amy\'s " ben affleck ) with just the right amount of warmth , sensitivity and humanity to accentuate his position as a refreshing multi- talented performer . \nbut it\'s the acting that hits the mark , and damon hits all the right no

In [17]:
import pandas as pd

In [18]:
examples_df = pd.DataFrame(examples)
examples_df

Unnamed: 0,text,sentiment
0,""" good will hunting "" is two movies in one : a...",1
1,""" when it's cold , molecules aren't moving . \...",1
2,"scarface , a remake of the 1932 film of the sa...",1
3,"just in time for halloween and christmas , the...",1
4,"usually when a blockbuster comes out , it's lo...",1
...,...,...
1995,"there's a 1 , 000-foot tidal wave at the end o...",0
1996,i guess that if a very wild bachelor party had...,0
1997,woof ! too bad that leap of faith was the titl...,0
1998,i saw this film on christmas day expecting an ...,0


In [19]:
examples_df = examples_df.sample(frac=1)
examples_df

Unnamed: 0,text,sentiment
798,"lisa cholodenko's "" high art , "" is an intelli...",1
1608,it used to be that not just anyone could becom...,0
461,films adapted from comic books have had plenty...,1
481,"allen , star of many a brian depalma movie in ...",1
280,people who enjoy science fiction are often fac...,1
...,...,...
268,""" a private matter "" is based on the true stor...",1
1378,you would think that this film's dismal failur...,0
66,"ingredients : lost parrot trying to get home ,...",1
699,there's an old saying that states something ab...,1


In [20]:
train_df = examples_df.sample(frac=0.7)
test_df = examples_df.drop(index=train_df.index)
train_texts, train_labels = train_df['text'].values, train_df['sentiment'].values
test_texts, test_labels = test_df['text'].values, test_df['sentiment'].values

In [21]:
from typing import List, Dict, Any, Iterable
from collections import Counter, OrderedDict
import math
from itertools import islice
import torch.nn.functional as F

In [55]:
class TfIdfVectorizer:

  def __init__(self, lower=True, tokenizer_pattern=r'(?i)\b[a-z]{2,}\b'):
    self.lower = lower
    self.tokenizer_pattern = re.compile(tokenizer_pattern)
    self.vocab_df = OrderedDict()

  def __tokenize(self, text: str) -> List[str]:
    return self.tokenizer_pattern.findall(text.lower() if self.lower else text)

  def fit(self, texts: Iterable[str]):
    term_id = 0
    for doc_idx, doc in enumerate(texts):
      tokenized = self.__tokenize(doc)
      for term in tokenized:
          if term not in self.vocab_df:
            self.vocab_df[term] = {}
            self.vocab_df[term]['doc_ids'] = {doc_idx}
            self.vocab_df[term]['doc_count'] = 1
            self.vocab_df[term]['id'] = term_id
            term_id += 1
          elif doc_idx not in self.vocab_df[term]['doc_ids']:
            self.vocab_df[term]['doc_ids'].add(doc_idx)
            self.vocab_df[term]['doc_count'] += 1
    texts_len = len(texts)
    for term in self.vocab_df:
      self.vocab_df[term]['idf'] = math.log(texts_len / self.vocab_df[term]['doc_count'])


  def transform(self, texts: Iterable[str]) -> torch.sparse_coo_tensor:
    values = []
    doc_indices = []
    term_indices = []
    for doc_idx, raw_doc in enumerate(texts):
      term_counter = {}
      for token in self.__tokenize(raw_doc):
        if token in self.vocab_df:
          term = self.vocab_df[token]
          term_idx = term['id']
          term_idf = term['idf']
          if term_idx not in term_counter:
            term_counter[term_idx] = term_idf
          else:
            term_counter[term_idx] += term_idf
      term_indices.extend(term_counter.keys())
      values.extend(term_counter.values())
      doc_indices.extend([doc_idx] * len(term_counter))
    indices = torch.LongTensor([doc_indices, term_indices], device=DEVICE)
    values_tensor = torch.LongTensor(values, device=DEVICE)
    tf_idf = torch.sparse_coo_tensor(indices, values_tensor, torch.Size([len(texts), len(self.vocab_df)]), device=DEVICE)
    return tf_idf

In [54]:
%%time
vectorizer = TfIdfVectorizer()
vectorizer.fit(train_texts)

CPU times: user 2.18 s, sys: 12.4 ms, total: 2.19 s
Wall time: 2.27 s


In [24]:
%%time
train_data = vectorizer.transform(train_texts)
test_data = vectorizer.transform(test_texts)

CPU times: user 6.13 s, sys: 12.4 ms, total: 6.14 s
Wall time: 9.35 s


In [25]:
from torch.utils.data import Dataset, DataLoader

In [26]:
train_data_loader = DataLoader(train_texts, batch_size=64)
test_data_loader = DataLoader(test_texts, batch_size=64)

In [27]:
def batch(iterable, n=1):
  l = len(iterable)
  for ndx in range(0, l, n):
    yield iterable[ndx:min(ndx + n, l)]

In [28]:
from torch import nn

class LogisticRegressionModel(nn.Module):

  def __init__(self, input_dim, output_dim):
    super(LogisticRegressionModel, self).__init__()
    self.linear = nn.Linear(input_dim, output_dim)

  def forward(self, x):
    out = F.softmax(self.linear(x), dim=1)
    return out

In [29]:
model = LogisticRegressionModel(len(vectorizer.vocab_df), 2)

In [30]:
criterion = nn.CrossEntropyLoss()

In [31]:
lr = 0.001
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [32]:
print(model.parameters())
print(len(list(model.parameters())))
print(list(model.parameters())[0])
print(list(model.parameters())[1])

<generator object Module.parameters at 0x7d78302cd7e0>
2
Parameter containing:
tensor([[-0.9420],
        [-0.1962]], requires_grad=True)
Parameter containing:
tensor([-0.4803, -0.2667], requires_grad=True)


In [33]:
num_epochs = 5

iteration = 0

for epoch in range(num_epochs):
  print(f'Epoch: {epoch}')
  for i, (texts, labels) in enumerate(zip(train_data_loader, batch(train_labels, 64))):
    labels = torch.LongTensor(labels)
    texts = F.normalize(vectorizer.transform(texts).to(torch.float).to_dense()).requires_grad_()

    optimizer.zero_grad()
    outputs = model(texts)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    iteration += 1

    if iteration % 50 == 0:
      correct = 0
      total = 0
      for test_texts, test_labels_batch in zip(test_data_loader, batch(test_labels, 64)):
        test_texts = F.normalize(vectorizer.transform(test_texts).to(torch.float).to_dense())
        test_labels_batch = torch.Tensor(test_labels_batch).to(torch.long)
        outputs = model(test_texts)
        _, predicted = torch.max(outputs.data, 1)
        total += test_labels_batch.size(0)
        correct += (predicted == test_labels_batch).sum()
      accuracy = 100 * correct / total
      print(f'Iteration: {iteration}, Loss: {loss.item()}, Accuracy: {accuracy}')

Epoch: 0
Epoch: 1
Epoch: 2
Iteration: 50, Loss: 0.6857889890670776, Accuracy: 48.83333206176758
Epoch: 3
Epoch: 4
Iteration: 100, Loss: 0.6903298497200012, Accuracy: 48.83333206176758
