<a href="https://colab.research.google.com/github/IlyaGalyukshev/colab/blob/main/NLP2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

In [None]:
torch.empty(5, 3)

tensor([[1.0254e+35, 3.0634e-41, 0.0000e+00],
        [0.0000e+00, 1.0823e+35, 3.0634e-41],
        [9.3874e+33, 3.0634e-41, 5.8484e-18],
        [4.5671e-41, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00]])

In [None]:
torch.tensor([1, 2, 3])[0].item()

1

In [None]:
if torch.cuda.is_available():
  device = torch.device('cuda')
  x = torch.randint(10, 100, (10, 10), device=device)
  print(x)
  print(x.to('cpu'))

In [None]:
torch.cuda.is_available()

False

In [None]:
x = torch.randn(3, requires_grad=True)
y = x * 4

while y.data.norm() < 1000:
  y *= 2

y

tensor([ 204.6327,  701.1570, -956.9349], grad_fn=<MulBackward0>)

In [None]:
print(x.requires_grad)
print((x**2).requires_grad)

with torch.no_grad():
  print((x**2).requires_grad)

True
True
False


In [None]:
print(x.requires_grad)
y = x.detach()
print(y.requires_grad)

True
False


In [None]:
!pip install torchtext



In [None]:
from torchtext import data
from torch.nn import functional as F
import torch

In [None]:
if torch.cuda.is_available():
  DEVICE = torch.device('cuda')
else:
  DEVICE = torch.device('cpu')

DEVICE

device(type='cpu')

In [None]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
import nltk

nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
import re
import os

In [None]:
POS = 'pos'
NEG = 'neg'

In [None]:
text_sentiments = (POS, NEG)
train_data_list = []
test_data_list = []
examples = []

for sentiment in text_sentiments:
  for filename in os.listdir(os.path.join(nltk.corpus.movie_reviews.root.path, sentiment)):
    with open(os.path.join(nltk.corpus.movie_reviews.root.path, sentiment, filename), 'r', encoding='utf-8') as file:
      examples.append({'text': file.read().strip(), 'sentiment': int(sentiment == POS)})

examples[0]

{'text': "the deer hunter , directed by michael cimino , is truly one of the greatest movies ever made . \na captivating drama about the lives of a group of friends from a small russian-american pennsylvania community , the deer hunter promises to be one of those movies that you will never forget . \nin this academy award winner for best picture of 1978 , outstanding performances are turned in by all actors , and cimino's brilliant directing provides the perfect vision into the character's lives . \nthe first act of the film provides us with an inside look into the lives of a group of men from a small community who work and hang out together . \nmichael , portrayed perfectly by robert de niro , is shown early on as the natural leader of the group . \nafter a days work , the men leave their jobs at a factory to head down to the local bar , where john ( george dzundza ) works . \nthree of the men , michael , steven ( john savage ) , and nick ( christopher walken ) will be leaving shortly

In [None]:
import pandas as pd

In [None]:
examples_df = pd.DataFrame(examples)
examples_df

Unnamed: 0,text,sentiment
0,"the deer hunter , directed by michael cimino ,...",1
1,it may seem weird to begin a film about glam r...,1
2,"quiz show , an almost perfectly accurate true ...",1
3,"if beavis and butthead had a favorite movie , ...",1
4,"after the press screening of "" moulin rouge , ...",1
...,...,...
1995,the second serial-killer thriller of the month...,0
1996,once upon a time jean-claude van damme was a d...,0
1997,what would you do if no one could see you ? \n...,0
1998,when you've run out of old tv shows to turn in...,0


In [None]:
examples_df = examples_df.sample(frac=1)
examples_df

Unnamed: 0,text,sentiment
395,disney cements their place in the forefront of...,1
1133,michael robbins' hardball is quite the cinemat...,0
98,dreamworks pictures presents a jinks/ cohen co...,1
1734,""" the animal "" is a marginally inspired comedy...",0
1463,"tri-star ; rated r ( language , sexual situati...",0
...,...,...
1631,the comet-disaster flick is a disaster alright...,0
1501,"well , here's a distasteful , thoroughly amate...",0
954,synopsis : captain picard and the crew of the ...,1
1016,"susan granger's review of "" two can play that ...",0


In [None]:
train_df = examples_df.sample(frac=0.7)
test_df = examples_df.drop(index=train_df.index)
train_texts, train_labels = train_df['text'].values, train_df['sentiment'].values
test_texts, test_labels = test_df['text'].values, test_df['sentiment'].values

In [60]:
from typing import List, Dict, Any, Iterable
from collections import Counter, OrderedDict
import math
from itertools import islice
import torch.nn.functional as F

In [64]:
class TfIdfVectorizer:

  def __init__(self, lower=True, tokenizer_pattern=r''):
    self.lower = lower
    self.tokenizer_pattern = re.compile(tokenizer_pattern)
    self.vocab_df = OrderedDict()

  def __tokenize(self, text: str) -> List[str]:
    return self.tokenizer_pattern.findall(text.lower() if self.lower else text)

  def fit(self, texts: Iterable[str]):
    term_id = 0
    for doc_idx, doc in enumerate(texts):
      tokenized = self.__tokenize(doc)
      for term in tokenized:
          if term not in self.vocab_df:
            self.vocab_df[term] = {}
            self.vocab_df[term]['doc_ids'] = {doc_idx}
            self.vocab_df[term]['doc_count'] = 1
            self.vocab_df[term]['id'] = term_id
            term_id += 1
          elif doc_idx not in self.vocab_df[term]['doc_ids']:
            self.vocab_df[term]['doc_ids'].add(doc_idx)
            self.vocab_df[term]['doc_count'] += 1
    texts_len = len(texts)
    for term in self.vocab_df:
      self.vocab_df[term]['idf'] = math.log(texts_len / self.vocab_df[term]['doc_count'])


  def transform(self, texts: Iterable[str]) -> torch.sparse.LongTensor:
    values = []
    doc_indices = []
    term_indices = []
    for doc_idx, raw_doc in enumerate(texts):
      term_counter = {}
      for token in self.__tokenize(raw_doc):
        if token in self.vocab_df:
          term = self.vocab_df[token]
          term_idx = term['id']
          term_idf = term['idf']
          if term_idx not in term_counter:
            term_counter[term_idx] = term_idf
          else:
            term_counter[term_idx] += term_idf
      term_indices.extend(term_counter.keys())
      values.extend(term_counter.values())
      doc_indices.extend([doc_idx] * len(term_counter))
    indices = torch.LongTensor([doc_indices, term_indices], device=DEVICE)
    values_tensor = torch.LongTensor(values, device=DEVICE)
    tf_idf = torch.sparse_coo_tensor(indices, values_tensor, torch.Size([len(texts), len(self.vocab_df)]), device=DEVICE)
    return tf_idf
    lower

In [65]:
%%time
vectorizer = TfIdfVectorizer()
vectorizer.fit(train_texts)

CPU times: user 2.5 s, sys: 1.47 ms, total: 2.5 s
Wall time: 3.29 s


In [66]:
%%time
train_data = vectorizer.transform(train_texts)
test_data = vectorizer.transform(test_texts)

CPU times: user 3.47 s, sys: 663 µs, total: 3.47 s
Wall time: 3.5 s


In [68]:
from torch.utils.data import Dataset, DataLoader

In [69]:
train_data_loader = DataLoader(train_texts, batch_size=64)
test_data_loader = DataLoader(test_texts, batch_size=64)

In [81]:
def batch(iterable, n=1):
  l = len(iterable)
  for ndx in range(0, l, n):
    yield iterable[ndx:min(ndx + n, l)]

In [71]:
from torch import nn

class LogisticRegressionModel(nn.Module):

  def __init__(self, input_dim, output_dim):
    super(LogisticRegressionModel, self).__init__()
    self.linear = nn.Linear(input_dim, output_dim)

  def forward(self, x):
    out = F.softmax(self.linear(x))
    return out

In [72]:
model = LogisticRegressionModel(len(vectorizer.vocab_df), 2)

In [73]:
criterion = nn.CrossEntropyLoss()

In [74]:
lr = 0.001
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [78]:
print(model.parameters())
print(len(list(model.parameters())))
print(list(model.parameters())[0])
print(list(model.parameters())[1])

<generator object Module.parameters at 0x7f4f51be6b20>
2
Parameter containing:
tensor([[-0.9420],
        [-0.1962]], requires_grad=True)
Parameter containing:
tensor([-0.4803, -0.2667], requires_grad=True)


In [86]:
num_epochs = 5

iteration = 0

for epoch in range(num_epochs):
  print(f'Epoch: {epoch}')
  for i, (texts, labels) in enumerate(zip(train_data_loader, batch(train_labels, 64))):
    labels = torch.LongTensor(labels)
    texts = F.normalize(vectorizer.transform(texts).to(torch.float).to_dense()).requires_grad_()

    optimizer.zero_grad()
    outputs = model(texts)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    iteration += 1

    if iteration % 50 == 0:
      correct = 0
      total = 0
      for test_texts, test_labels_batch in zip(test_data_loader, batch(test_labels, 64)):
        test_texts = F.normalize(vectorizer.transform(test_texts).to(torch.float).to_dense())
        test_labels_batch = torch.Tensor(test_labels_batch).to(torch.long)
        outputs = model(test_texts)
        _, predicted = torch.max(outputs.data, 1)
        total += test_labels_batch.size(0)
        correct += (predicted == test_labels_batch).sum()
      accuracy = 100 * correct / total
      print(f'Iteration: {iteration}, Loss: {loss.item()}, Accuracy: {accuracy}')

Epoch: 0


  out = F.softmax(self.linear(x))


Epoch: 1
Epoch: 2
Iteration: 50, Loss: 0.6936836242675781, Accuracy: 47.16666793823242
Epoch: 3
Epoch: 4
Iteration: 100, Loss: 0.6926589012145996, Accuracy: 47.16666793823242
