# CSE 256 Project
## baseline model
### Specific: CNN model or LSTM model

In [None]:
!pip install -U torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch

# Stop right away if PyTorch cannot see a CUDA device.
assert torch.cuda.is_available()

# Collect a short description of the available hardware and keep a handle
# to the CUDA device in `device`.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
device = torch.device("cuda")

print(f"Found device: {device_name}, n_gpu: {n_gpu}")

Found device: Tesla T4, n_gpu: 1


## Stanford Sentiment Treebank: Data Preparation and Processing

In [None]:
# import libraries that will help us preprocess data, map to word embeddings
import torchtext
from torchtext.vocab import Vectors, GloVe

In [None]:
# Record the installed PyTorch version so the run can be reproduced.
print(torch.__version__)

2.0.1+cu118


In [None]:
# Record the installed torchtext version (the install cell pins 0.6.0).
print(torchtext.__version__)

0.6.0


## Naive Bayes Model

In [None]:
# Input side (x): a sequential field holding the tokens of each sentence.
text_bayes = torchtext.data.Field(sequential=True)

# Target side (y): a single, non-sequential sentiment tag per sentence.
label_bayes = torchtext.data.Field(sequential=False)

In [None]:
# Load the SST train / validation / test splits. Examples labelled 'neutral'
# are filtered out, leaving a binary positive-vs-negative task.
train_bayes, val_bayes, test_bayes = torchtext.datasets.SST.splits(
    text_bayes,
    label_bayes,
    filter_pred=lambda example: example.label != 'neutral',
)

In [None]:
# Each example consists of a label and its original (tokenized) words.
# Show the size of the training split and the raw fields of its first example.
print('len(train)', len(train_bayes))
print('vars(train[0])', vars(train_bayes[0]))

len(train) 6920
vars(train[0]) {'text': ['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'Century', "'s", 'new', '``', 'Conan', "''", 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'Arnold', 'Schwarzenegger', ',', 'Jean-Claud', 'Van', 'Damme', 'or', 'Steven', 'Segal', '.'], 'label': 'positive'}


In [None]:
# Assign an integer index to each unique word and each unique label seen in the
# training split (similar in spirit to scikit-learn's CountVectorizer vocabulary).
text_bayes.build_vocab(train_bayes)
label_bayes.build_vocab(train_bayes)
# NOTE(review): the label vocabulary has one more entry than the two sentiment
# classes — presumably a special unknown token at index 0; confirm.
print('len(text_bayes.vocab)', len(text_bayes.vocab))
print('len(label_bayes.vocab)', len(label_bayes.vocab))

len(text_bayes.vocab) 16284
len(label_bayes.vocab) 3


In [None]:
# Produce three batch iterators (train, validation, test) that each yield
# 10 examples per batch.
# The device is passed as an explicit `torch.device`: integer ids such as -1 are
# a deprecated way of asking torchtext for the CPU. Batches must stay on the CPU
# because the Naive Bayes code converts them with `.numpy()`.
train_bayes_iter, val_bayes_iter, test_bayes_iter = torchtext.data.BucketIterator.splits(
    (train_bayes, val_bayes, test_bayes),
    batch_size=10,
    device=torch.device("cpu"),
)



In [None]:
# Build the vocabulary with word embeddings
# The pre-trained vectors are fetched from `url` (a fastText "wiki.simple" file,
# per the URL) and attached to the text vocabulary.
# NOTE(review): the NaiveBayes class defined in this notebook works on token
# indices only and does not read these vectors — presumably they are for other
# models; confirm.
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec'
text_bayes.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

# Sanity check: size of the embedding matrix, then the first 10 dimensions of
# the vector stored for the word 'follows'.
print("Word embeddings size ", text_bayes.vocab.vectors.size())
print("Word embedding of 'follows', first 10 dim ", text_bayes.vocab.vectors[text_bayes.vocab.stoi['follows']][:10])

Word embeddings size  torch.Size([16284, 300])
Word embedding of 'follows', first 10 dim  tensor([ 0.3925, -0.4770,  0.1754, -0.0845,  0.1396,  0.3722, -0.0878, -0.2398,
         0.0367,  0.2800])


In [None]:
# Pull a single batch from the training iterator to inspect its layout.
batch_bayes = next(iter(train_bayes_iter))

In [None]:
# Inspect the sampled batch. The text tensor is laid out as
# [max sentence length, batch size], so column 0 is the FIRST example in the
# batch (the label previously said "Second").
print("Size of text batch [max sent length, batch size]", batch_bayes.text.size())
print("First in batch", batch_bayes.text[:, 0])
print("Converted back to string: ", " ".join([text_bayes.vocab.itos[i] for i in batch_bayes.text[:, 0].data]))

Size of text batch [max sent length, batch size] torch.Size([41, 10])
Second in batch tensor([ 132,  124,   42,   87,    7, 2622,  179,  507,    9,  486,   35,   15,
         446,   15,  350, 1551,   10, 4567,    6, 9766,    2,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1])
Converted back to string:  What should have been a cutting Hollywood satire is instead about as fresh as last week 's issue of Variety . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [None]:
import numpy as np
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from torchtext.vocab import Vectors
from tqdm import tqdm_notebook

class NaiveBayes:
    """Bag-of-words Multinomial Naive Bayes baseline, trained batch by batch.

    Each sentence is turned into a binary vector over the vocabulary (1.0 where
    a token index occurs in the sentence's index row, 0.0 elsewhere) and fed to
    scikit-learn's ``MultinomialNB`` through ``partial_fit``.
    """

    def __init__(self, text):
        """Store the notebook's iterators and the feature-matrix template.

        Args:
            text: Field whose ``vocab`` has already been built;
                ``len(text.vocab)`` fixes the number of feature columns.
        """
        # The iterators are read from notebook-level globals, so the cell that
        # creates them must have been executed before this class is instantiated.
        self.train_iter = train_bayes_iter
        self.test_iter = test_bayes_iter
        self.val_iter = val_bayes_iter
        # Template of shape (batch_size, vocab_size). Only its column count and
        # dtype are used when building feature matrices.
        self.array_like = np.zeros((train_bayes_iter.batch_size, len(text.vocab)))

    def binarize_occurrences(self, indices):
        """Convert rows of token indices into a binary occurrence matrix.

        Args:
            indices: 2-D array-like, one row of vocabulary indices per sentence.

        Returns:
            Array of shape ``(len(indices), vocab_size)`` with 1 wherever an
            index appears in the corresponding row.
        """
        # Size the matrix from the actual number of rows instead of the nominal
        # batch size: a short final batch must not yield phantom all-zero rows.
        n_features = self.array_like.shape[1]
        occurrences = np.zeros((len(indices), n_features), dtype=self.array_like.dtype)
        # NOTE(review): every index in a row is marked, so padding positions in a
        # padded batch count as occurrences too — confirm this is intended.
        for row, token_ids in enumerate(indices):
            occurrences[row, token_ids] = 1
        return occurrences

    def batch_to_input(self, batch, train = True):
        """Turn a batch into model input.

        Args:
            batch: object exposing ``text`` (and ``label`` when ``train`` is
                true) tensors; ``text`` is transposed so that each row is one
                sentence.
            train: when true, also return the labels.

        Returns:
            ``(x, y)`` if ``train`` is true, otherwise just ``x``.
        """
        word_indices = batch.text.data.numpy().T
        x = self.binarize_occurrences(word_indices)
        if train:
            return x, batch.label.data.numpy()
        return x

    def train_mnb(self, train_iter, val_iter, no_epochs):
        """Fit a fresh ``MultinomialNB`` incrementally and validate every epoch.

        Args:
            train_iter: iterable of training batches.
            val_iter: iterable of validation batches, scored after each epoch.
            no_epochs: number of passes over ``train_iter``.
        """
        # `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is the
        # replacement named by tqdm's own deprecation warning.
        from tqdm.notebook import tqdm

        self.model = MultinomialNB(alpha=1.0, fit_prior=True)
        for epoch in tqdm(range(1, no_epochs + 1)):
            for batch in train_iter:
                x, y = self.batch_to_input(batch, train=True)
                # The model is told up front that the label ids are 1 and 2.
                self.model.partial_fit(x, y, classes=[1, 2])

            acc = self.validate(val_iter)
            print('Epoch ', epoch, '| Validation Accuracy: ', acc)
        print('Done training.')

    def _predict_batches(self, data_iter):
        """Run the fitted model over every batch; return (predictions, labels)."""
        preds, trues = [], []
        for batch in data_iter:
            x = self.batch_to_input(batch, train=False)
            preds.extend(self.model.predict(x))
            trues.extend(batch.label.data.numpy())
        return preds, trues

    def test(self, test_iter):
        """Print test accuracy and write one prediction per line to predictions.txt.

        All models should be able to be run with this command.
        """
        preds, trues = self._predict_batches(test_iter)
        correct = sum(1 for p, t in zip(preds, trues) if p == t)
        accuracy = correct / len(trues)
        print('Test Accuracy: ', accuracy)

        with open("predictions.txt", "w") as f:
            for p in preds:
                f.write(str(p) + "\n")

    def validate(self, val_iter):
        """Return the fraction of examples in ``val_iter`` predicted correctly."""
        preds, trues = self._predict_batches(val_iter)
        correct = sum(1 for p, t in zip(preds, trues) if p == t)
        return correct / len(preds)

In [None]:
# Train the Naive Bayes baseline for a single epoch (validation accuracy is
# printed after the epoch), then report test accuracy; the test step also
# writes its predictions to predictions.txt.
bayes_model = NaiveBayes(text=text_bayes)
bayes_model.train_mnb(
    train_iter=train_bayes_iter,
    val_iter=val_bayes_iter,
    no_epochs=1,
)
bayes_model.test(test_iter=test_bayes_iter)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(1, no_epochs+1)):


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch  1 | Validation Accuracy:  0.7981651376146789
Done training.
Test Accuracy:  0.8220757825370676
