In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [1]:
import torch
from transformers import BertTokenizer
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
trainingDF = pd.read_csv("pure_training.csv")
trainingDF.head()

Unnamed: 0,Review_Text,isPos
0,comment limited generally first season 195960b...,1
1,writer ever happened baby jane hush hush sweet...,1
2,curious know critics responded rousing inspiri...,1
3,agree mr caruso jr lanzas finest voice god off...,1
4,movie fictional soap opera fast funny say anyt...,1


In [4]:
testingDF = pd.read_csv("pure_testing.csv")
testingDF.head()

Unnamed: 0,Review_Text,isPos
0,movie excellent save scenes esposito enjoyed b...,1
1,take look faces alongside entrance jail theyre...,1
2,wonderful story seen families story acting pro...,1
3,almost 4 years events 911 asked comes mind day...,1
4,pretty clever wellacted version modern 30s wom...,1


In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [6]:
# Skipping validation split
train_text = trainingDF["Review_Text"].values
test_text = testingDF["Review_Text"].values

In [7]:
train_labels = trainingDF["isPos"].values
test_labels = testingDF["isPos"].values

In [8]:
# NOTE DID NOT USE STEMMED TEXT 
train_encodings = tokenizer(list(train_text), truncation = True, padding = True)
test_encodings = tokenizer(list(test_text), truncation = True, padding = True)

In [None]:
train_encodings

In [8]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [9]:
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

In [10]:
train_dataset[24000]["input_ids"].size()

torch.Size([512])

In [11]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=25, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=25, shuffle=False)

In [12]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

In [33]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

        #self.input2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        #self.input2output = nn.Linear(input_size + hidden_size, output_size)
        #self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        print(x.size())
        print(x.size(0))
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        print("Hello " + str(x))
        print("nah " + str(h0))
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :]
        out = self.fc(out)
        return out
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

In [34]:
model = RNN(512, 128, 1, 2)

In [35]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [36]:
n_total_steps = len(train_loader)
num_epochs = 1
for epoch in range(num_epochs):
    for i, sample in enumerate(train_loader):
        text = sample["input_ids"]
        label = sample["labels"]
        #print(text.shape)
        
        #break
        outputs = model(text)
        loss = criterion(outputs, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print("Epoch " + str(((epoch+1)/num_epochs)) + "Step " + str((i+1)/n_total_steps) + ", Loss: " + str(loss.item()))

torch.Size([25, 512])
25
Hello tensor([[  101,  2412,  4687,  ...,     0,     0,     0],
        [  101,  3185,  2428,  ...,     0,     0,     0],
        [  101,  3849,  2843,  ...,     0,     0,     0],
        ...,
        [  101,  2172,  5587,  ...,     0,     0,     0],
        [  101,  2093,  2112,  ...,     0,     0,     0],
        [  101, 18783,  2338,  ...,     0,     0,     0]])
nah tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])


RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor