In [1]:
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import logging
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
trainingDF = pd.read_csv("training_corpus.csv")
trainingDF.head()

Unnamed: 0.1,Unnamed: 0,Review_Text,isPos,Stemmed_Review_Text
0,0,comment limited generally first season 1959-60...,1,comment limit gener first season 1959-60<br />...
1,0,writer ever happened baby jane hush hush sweet...,1,writer ever happen babi jane hush hush sweet c...
2,0,curious know critics responded rousing inspiri...,1,curiou know critic respond rous inspir film we...
3,0,agree mr caruso jr lanzas finest voice god off...,1,agre mr caruso jr lanza finest voic god offer ...
4,0,movie fictional soap opera fast funny say anyt...,1,movi fiction soap opera fast funni say anyth e...


In [3]:
testingDF = pd.read_csv("testing_corpus.csv")
testingDF.head()

Unnamed: 0.1,Unnamed: 0,Review_Text,isPos,Stemmed_Review_Text
0,0,movie excellent save scenes esposito enjoyed b...,1,movi excel save scene esposito enjoy brought t...
1,0,take look faces alongside entrance jail they'r...,1,take look face alongsid entranc jail they'r fa...
2,0,wonderful story seen families story acting pro...,1,wonder stori seen famili stori act product val...
3,0,almost 4 years events 911 asked comes mind day...,1,almost 4 year event 911 ask come mind day peop...
4,0,pretty clever well-acted version modern 30s wo...,1,pretti clever well-act version modern 30 woman...


In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [5]:
# Skipping validation split
train_text = trainingDF["Review_Text"].values
test_text = testingDF["Review_Text"].values

In [6]:
train_labels = trainingDF["isPos"].values
test_labels = testingDF["isPos"].values

In [7]:
# NOTE DID NOT USE STEMMED TEXT 
train_encodings = tokenizer(list(train_text), truncation = True, padding = True)
test_encodings = tokenizer(list(test_text), truncation = True, padding = True)

In [10]:
len(tokenizer)

30522

In [11]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [12]:
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

In [13]:
#train_dataset[24000]["input_ids"].size()

In [14]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=25, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=25, shuffle=False)

In [15]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

In [46]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        #print(output.shape)
        hidden.squeeze_(0)
        output = self.fc(hidden)
        return output

In [47]:
model = RNN(input_dim=30522, embedding_dim=128, hidden_dim=256, output_dim=2)

In [48]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [49]:
n_total_steps = len(train_loader)
num_epochs = 1
for epoch in range(num_epochs):
    for i, sample in enumerate(train_loader):
        text = sample["input_ids"]
        label = sample["labels"]
        #print(text.shape)
        
        #break
        outputs = model(text)
        #print(text)
        #print(outputs.dim())
        loss = criterion(outputs, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print("Epoch " + str(((epoch+1)/num_epochs)) + "Step " + str((i+1)/n_total_steps) + ", Loss: " + str(loss.item()))

ValueError: Using a target size (torch.Size([25])) that is different to the input size (torch.Size([25, 2])) is deprecated. Please ensure they have the same size.