In [14]:
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import logging
import matplotlib.pyplot as plt
import pandas as pd

In [15]:
trainingDF = pd.read_csv("training_corpus.csv")
trainingDF.head()

Unnamed: 0.1,Unnamed: 0,Review_Text,isPos,Stemmed_Review_Text
0,0,comment limited generally first season 1959-60...,1,comment limit gener first season 1959-60<br />...
1,0,writer ever happened baby jane hush hush sweet...,1,writer ever happen babi jane hush hush sweet c...
2,0,curious know critics responded rousing inspiri...,1,curiou know critic respond rous inspir film we...
3,0,agree mr caruso jr lanzas finest voice god off...,1,agre mr caruso jr lanza finest voic god offer ...
4,0,movie fictional soap opera fast funny say anyt...,1,movi fiction soap opera fast funni say anyth e...


In [16]:
testingDF = pd.read_csv("testing_corpus.csv")
testingDF.head()

Unnamed: 0.1,Unnamed: 0,Review_Text,isPos,Stemmed_Review_Text
0,0,movie excellent save scenes esposito enjoyed b...,1,movi excel save scene esposito enjoy brought t...
1,0,take look faces alongside entrance jail they'r...,1,take look face alongsid entranc jail they'r fa...
2,0,wonderful story seen families story acting pro...,1,wonder stori seen famili stori act product val...
3,0,almost 4 years events 911 asked comes mind day...,1,almost 4 year event 911 ask come mind day peop...
4,0,pretty clever well-acted version modern 30s wo...,1,pretti clever well-act version modern 30 woman...


In [17]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [18]:
from collections import Counter

In [19]:
trainingDF.Review_Text

0        comment limited generally first season 1959-60...
1        writer ever happened baby jane hush hush sweet...
2        curious know critics responded rousing inspiri...
3        agree mr caruso jr lanzas finest voice god off...
4        movie fictional soap opera fast funny say anyt...
                               ...                        
24995    loved first season quality went little bit sec...
24996    overall idea escape atlantis intriguing found ...
24997    man movie sucked big time even manage see hole...
24998    encompassing virtual reality potential compute...
24999    1973 remake classic 1944 billy wilder film dou...
Name: Review_Text, Length: 25000, dtype: object

In [20]:
def word_counter(text):
    count = Counter()
    for i in text.values:
        for word in i.split():
            count[word] += 1
    return count

In [21]:
counter = word_counter(trainingDF.Review_Text)

In [22]:
len(counter)

157459

In [23]:
# Skipping validation split
train_text = trainingDF["Review_Text"].values
test_text = testingDF["Review_Text"].values

In [24]:
train_text

array(["comment limited generally first season 1959-60<br /><br />this superb series one first televised color highly influential persuading americans buy color television set $800 1959 equivalent $3000 today many us would pay much privilege watching show transmitted cathode ray picture tube 17-inch screen eleven series began watched beginning<br /><br />watching 50 years later several things come mind first many story lines involve comstock lode heyday silver mining dates 1859 1859 weapons clothes part authentic (the haircuts left discussion) that's basically nitpick<br /><br />and would impossible ben arrived lake tahoe area 1839 amassed 100-square mile ranch next twenty years pioneers still trying solve sierra nevada problem late 1847 gold rush even begin two years later<br /><br />indians played native american actors john ford using native american actors 1920s bonanza producers could easily done thirty years later major nitpick me<br /><br />there time-line problems season 1 mark

In [25]:
train_labels = trainingDF["isPos"].values
test_labels = testingDF["isPos"].values

In [26]:
# NOTE DID NOT USE STEMMED TEXT 
train_encodings = tokenizer(list(train_text), truncation = True, padding = True)
test_encodings = tokenizer(list(test_text), truncation = True, padding = True)

In [9]:
len(tokenizer)

30522

In [27]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [28]:
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

In [13]:
#train_dataset[24000]["input_ids"].size()

In [29]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=25, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=25, shuffle=False)

In [30]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

In [46]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        #print(output.shape)
        hidden.squeeze_(0)
        output = self.fc(hidden)
        return output

In [56]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        # x-> (batch_size, seq, feature)

    def forward(self, x):
        h0 = torch.zeros(x.size(0), self.hidden_size)
        print(x.dtype)
        print(h0.dtype)
        
        out, _ = self.rnn(x, h0)
        #print(out)
        out = out[:, -1, :]
        print(out)
        out = self.fc(out)
        return out

In [57]:
model = RNN(input_size=512, hidden_size=128, num_layers=25, num_classes=2)

In [58]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [59]:
n_total_steps = len(train_loader)
num_epochs = 1
for epoch in range(num_epochs):
    for i, sample in enumerate(train_loader):
        text = sample["input_ids"]
        label = sample["labels"]
        #print(text.shape)
        #print(label.shape)
        
        #break
        outputs = model(text)
        #print(text)
        #print(outputs.dim())
        loss = criterion(outputs, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print("Epoch " + str(((epoch+1)/num_epochs)) + "Step " + str((i+1)/n_total_steps) + ", Loss: " + str(loss.item()))
    #break

torch.int64
torch.float32


RuntimeError: mat1 and mat2 must have the same dtype