In [35]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd

torch.backends.cudnn.deterministic = True

In [67]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1650'

In [68]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 50000
LEARNING_RATE = 0.005
BATCH_SIZE = 125
NUM_EPOCHS = 15
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 125
HIDDEN_DIM = 250
NUM_CLASSES = 2

In [72]:
torch.device(0)

device(type='cuda', index=0)

In [5]:
#!pip install torchtext==0.9.1

Collecting torchtext==0.9.1
  Downloading torchtext-0.9.1-cp39-cp39-manylinux1_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torch==1.8.1
  Downloading torch-1.8.1-cp39-cp39-manylinux1_x86_64.whl (804.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m804.1/804.1 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.13.1
    Uninstalling torch-1.13.1:
      Successfully uninstalled torch-1.13.1
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.14.1
    Uninstalling torchtext-0.14.1:
      Successfully uninstalled torchtext-0.14.1
Successfully installed torch-1.8.1 torchtext-0.9.1


In [5]:
#!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
### Defining the feature processing

TEXT = torchtext.legacy.data.Field(
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='en_core_web_sm'
)

### Defining the label processing

LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

In [50]:
trainingDF = pd.read_csv("training_corpus.csv")
trainingDF = trainingDF.drop(columns=["Stemmed_Review_Text"])
trainingDF.head()

Unnamed: 0,Review_Text,isPos
0,comment limited generally first season 195960b...,1
1,writer ever happened baby jane hush hush sweet...,1
2,curious know critics responded rousing inspiri...,1
3,agree mr caruso jr lanzas finest voice god off...,1
4,movie fictional soap opera fast funny say anyt...,1


In [51]:
testingDF = pd.read_csv("testing_corpus.csv")
testingDF = testingDF.drop(columns=["Stemmed_Review_Text"])
testingDF.head()

Unnamed: 0,Review_Text,isPos
0,movie excellent save scenes esposito enjoyed b...,1
1,take look faces alongside entrance jail theyre...,1
2,wonderful story seen families story acting pro...,1
3,almost 4 years events 911 asked comes mind day...,1
4,pretty clever wellacted version modern 30s wom...,1


In [52]:
trainingDF.to_csv("pure_training.csv", index=False)
testingDF.to_csv("pure_testing.csv", index=False)

In [53]:
trainingDF = pd.read_csv("pure_testing.csv")
trainingDF.head()

Unnamed: 0,Review_Text,isPos
0,movie excellent save scenes esposito enjoyed b...,1
1,take look faces alongside entrance jail theyre...,1
2,wonderful story seen families story acting pro...,1
3,almost 4 years events 911 asked comes mind day...,1
4,pretty clever wellacted version modern 30s wom...,1


In [54]:
testingDF = pd.read_csv("pure_testing.csv")
testingDF.head()

Unnamed: 0,Review_Text,isPos
0,movie excellent save scenes esposito enjoyed b...,1
1,take look faces alongside entrance jail theyre...,1
2,wonderful story seen families story acting pro...,1
3,almost 4 years events 911 asked comes mind day...,1
4,pretty clever wellacted version modern 30s wom...,1


In [55]:
Fields = [("REVIEW_TEXT", TEXT), ("isPos", LABEL)]

train_dataset = torchtext.legacy.data.TabularDataset(path="pure_training.csv", format="csv", skip_header=True, fields=Fields)
test_dataset = torchtext.legacy.data.TabularDataset(path="pure_testing.csv", format="csv", skip_header=True, fields=Fields)

In [56]:
print(vars(train_dataset.examples[0]))

{'REVIEW_TEXT': ['comment', 'limited', 'generally', 'first', 'season', '195960br', '/>br', '/>this', 'superb', 'series', 'one', 'first', 'televised', 'color', 'highly', 'influential', 'persuading', 'americans', 'buy', 'color', 'television', 'set', '$', '800', '1959', 'equivalent', '$', '3000', 'today', 'many', 'us', 'would', 'pay', 'much', 'privilege', 'watching', 'show', 'transmitted', 'cathode', 'ray', 'picture', 'tube', '17inch', 'screen', 'eleven', 'series', 'began', 'watched', 'beginningbr', '/>br', '/>watching', '50', 'years', 'later', 'several', 'things', 'come', 'mind', 'first', 'many', 'story', 'lines', 'involve', 'comstock', 'lode', 'heyday', 'silver', 'mining', 'dates', '1859', '1859', 'weapons', 'clothes', 'part', 'authentic', 'haircuts', 'left', 'discussion', 'that', 's', 'basically', 'nitpickbr', '/>br', '/>and', 'would', 'impossible', 'ben', 'arrived', 'lake', 'tahoe', 'area', '1839', 'amassed', '100square', 'mile', 'ranch', 'next', 'twenty', 'years', 'pioneers', 'still'

In [57]:
TEXT.build_vocab(train_dataset, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_dataset)

print("Vocab size ", len(TEXT.vocab))
print("Num classes ", len(LABEL.vocab))

Vocab size  50002
Num classes  2


In [74]:
train_loader, test_loader = torchtext.legacy.data.BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    sort_within_batch=False,
    sort_key=lambda x: len(x.REVIEW_TEXT),
    device=torch.device(0)
)

In [75]:
for batch in train_loader:
    print("Matrix Size", batch.REVIEW_TEXT.size())
    print("Vector Size", batch.isPos.size())
    break

for batch in test_loader:
    print("Matrix Size", batch.REVIEW_TEXT.size())
    print("Vector Size", batch.isPos.size())
    break

Matrix Size torch.Size([486, 125])
Vector Size torch.Size([125])
Matrix Size torch.Size([19, 125])
Vector Size torch.Size([125])


In [76]:
class RNN(torch.nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        #self.rnn = torch.nn.RNN(embedding_dim,
        #                        hidden_dim,
        #                        nonlinearity='relu')
        self.rnn = torch.nn.LSTM(embedding_dim,
                                 hidden_dim)        
        
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        

    def forward(self, text):
        # text dim: [sentence length, batch size]
        
        embedded = self.embedding(text)
        # embedded dim: [sentence length, batch size, embedding dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        # output dim: [sentence length, batch size, hidden dim]
        # hidden dim: [1, batch size, hidden dim]

        hidden.squeeze_(0)
        # hidden dim: [batch size, hidden dim]
        
        output = self.fc(hidden)
        return output

In [77]:
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=len(TEXT.vocab),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES # could use 1 for binary classification
)

model = model.to(torch.device(0))
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [78]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for i, (features, targets) in enumerate(data_loader):

            features = features.to(device)
            targets = targets.float().to(device)

            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100

In [80]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        
        text = batch_data.REVIEW_TEXT.to(torch.device(0))
        labels = batch_data.isPos.to(torch.device(0))

        ### FORWARD AND BACK PROP
        logits = model(text)
        loss = F.cross_entropy(logits, labels)
        optimizer.zero_grad()
        
        loss.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, torch.device(0)):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, torch.device(0)):.2f}%')

Epoch: 001/015 | Batch 000/200 | Loss: 0.6910
Epoch: 001/015 | Batch 050/200 | Loss: 0.6928
Epoch: 001/015 | Batch 100/200 | Loss: 0.6917
Epoch: 001/015 | Batch 150/200 | Loss: 0.6920
training accuracy: 50.19%
Time elapsed: 0.52 min
Epoch: 002/015 | Batch 000/200 | Loss: 0.6899
Epoch: 002/015 | Batch 050/200 | Loss: 0.6935
Epoch: 002/015 | Batch 100/200 | Loss: 0.6941
Epoch: 002/015 | Batch 150/200 | Loss: 0.6905
training accuracy: 50.09%
Time elapsed: 1.05 min
Epoch: 003/015 | Batch 000/200 | Loss: 0.7039
Epoch: 003/015 | Batch 050/200 | Loss: 0.6895
Epoch: 003/015 | Batch 100/200 | Loss: 0.6920
Epoch: 003/015 | Batch 150/200 | Loss: 0.6878
training accuracy: 50.22%
Time elapsed: 1.59 min
Epoch: 004/015 | Batch 000/200 | Loss: 0.6922
Epoch: 004/015 | Batch 050/200 | Loss: 0.6914
Epoch: 004/015 | Batch 100/200 | Loss: 0.6873
Epoch: 004/015 | Batch 150/200 | Loss: 0.6982
training accuracy: 50.36%
Time elapsed: 2.25 min
Epoch: 005/015 | Batch 000/200 | Loss: 0.6914
Epoch: 005/015 | Batch