In [1]:
import random

import torch
from torchtext import data

SEED = 15618

# Seed every RNG this notebook can reach, not only torch's: Python's global
# `random` state is seeded as well so that any shuffling that draws on it is
# repeatable (NOTE(review): legacy torchtext iterators are believed to shuffle
# via Python's `random` state — confirm for the installed version).
random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

# Review text is tokenized with spaCy; ratings become integer class indices.
TEXT = data.Field(tokenize="spacy")
LABEL = data.LabelField(dtype=torch.long)

In [2]:
import pandas as pd
from pandas import DataFrame
import random

In [3]:
train_file = "Pittsburgh_review.train"
test_file = "Pittsburgh_review.test"


def _load_reviews(path):
    """Read a review CSV once and return its ``text`` and ``rating`` columns as lists.

    The file is parsed a single time; both columns are taken from the same
    DataFrame so texts and ratings stay aligned row by row.
    """
    reviews = pd.read_csv(path)
    return reviews["text"].tolist(), reviews["rating"].tolist()


train_data, train_labels = _load_reviews(train_file)
print(train_data[:1], train_labels[:1])

test_data, test_labels = _load_reviews(test_file)
print(test_data[:1], test_labels[:1])



["Ahh it was good, but great?! I wouldn't go that far. To be fair, I prefer spicy, bold flavours where this is upscale comfort food.Everything we ordered was extremely rich, and the drinks were excellent.If I could give this spot 3.5 stars I would.Wouldn't go back, but is recommend it if your in town."] [3.0]
["They have good, strong but not burnt iced coffee and very nice vegan baked goods! I've been lucky enough to try their vegan chocolate chip cookie, fluffed nutter cookie, and the walnut banana bread but the original choc chip cookie is my favorite! One of the best vegan cookies I've had. Have stopped by a couple times during both lunch and a bit later in the afternoon and they weren't too packed, although seating is limited. I would consider this a more take-to-go type of lunch spot rather than a sit down coffee shop."] [4.0]


In [4]:
TRAIN_CSV = "train_data.csv"
TEST_CSV = "test_data.csv"

# Column-oriented payloads: review text plus its rating, renamed to "label".
TRAIN_DATA = {"text": train_data, "label": train_labels}
TEST_DATA = {"text": test_data, "label": test_labels}

df_train = DataFrame(TRAIN_DATA, columns=["text", "label"])
df_test = DataFrame(TEST_DATA, columns=["text", "label"])

# to_csv is called with its defaults, so the frame index is written as the
# first CSV column; the ("id", None) entry below lines up with that column.
df_train.to_csv(TRAIN_CSV)
df_test.to_csv(TEST_CSV)

# One column spec for both splits: index column, tokenized text, then label.
csv_fields = [("id", None), ("text", TEXT), ("label", LABEL)]

train_dataset = data.TabularDataset(
    path=TRAIN_CSV,
    format="csv",
    fields=csv_fields,
    skip_header=True,
)
test_dataset = data.TabularDataset(
    path=TEST_CSV,
    format="csv",
    fields=csv_fields,
    skip_header=True,
)

# Sanity check: size of each split and its first parsed example.
print("Train\t", "Len:\t", len(train_dataset), vars(train_dataset.examples[0]))
print("Test\t", "Len:\t", len(test_dataset), vars(test_dataset.examples[0]))


Train	 Len:	 60620 {'text': ['Ahh', 'it', 'was', 'good', ',', 'but', 'great', '?', '!', 'I', 'would', "n't", 'go', 'that', 'far', '.', 'To', 'be', 'fair', ',', 'I', 'prefer', 'spicy', ',', 'bold', 'flavours', 'where', 'this', 'is', 'upscale', 'comfort', 'food', '.', 'Everything', 'we', 'ordered', 'was', 'extremely', 'rich', ',', 'and', 'the', 'drinks', 'were', 'excellent', '.', 'If', 'I', 'could', 'give', 'this', 'spot', '3.5', 'stars', 'I', 'would', '.', "Wouldn't", 'go', 'back', ',', 'but', 'is', 'recommend', 'it', 'if', 'your', 'in', 'town', '.'], 'label': '3.0'}
Test	 Len:	 25980 {'text': ['They', 'have', 'good', ',', 'strong', 'but', 'not', 'burnt', 'iced', 'coffee', 'and', 'very', 'nice', 'vegan', 'baked', 'goods', '!', 'I', "'ve", 'been', 'lucky', 'enough', 'to', 'try', 'their', 'vegan', 'chocolate', 'chip', 'cookie', ',', 'fluffed', 'nutter', 'cookie', ',', 'and', 'the', 'walnut', 'banana', 'bread', 'but', 'the', 'original', 'choc', 'chip', 'cookie', 'is', 'my', 'favorite', '!'

In [23]:
MAX_VOCAB_SIZE = 5000

# Vocabularies are built from the training split only.
# NOTE(review): 200-d GloVe vectors are attached to TEXT.vocab here, but the
# cells visible in this notebook build the model's nn.Embedding with its own
# dimension and never copy TEXT.vocab.vectors into it — confirm whether the
# pretrained vectors are meant to be used.
TEXT.build_vocab(
    train_dataset,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.200d",
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_dataset)

text_vocab, label_vocab = TEXT.vocab, LABEL.vocab

# Vocabulary sizes, the most frequent tokens, the first index->token entries,
# and the label->index mapping.
print("TEXT:", len(text_vocab), "LABEL:", len(label_vocab))
print(text_vocab.freqs.most_common(20))
print(text_vocab.itos[:10])
print(label_vocab.stoi)

TEXT: 5002 LABEL: 5
[('.', 497132), ('the', 361237), ('and', 295565), (',', 275940), ('I', 228775), ('a', 226987), ('was', 176786), ('to', 169232), ('of', 127432), ('it', 103924), ('is', 102932), ('for', 95556), ('The', 91197), ('!', 87670), ('in', 83012), ('with', 80289), ('but', 69027), (' ', 65419), ('that', 63509), ('were', 59262)]
['<unk>', '<pad>', '.', 'the', 'and', ',', 'I', 'a', 'was', 'to']
defaultdict(None, {'5.0': 0, '4.0': 1, '3.0': 2, '2.0': 3, '1.0': 4})


In [32]:
BATCH_SIZE = 64
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def _text_length(example):
    """Sort key for bucketing: number of tokens in the example's text."""
    return len(example.text)


# NOTE(review): `valid_iterator` is built over `test_dataset`; there is no
# separate validation split in the cells shown here.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    sort_key=_text_length,
    sort_within_batch=True,
    device=device,
)

In [33]:
import torch.nn as nn

DROP = 0.1


class RNN(nn.Module):
    """Bidirectional LSTM classifier with mean pooling over time.

    Pipeline: token ids -> embedding -> stacked bidirectional LSTM -> mean of
    the per-step outputs over the sequence axis -> dropout -> linear layer
    producing one logit per class.

    Args:
        input_dim: vocabulary size (rows of the embedding table).
        embedding_dim: width of each token embedding.
        hidden_dim: LSTM hidden size per direction; the classifier therefore
            sees ``hidden_dim * 2`` features.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers (default 2, as before).
        dropout: dropout probability used between LSTM layers and before the
            classifier; ``None`` (default) falls back to the module-level
            ``DROP`` at construction time, as before.
        pad_idx: vocabulary index of the padding token. ``None`` (default)
            keeps the previous behaviour: every time step, padding included,
            contributes to the mean. When given, the padding row of the
            embedding is frozen at zero and padded steps are excluded from
            the mean.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 n_layers=2, dropout=None, pad_idx=None):
        super().__init__()

        if dropout is None:
            dropout = DROP
        self.pad_idx = pad_idx

        # Layers are created in the same order as before so that a fixed seed
        # yields the same initial weights on the default path.
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                           dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences.

        ``text`` is fed to an LSTM built without ``batch_first``, so axis 0 is
        the sequence axis and axis 1 the batch axis; the result has one row
        per batch element.
        """
        embedded = self.embedding(text)
        # The final (h_n, c_n) state is not used; only per-step outputs are pooled.
        output, _ = self.rnn(embedded)

        if self.pad_idx is None:
            pooled = output.mean(dim=0)
        else:
            # 1.0 for real tokens, 0.0 for padding; broadcast over features.
            keep = (text != self.pad_idx).unsqueeze(-1).to(output.dtype)
            # clamp guards against a column that is padding only.
            pooled = (output * keep).sum(dim=0) / keep.sum(dim=0).clamp(min=1)

        return self.fc(self.dropout(pooled))

In [34]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 256
HIDDEN_DIM = 256
# One logit per label class. Derived from the label vocabulary (as INPUT_DIM is
# from the text vocabulary) so the two cannot drift apart if the set of
# ratings in the data changes.
OUTPUT_DIM = len(LABEL.vocab)

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [35]:
def count_parameters(model):
    """Return the total element count of the parameters that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report the size of the model built above, with thousands separators.
trainable_total = count_parameters(model)
print(f'The model has {trainable_total:,} trainable parameters')


The model has 3,912,709 trainable parameters


In [36]:
import torch.optim as optim
# Adam over all model parameters, with the optimizer's default hyper-parameters.
# (Plain SGD with lr=1e-2 was tried as an alternative.)
optimizer = optim.Adam(model.parameters())

In [11]:
# Multi-class cross-entropy on raw logits; "mean" is the default reduction,
# spelled out here because the loss is averaged over each batch.
criterion = torch.nn.CrossEntropyLoss(reduction="mean")

In [37]:
# Move both the network and the loss module onto the selected device.
model, criterion = model.to(device), criterion.to(device)

In [38]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Despite the historical name this is multi-class accuracy: the predicted
    class of each row is the index of its largest score along dim 1, and that
    index is compared with the target in ``y``.

    Args:
        preds: 2-D tensor of per-class scores, one row per example.
        y: 1-D tensor of target class indices, one per row of ``preds``.

    Returns:
        A numpy float scalar in [0, 1] (it supports ``.item()``). An empty
        batch yields ``nan``.
    """
    predicted = torch.max(preds, 1)[1]
    # Compare on the tensors' own device, then move only the boolean result
    # to the CPU; the mean of a boolean array is the fraction of matches.
    correct = (predicted == y).detach().cpu().numpy()
    return correct.mean()


In [39]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and return epoch metrics.

    For every batch: clear gradients, run the model on ``batch.text``, compute
    the loss against ``batch.label``, back-propagate and step the optimizer.

    The returned loss and accuracy are averaged per example: each batch's
    value is weighted by its size, so a short final batch no longer counts as
    much as a full one. This assumes ``criterion`` returns the batch mean,
    which is the default reduction of ``CrossEntropyLoss``.

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats; ``(nan, nan)`` when
        the iterator yields no examples.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    seen = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()

        # One label per example, so the leading dimension is the batch size.
        batch_size = batch.label.shape[0]
        loss_sum += loss.item() * batch_size
        acc_sum += acc.item() * batch_size
        seen += batch_size

    if seen == 0:
        return float("nan"), float("nan")
    return loss_sum / seen, acc_sum / seen

In [40]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it and return epoch metrics.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    The returned loss and accuracy are averaged per example: each batch's
    value is weighted by its size, so a short final batch no longer counts as
    much as a full one. This assumes ``criterion`` returns the batch mean,
    which is the default reduction of ``CrossEntropyLoss``.

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats; ``(nan, nan)`` when
        the iterator yields no examples.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    seen = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            # One label per example, so the leading dimension is the batch size.
            batch_size = batch.label.shape[0]
            loss_sum += loss.item() * batch_size
            acc_sum += acc.item() * batch_size
            seen += batch_size

    if seen == 0:
        return float("nan"), float("nan")
    return loss_sum / seen, acc_sum / seen


In [41]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints; any fractional second is truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [None]:
N_EPOCHS = 20

# Lowest validation loss seen so far; it is tracked only, no checkpoint is
# written in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one full train + evaluate cycle.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch: 01 | Epoch Time: 1m 13s
	Train Loss: 0.575 | Train Acc: 76.20%
	 Val. Loss: 0.129 |  Val. Acc: 95.81%
Epoch: 02 | Epoch Time: 1m 14s
	Train Loss: 0.073 | Train Acc: 97.90%
	 Val. Loss: 0.018 |  Val. Acc: 99.66%
Epoch: 03 | Epoch Time: 1m 14s
	Train Loss: 0.024 | Train Acc: 99.37%
	 Val. Loss: 0.012 |  Val. Acc: 99.76%
Epoch: 04 | Epoch Time: 1m 14s
	Train Loss: 0.018 | Train Acc: 99.46%
	 Val. Loss: 0.013 |  Val. Acc: 99.72%
Epoch: 05 | Epoch Time: 1m 14s
	Train Loss: 0.012 | Train Acc: 99.63%
	 Val. Loss: 0.011 |  Val. Acc: 99.70%
Epoch: 06 | Epoch Time: 1m 14s
	Train Loss: 0.004 | Train Acc: 99.88%
	 Val. Loss: 0.007 |  Val. Acc: 99.91%
Epoch: 07 | Epoch Time: 1m 14s
	Train Loss: 0.000 | Train Acc: 100.00%
	 Val. Loss: 0.007 |  Val. Acc: 99.92%
Epoch: 08 | Epoch Time: 1m 14s
	Train Loss: 0.000 | Train Acc: 100.00%
	 Val. Loss: 0.008 |  Val. Acc: 99.92%
