In [39]:
import torch
from torchtext import data

SEED = 15618

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = "spacy")
LABEL = data.LabelField(dtype = torch.long)

In [40]:
import pandas as pd
from pandas import DataFrame
import random

In [41]:
train_file = "Pittsburgh_review.train"
test_file = "Pittsburgh_review.test"

train_data, train_labels = pd.read_csv(train_file)["text"].tolist(), pd.read_csv(train_file)["rating"].tolist()
print(train_data[:1], train_labels[:1])

test_data, test_labels = pd.read_csv(test_file)["text"].tolist(), pd.read_csv(test_file)["rating"].tolist()
print(test_data[:1], test_labels[:1])



["It's almost amazing that this place gets busy.  I had a breakfast burrito that came with home fries. Hands down most bland, sub par breakfast burrito I've ever come across.  Eggs were over cooked and without seasoning.  I opted for the up charge of chorizo, there was maybe 1 oz of meat in it.  Not great by any measure. There was tater tots in there also but super soggy, as though they were fried in lukewarm oil.  The homefries that came with it were of the same family.  I only have deep regret I wasted a meal here."] [1.0]
["Honestly, I was going to give Tana 4 stars, but considering it's the only surviving Ethiopian restaurant in Pittsburgh that I can find I figured screw it! They deserve all 5. My boyfriend and I went here on a weeknight, and there was one other couple there. Throughout our meal 2 or three other couples came and went. So it's definitely quiet and empty which can be a little awkward for some, but I loved it & the waiter didn't make it weird. We got sambusa to start,

In [42]:
TRAIN_CSV = "train_data.csv"
TEST_CSV = "test_data.csv"

TRAIN_DATA = {"text":train_data,"label":train_labels}
TEST_DATA = {"text":test_data,"label":test_labels}

df_train = DataFrame(TRAIN_DATA,columns = ["text","label"])
df_train.to_csv(TRAIN_CSV)

df_test = DataFrame(TEST_DATA,columns = ["text", "label"])
df_test.to_csv(TEST_CSV)

train_dataset = data.TabularDataset(path = TRAIN_CSV,format = "csv",fields = [("id",None),("text",TEXT),("label",LABEL)],skip_header = True)
test_dataset = data.TabularDataset(path = TEST_CSV,format = "csv",fields = [("id",None),("text",TEXT),("label",LABEL)],skip_header = True)

print("Train\t","Len:\t",len(train_dataset),vars(train_dataset.examples[0]))
print("Test\t" ,"Len:\t",len(test_dataset) ,vars(test_dataset.examples[0]))


Train	 Len:	 21952 {'text': ['It', "'s", 'almost', 'amazing', 'that', 'this', 'place', 'gets', 'busy', '.', ' ', 'I', 'had', 'a', 'breakfast', 'burrito', 'that', 'came', 'with', 'home', 'fries', '.', 'Hands', 'down', 'most', 'bland', ',', 'sub', 'par', 'breakfast', 'burrito', 'I', "'ve", 'ever', 'come', 'across', '.', ' ', 'Eggs', 'were', 'over', 'cooked', 'and', 'without', 'seasoning', '.', ' ', 'I', 'opted', 'for', 'the', 'up', 'charge', 'of', 'chorizo', ',', 'there', 'was', 'maybe', '1', 'oz', 'of', 'meat', 'in', 'it', '.', ' ', 'Not', 'great', 'by', 'any', 'measure', '.', 'There', 'was', 'tater', 'tots', 'in', 'there', 'also', 'but', 'super', 'soggy', ',', 'as', 'though', 'they', 'were', 'fried', 'in', 'lukewarm', 'oil', '.', ' ', 'The', 'homefries', 'that', 'came', 'with', 'it', 'were', 'of', 'the', 'same', 'family', '.', ' ', 'I', 'only', 'have', 'deep', 'regret', 'I', 'wasted', 'a', 'meal', 'here', '.'], 'label': '1.0'}
Test	 Len:	 9408 {'text': ['Honestly', ',', 'I', 'was', 'go

In [43]:
MAX_VOCAB_SIZE = 20_000

TEXT.build_vocab(train_dataset,max_size=MAX_VOCAB_SIZE,vectors="glove.6B.200d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_dataset)

print("TEXT:",len(TEXT.vocab),"LABEL:",len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

TEXT: 12827 LABEL: 5
[('.', 186417), ('the', 137834), ('and', 111112), (',', 102669), ('I', 86275), ('a', 85040), ('was', 68677), ('to', 63675), ('of', 46897), ('it', 39001), ('is', 36811), ('for', 36459), ('The', 34550), ('!', 33821), ('with', 30665), ('in', 30167), ('but', 26525), ('that', 24240), (' ', 24127), ('were', 23631)]
['<unk>', '<pad>', '.', 'the', 'and', ',', 'I', 'a', 'was', 'to']
defaultdict(None, {'5.0': 0, '4.0': 1, '3.0': 2, '2.0': 3, '1.0': 4})


In [44]:
BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_dataset, test_dataset),
        sort_within_batch=True, 
        sort_key=lambda x: len(x.text),
        batch_size=BATCH_SIZE,
        device=device)

In [45]:
import torch.nn as nn

DROP = 0.1
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):       
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)  
        self.rnn = nn.LSTM(embedding_dim,hidden_dim,2,dropout = DROP,bidirectional = True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = torch.nn.Dropout(DROP)
        
    def forward(self, text):
      
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
#         res = self.dropout(torch.sum(hidden[0],0))
#         print(output.size())
        res = self.dropout(torch.mean(output,0))
        return self.fc(res)

In [46]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 512
HIDDEN_DIM = 512
OUTPUT_DIM = 5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [47]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 17,074,693 trainable parameters


In [48]:
import torch.optim as optim
optimizer = torch.optim.Adam(model.parameters())
# optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [49]:
criterion = torch.nn.CrossEntropyLoss()

In [50]:
model = model.to(device)
criterion = criterion.to(device)

In [66]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    values, indices = torch.max(preds, 1)
    res = indices.cpu().detach().numpy()
    correct = (y.cpu().detach().numpy()==res)



    #round predictions to the closest integer
    # rounded_preds = torch.round(torch.sigmoid(preds))
    # correct = (rounded_preds == y).float() #convert into float for division 
    acc = sum(correct) / len(correct)
    return acc


In [67]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator: 
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        # _, preds = torch.max(predictions,1)
        # print(preds.size())
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [68]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            # _, preds = torch.max(predictions,1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [69]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [70]:
N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


AttributeError: 'numpy.ndarray' object has no attribute 'float'