In [1]:
import torch
from torchtext import data
import torch.nn as nn
import pandas as pd

In [None]:
# pip install torch==1.6 torchtext==0.7

In [2]:
# Create reproducible results

SEED = 42

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [6]:
import pandas as pd

dataset = pd.read_pickle('dataset_split.pkl')

In [7]:
dataset['X_train']

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
1,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1.0,1,1,0,I am actually horrified this place is still in...,2013-12-07 03:16:52
2,LG2ZaYiOgpr2DK_90pYjNw,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5.0,1,0,0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11
4,6TdNDKywdbjoTkizeMce8A,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,4.0,0,0,0,"Oh happy day, finally have a Canes near my cas...",2017-01-14 21:56:57
5,L2O_INwlrRuoX05KSjc4eg,5vD2kmE25YBrbayKhykNxQ,nlxHRv1zXGT0c0K51q3jDg,5.0,2,0,0,This is definitely my favorite fast food sub s...,2013-05-07 07:25:25
6,ZayJ1zWyWgY9S_TRLT_y9Q,aq_ZxGHiri48TUXJlpRkCQ,Pthe4qk5xh4n-ef-9bvMSg,5.0,1,0,0,"Really good place with simple decor, amazing f...",2015-11-05 23:11:05
...,...,...,...,...,...,...,...,...,...
8021116,43ugcDASS-mGv0eYozge_g,kyxGYZpa4UNmA7Q0gmQmYA,H85um1dDQHAeUJ6AqYIqww,1.0,0,0,0,I'm still reminded of my move every time I see...,2019-12-10 12:07:55
8021117,LAzw2u1ucY722ryLEXHdgg,6DMFD3BRp-MVzDQelRx5UQ,XW2kaXdahICaJ27A0dhGHg,1.0,1,0,1,"Fricken unbelievable, I ordered 2 space heater...",2019-12-11 01:07:06
8021118,gMDU14Fa_DVIcPvsKtubJA,_g6P8H3-qfbz1FxbffS68g,IsoLzudHC50oJLiEWpwV-w,3.0,1,3,1,Solid American food with a southern comfort fl...,2019-12-10 04:15:00
8021119,EcY_p50zPIQ2R6rf6-5CjA,Scmyz7MK4TbXXYcaLZxIxQ,kDCyqlYcstqnoqnfBRS5Og,5.0,15,6,13,I'm honestly not sure how I have never been to...,2019-06-06 15:01:53


In [8]:
train_df = dataset['X_train'].loc[:, ['text', 'stars']][0:100000]
train_df['stars'] = (train_df['stars'] >= 3.0).astype(int)
train_df.columns = ['text', 'target']

val_df = dataset['X_val'].loc[:, ['text', 'stars']][0:50000]
val_df['stars'] = (val_df['stars'] >= 3.0).astype(int)
val_df.columns = ['text', 'target']

In [9]:
len(val_df)

50000

In [10]:
train_df.head()

Unnamed: 0,text,target
1,I am actually horrified this place is still in...,0
2,I love Deagan's. I do. I really do. The atmosp...,1
4,"Oh happy day, finally have a Canes near my cas...",1
5,This is definitely my favorite fast food sub s...,1
6,"Really good place with simple decor, amazing f...",1


In [11]:
# source : https://gist.github.com/lextoumbourou/8f90313cbc3598ffbabeeaa1741a11c8
# to use DataFrame as a Data source

class DataFrameDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [12]:
fields = [('text', TEXT), ('label', LABEL)]

train_ds, val_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=val_df)



In [242]:
# test_ds = DataFrameDataset.splits(fields, train_df=None, test_df=val_df[0:5])[0]

In [13]:
# test_ds

In [14]:
# Random example
print(vars(train_ds[15]))

{'text': ['This', 'was', 'our', 'choice', ',', 'by', 'default', ',', 'for', 'New', 'Year', "'s", 'Eve', 'dinner', 'this', 'year', '.', 'See', ',', 'we', 'chose', 'not', 'to', 'make', 'reservations', ',', 'but', 'instead', 'just', 'wing', 'it', 'and', 'see', 'where', 'we', 'could', 'get', 'in', '.', 'They', 'had', 'some', 'early', 'dining', 'availability', ',', 'so', 'we', 'jumped', 'on', 'it', '.', 'Their', 'special', 'New', 'Year', "'s", '"', 'Paint', 'the', 'Town', 'Gold', '"', 'menu', 'did', 'not', 'disappoint', '.', '\n\n', 'Located', 'in', 'the', 'beautiful', 'Mandalay', 'Bay', 'restaurant', 'row', ',', 'Kumi', 'is', 'a', 'gorgeous', 'display', 'of', 'style', 'and', 'food', 'artistry', '.', 'Every', 'course', 'was', 'surprisingly', 'delicious', ',', 'and', 'plated', 'interestingly', '.', '\n\n', 'Service', 'was', 'very', 'good', ',', 'seriously', 'on', 'point', '.', 'Right', 'up', 'until', 'the', 'dessert', 'course', '...', 'after', '30', 'minutes', 'of', 'waiting', 'for', 'it', '

In [15]:
# Random example
# print(vars(test_ds[0]))

NameError: name 'test_ds' is not defined

In [16]:
# Build vocabulary
# MAX_VOCAB_SIZE = 25000
MAX_VOCAB_SIZE = 10000

TEXT.build_vocab(train_ds, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = 'glove.6B.200d',
                 unk_init = torch.Tensor.zero_)

In [17]:
LABEL.build_vocab(train_ds)

In [19]:
# Build iterators
BATCH_SIZE = 128

device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [112]:
# device = torch.device('cpu')

In [21]:
# Declare hyperparameters
num_epochs = 25
learning_rate = 0.001

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 128
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # padding

In [22]:
INPUT_DIM

10002

In [49]:
    # LSTM Class

    class LSTM_net(nn.Module):
        def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                     bidirectional, dropout, pad_idx):

            super().__init__()

            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

            self.rnn = nn.LSTM(embedding_dim, 
                               hidden_dim, 
                               num_layers=n_layers, 
                               bidirectional=bidirectional, 
                               dropout=dropout,
                              batch_first=True)

            self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)

            self.fc2 = nn.Linear(hidden_dim, 1)

            self.dropout = nn.Dropout(dropout)

        def forward(self, text, text_lengths):

            # text = [sent len, batch size]

            embedded = self.embedding(text)

            # embedded = [sent len, batch size, emb dim]

            #pack sequence
            packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length)

            packed_output, (hidden, cell) = self.rnn(packed_embedded)

            #unpack sequence
            # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

            # output = [sent len, batch size, hid dim * num directions]
            # output over padding tokens are zero tensors

            # hidden = [num layers * num directions, batch size, hid dim]
            # cell = [num layers * num directions, batch size, hid dim]

            # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            # and apply dropout

            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
            output = self.fc1(hidden)
            output = self.dropout(self.fc2(output))

            #hidden = [batch size, hid dim * num directions]

            return output

In [50]:
#creating instance of our LSTM_net class

model = LSTM_net(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [51]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

torch.Size([10002, 200])


In [52]:
model.to(device) #CNN to GPU

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [53]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [54]:
# Training func
def train(model, iterator):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        text, text_lengths = batch.text
        
        optimizer.zero_grad()
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator):
    
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            acc = binary_accuracy(predictions, batch.label)
            
            epoch_acc += acc.item()
        
    return epoch_acc / len(iterator)


In [55]:
import time

t = time.time()
loss=[]
acc=[]
val_acc=[]

num_epochs = 1
for epoch in range(num_epochs):
    
    train_loss, train_acc = train(model, train_iterator)
    valid_acc = evaluate(model, valid_iterator)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Acc: {valid_acc*100:.2f}%')
    
    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)
    
print(f'time:{time.time()-t:.3f}')

RuntimeError: Expected `len(lengths)` to be equal to batch_size, but got 128 (batch_size=80)

In [272]:
with torch.no_grad():
    for batch in test_iterator[0]:
        text, text_lengths = batch.text
        predictions = model(text, text_lengths).squeeze(1)

KeyError: None

In [46]:
#inference 
import spacy
nlp = spacy.load('en')

def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction 
    return prediction.item()            

In [47]:
model.eval()

LSTM_net(
  (embedding): Embedding(10002, 200, padding_idx=1)
  (rnn): LSTM(200, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [56]:
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [61]:
predict(model, "terrible horrible restaurant")

0.0927320122718811