In [1]:
import torch
from torchtext import data
import torch.nn as nn
import pandas as pd

In [None]:
# pip install torch==1.6 torchtext==0.7

In [2]:
# Create reproducible results

SEED = 42

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Create text and label fields
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

# Load dataset



In [3]:
import pandas as pd

dataset = pd.read_pickle('dataset_split.pkl')

In [4]:
dataset['X_train']

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
1,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1.0,1,1,0,I am actually horrified this place is still in...,2013-12-07 03:16:52
2,LG2ZaYiOgpr2DK_90pYjNw,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5.0,1,0,0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11
4,6TdNDKywdbjoTkizeMce8A,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,4.0,0,0,0,"Oh happy day, finally have a Canes near my cas...",2017-01-14 21:56:57
5,L2O_INwlrRuoX05KSjc4eg,5vD2kmE25YBrbayKhykNxQ,nlxHRv1zXGT0c0K51q3jDg,5.0,2,0,0,This is definitely my favorite fast food sub s...,2013-05-07 07:25:25
6,ZayJ1zWyWgY9S_TRLT_y9Q,aq_ZxGHiri48TUXJlpRkCQ,Pthe4qk5xh4n-ef-9bvMSg,5.0,1,0,0,"Really good place with simple decor, amazing f...",2015-11-05 23:11:05
...,...,...,...,...,...,...,...,...,...
8021116,43ugcDASS-mGv0eYozge_g,kyxGYZpa4UNmA7Q0gmQmYA,H85um1dDQHAeUJ6AqYIqww,1.0,0,0,0,I'm still reminded of my move every time I see...,2019-12-10 12:07:55
8021117,LAzw2u1ucY722ryLEXHdgg,6DMFD3BRp-MVzDQelRx5UQ,XW2kaXdahICaJ27A0dhGHg,1.0,1,0,1,"Fricken unbelievable, I ordered 2 space heater...",2019-12-11 01:07:06
8021118,gMDU14Fa_DVIcPvsKtubJA,_g6P8H3-qfbz1FxbffS68g,IsoLzudHC50oJLiEWpwV-w,3.0,1,3,1,Solid American food with a southern comfort fl...,2019-12-10 04:15:00
8021119,EcY_p50zPIQ2R6rf6-5CjA,Scmyz7MK4TbXXYcaLZxIxQ,kDCyqlYcstqnoqnfBRS5Og,5.0,15,6,13,I'm honestly not sure how I have never been to...,2019-06-06 15:01:53


In [5]:
train_df = dataset['X_train'].loc[:, ['text', 'stars']][0:100000]
train_df['stars'] = (train_df['stars'] >= 3.0).astype(int)
train_df.columns = ['text', 'target']

val_df = dataset['X_val'].loc[:, ['text', 'stars']][0:50000]
val_df['stars'] = (val_df['stars'] >= 3.0).astype(int)
val_df.columns = ['text', 'target']

In [6]:
len(val_df)

50000

In [7]:
train_df.head()

Unnamed: 0,text,target
1,I am actually horrified this place is still in...,0
2,I love Deagan's. I do. I really do. The atmosp...,1
4,"Oh happy day, finally have a Canes near my cas...",1
5,This is definitely my favorite fast food sub s...,1
6,"Really good place with simple decor, amazing f...",1


In [8]:
import os
os.chdir('..')
os.getcwd()

'/home/paulo/Yelp_Dataset'

In [9]:
from src.data.utils import DataFrameDataset

In [11]:
fields = [('text', TEXT), ('label', LABEL)]

train_ds, val_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=val_df)

In [None]:
# Random example
print(vars(train_ds[15]))

In [None]:
# Random example
# print(vars(test_ds[0]))

In [12]:
# Build vocabulary
# MAX_VOCAB_SIZE = 25000
MAX_VOCAB_SIZE = 10000

TEXT.build_vocab(train_ds, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = 'glove.6B.200d',
                 unk_init = torch.Tensor.zero_)

In [13]:
LABEL.build_vocab(train_ds)

In [37]:
# Build iterators
BATCH_SIZE = 128

device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)



In [38]:
from torch.utils.data import DataLoader

# train_it = DataLoader(train_ds, batch_size = BATCH_SIZE)

In [39]:
device

device(type='cuda', index=2)

In [40]:
# device = torch.device('cpu')

In [41]:
# Declare hyperparameters
num_epochs = 25
learning_rate = 0.001

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 128
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # padding

In [42]:
INPUT_DIM

10002

In [43]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
from models.lstm import LSTM_net

In [45]:
#creating instance of our LSTM_net class

model = LSTM_net(INPUT_DIM,
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM,
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT,
            PAD_IDX)

In [46]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

torch.Size([10002, 200])


In [47]:
# model.to(device) #CNN to GPU

# Loss and optimizer
# criterion = nn.BCEWithLogitsLoss()

# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [48]:
import mlflow.pytorch
from mlflow.tracking import MlflowClient

In [49]:
import pytorch_lightning as pl
trainer = pl.Trainer(gpus=1, max_epochs=20, progress_bar_refresh_rate=20)

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: False, using: 0 TPU cores
INFO:lightning:TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


In [51]:
# Auto log all MLflow entities
mlflow.pytorch.autolog()

# Train the model
with mlflow.start_run() as run:
    trainer.fit(model, train_iterator, valid_iterator)

2021/01/24 16:47:36 INFO mlflow.utils.autologging_utils: pytorch autologging will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pytorch workflow to the MLflow run with ID '79cfc47f3c0345c3a71a2baa3db7e82a'

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 2 M   
1 | rnn       | LSTM      | 733 K 
2 | fc1       | Linear    | 32 K  
3 | fc2       | Linear    | 129   
4 | dropout   | Dropout   | 0     
INFO:lightning:
  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 2 M   
1 | rnn       | LSTM      | 733 K 
2 | fc1       | Linear    | 32 K  
3 | fc2       | Linear    | 129   
4 | dropout   | Dropout   | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

ValueError: cannot convert float NaN to integer

In [None]:
def print_auto_logged_info(r):

    tags = {k: v for k, v in r.data.tags.items() if not k.startswith("mlflow.")}
    artifacts = [f.path for f in MlflowClient().list_artifacts(r.info.run_id, "model")]
    print("run_id: {}".format(r.info.run_id))
    print("artifacts: {}".format(artifacts))
    print("params: {}".format(r.data.params))
    print("metrics: {}".format(r.data.metrics))
    print("tags: {}".format(tags))
    
# fetch the auto logged parameters and metrics
print_auto_logged_info(mlflow.get_run(run_id=run.info.run_id))

## Train Model

In [None]:
import time

t = time.time()
loss=[]
acc=[]
val_acc=[]
best_valid_loss = float('inf')

num_epochs = 5
for epoch in range(num_epochs):
    
    train_loss, train_acc = train(model, train_iterator)
    valid_loss, valid_acc = evaluate(model, valid_iterator)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Acc: {valid_acc*100:.2f}%')
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)
    
print(f'time:{time.time()-t:.3f}')

In [None]:
import dill 

with open("TEXT.Field", "wb") as f:
     dill.dump(TEXT, f)

In [53]:
#inference 
import spacy
nlp = spacy.load('en')

def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction 
    return prediction.item()            

In [54]:
model.eval()

LSTM_net(
  (embedding): Embedding(10002, 200, padding_idx=1)
  (rnn): LSTM(200, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [55]:
device = torch.device('cpu')
model = model.to(device)

In [56]:
model

LSTM_net(
  (embedding): Embedding(10002, 200, padding_idx=1)
  (rnn): LSTM(200, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [57]:
predict(model, "terrible horrible restaurant")

RuntimeError: Expected `len(lengths)` to be equal to batch_size, but got 1 (batch_size=3)