In [1]:
import torch
import torchtext
from torchtext.datasets import text_classification
# N-gram order; forwarded as `ngrams=` to the dataset loader in the next cell.
NGRAMS = 1

In [2]:

# Build the train/test splits of YelpReviewPolarity from the torchtext
# dataset registry. NGRAMS is forwarded as the n-gram setting; vocab=None
# presumably lets the loader build the vocabulary itself -- TODO confirm.
train_dataset, test_dataset = text_classification.DATASETS['YelpReviewPolarity'](
    root='../data',
    ngrams=NGRAMS,
    vocab=None,
)

# Prefer CUDA when a GPU is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


560000lines [00:54, 10285.42lines/s]
560000lines [01:45, 5321.36lines/s]
38000lines [00:07, 4852.63lines/s]


In [3]:
def generate_batch(batch):
    """Collate dataset entries into one flat token tensor plus start offsets.

    Each entry is read as ``entry[0]`` (its label) and ``entry[1]`` (its
    tokens, which must support ``len`` and ``torch.cat`` -- presumably a 1-D
    tensor of token ids).

    Returns:
        tuple: ``(text, offsets, label)`` where ``text`` is every entry's
        tokens concatenated in batch order, ``offsets[i]`` is the position in
        ``text`` at which entry ``i`` begins, and ``label`` is a tensor of
        the entry labels.
    """
    label = torch.tensor([sample[0] for sample in batch])
    token_tensors = [sample[1] for sample in batch]

    # Start positions are the running total of the preceding lengths, e.g.
    # lengths [3, 2, 4] -> starts [0, 3, 5]. The final length is dropped
    # because no entry starts after the last one.
    starts = [0]
    starts.extend(len(tokens) for tokens in token_tensors)
    offsets = torch.tensor(starts[:-1]).cumsum(dim=0)

    text = torch.cat(token_tensors)
    return text, offsets, label

In [4]:
# Number of dataset entries collated into each mini-batch.
BATCH_SIZE = 32

from torch.utils.data import DataLoader

# Training batches are drawn in a shuffled order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=generate_batch,
)

# Evaluation data is read in a fixed order: shuffling adds nothing when the
# loader is only used for validation/testing and makes per-batch results
# non-reproducible between runs.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=generate_batch,
)

In [5]:
import os
# Move the working directory up one level, presumably so that the `src`
# package imported in a later cell resolves.
# NOTE(review): the path is relative, so re-running this cell climbs one more
# level each time -- run it exactly once per kernel session.
os.chdir('..')
# Last expression of the cell: displays the new working directory.
os.getcwd()

'/home/paulo/Yelp_Dataset'

In [6]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# reloaded automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [7]:
from src.average_embedding import TextSentiment

In [8]:
# The embedding width is the only hand-picked value here; the vocabulary size
# and the class count are read off the training split.
EMBED_DIM = 32
VOCAB_SIZE = len(train_dataset.get_vocab())
NUN_CLASS = len(train_dataset.get_labels())

model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS)

In [9]:
# Sanity check: display the number of target classes.
NUN_CLASS

2

In [10]:
import mlflow.pytorch
from mlflow.tracking import MlflowClient
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
import torch
import pytorch_lightning as pl

In [11]:
# Callbacks keyed on the validation loss ("val_loss", lower is better).
# NOTE(review): these three objects are constructed but not passed to the
# Trainer below, so they currently have no effect on training.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", verbose=True, patience=3)
checkpoint_callback = ModelCheckpoint(
    filepath=os.getcwd(), save_top_k=1, verbose=True, monitor="val_loss", mode="min", prefix="",
)
lr_logger = LearningRateMonitor()

# Use one GPU when CUDA is available and fall back to the CPU otherwise.
# Hard-coding gpus=1 makes the Trainer fail on a machine without a GPU.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=1,
    progress_bar_refresh_rate=20,
)

# Auto log all MLflow entities
mlflow.pytorch.autolog(log_models=True)

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: None, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.accelerator_connector:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


In [12]:
# Display the module's repr to inspect its layers.
model

TextSentiment(
  (embedding): EmbeddingBag(464400, 32, mode=mean)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

In [14]:
# Train the model inside an MLflow run so that the fit is recorded under a
# single run by the autologging enabled earlier.
# NOTE(review): the test split is passed as the validation dataloader --
# confirm this is intended, since it leaves no untouched data for a final test.
with mlflow.start_run() as run:
    trainer.fit(model, train_dataloader, test_dataloader)
    # Explicit model/artifact logging is left disabled here;
    # mlflow.pytorch.autolog(log_models=True) is already switched on.

2021/02/01 22:26:28 INFO mlflow.utils.autologging_utils: pytorch autologging will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pytorch workflow to the MLflow run with ID '79eda37c5f36449bb96d6591b36f3a87'
INFO:pytorch_lightning.core.lightning:
  | Name      | Type         | Params
-------------------------------------------
0 | embedding | EmbeddingBag | 14.9 M
1 | fc        | Linear       | 66    
-------------------------------------------
14.9 M    Trainable params
0         Non-trainable params
14.9 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [20]:
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

def predict(text, model, vocab, ngrams):
    """Classify a raw string with ``model`` and return its label id.

    Args:
        text: Raw input string; tokenized with the "basic_english" tokenizer.
        model: Callable invoked as ``model(token_ids, offsets)``.
        vocab: Lookup from token (n-gram) string to integer id.
        ngrams: N-gram order forwarded to ``ngrams_iterator``.

    Returns:
        int: position of the highest-scoring entry along dimension 1 of the
        model output, plus one.
    """
    tokenize = get_tokenizer("basic_english")
    with torch.no_grad():
        token_ids = []
        for token in ngrams_iterator(tokenize(text), ngrams):
            token_ids.append(vocab[token])
        ids_tensor = torch.tensor(token_ids)
        # The whole text forms a single sequence, so it starts at offset 0.
        single_offset = torch.tensor([0])
        scores = model(ids_tensor, single_offset)
        best_index = scores.argmax(1).item()
    return best_index + 1

# Sample input for a smoke test of `predict`.
# NOTE(review): this is a sports news paragraph, not a Yelp review, so the
# predicted polarity is not meaningful beyond checking the pipeline runs.
# The backslash continuations sit inside the string literal, so each line's
# leading spaces are part of the text.
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

vocab = train_dataset.get_vocab()
# Run inference on the CPU; the token tensors built in `predict` are CPU tensors.
model = model.to("cpu")

# Reuse NGRAMS so inference tokenizes with the same n-gram order the datasets
# were built with, instead of a separately hard-coded literal.
predict(ex_text_str, model, vocab, NGRAMS)

0