In [1]:
import pandas as pd
import urllib.request
%matplotlib inline
import matplotlib.pyplot as plt
import re
import random
import time
import datetime
import numpy as np
import os
import tqdm
%config InlineBackend.figure_format = 'retina'
import shutil
import logging
logging.basicConfig(level=logging.ERROR)

    # # tf
    # from tensorflow.keras.datasets import reuters
    # from tensorflow.keras.models import Sequential
    # from tensorflow.keras.layers import Dense, LSTM, Embedding
    # from tensorflow.keras.preprocessing.sequence import pad_sequences
    # from tensorflow.keras.utils import to_categorical
    # from tensorflow.keras.models import load_model
    # from tensorflow.keras.preprocessing.text import Tokenizer

#torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

# transformers(BERT)
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelWithLMHead, XLMRobertaForSequenceClassification

In [2]:
from adamp import AdamP

In [3]:
# The tokenizer and the LM-head model are both loaded from one checkpoint name.
checkpoint_name = "xlm-roberta-large"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# NOTE(review): `bert` is not referenced by any other cell visible in this
# notebook (the classifier is loaded separately further down) -- confirm
# whether this load is still needed.
bert = AutoModelWithLMHead.from_pretrained(checkpoint_name)

In [4]:
# Pick the compute device once: CUDA when torch reports it, otherwise the CPU.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")

if not cuda_ok:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs torch can see and the name of device index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 2 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-32GB


In [5]:
# Training data: read only the two columns that are used, put them in
# (data, category) order and drop rows with a missing value in either.
train_columns = ['data', 'category']
train = pd.read_csv("train.csv", usecols=['category','data'])
train = train[train_columns].dropna()

# Test data is read as-is (all columns, no NaN filtering).
test = pd.read_csv('test.csv', encoding = 'utf-8')

In [6]:
# Raw texts and labels as NumPy arrays taken from the training frame.
train_sentences = train.data.values
train_labels = train.category.values

# Options shared by every encode_plus call below.
encode_options = dict(
    add_special_tokens = True,      # let the tokenizer add its special tokens
    max_length = 512,               # fixed length for every encoded row
    pad_to_max_length = True,       # pad shorter inputs up to max_length
    return_attention_mask = True,   # also return the attention mask
    return_tensors = 'pt',          # PyTorch tensors
)

train_input_ids = []
train_attention_masks = []

# Encode one text at a time and collect the per-text tensors.
for text in train_sentences:
    encoded = tokenizer.encode_plus(text, **encode_options)
    train_input_ids.append(encoded['input_ids'])
    train_attention_masks.append(encoded['attention_mask'])

# Concatenate the per-text tensors along dim 0 and convert the labels to a tensor.
train_input_ids = torch.cat(train_input_ids, dim=0)
train_attention_masks = torch.cat(train_attention_masks, dim=0)
train_labels = torch.tensor(train_labels)

In [7]:
# Raw texts of the test frame (no labels are read here).
test_sentences = test.data.values

# Same encoding settings as used for the training texts.
encode_options = dict(
    add_special_tokens = True,
    max_length = 512,
    pad_to_max_length = True,
    return_attention_mask = True,
    return_tensors = 'pt',
)

test_input_ids = []
test_attention_masks = []

# Encode one text at a time and collect the per-text tensors.
for text in test_sentences:
    encoded = tokenizer.encode_plus(text, **encode_options)
    test_input_ids.append(encoded['input_ids'])
    test_attention_masks.append(encoded['attention_mask'])

# Concatenate the per-text tensors along dim 0.
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

In [8]:
# Wrap the encoded tensors: (ids, mask, label) for training, (ids, mask) for test.
dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
prediction_data = TensorDataset(test_input_ids, test_attention_masks)

# 90% of the labelled rows go to training, the remainder to validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# random_split (called without a generator) draws from torch's global RNG,
# and the notebook only seeds that RNG in the later training cell. Seed it
# here too so the train/validation membership is the same on every run.
# The value matches `seed_val` used in the training cell.
torch.manual_seed(0)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(len(prediction_data)))

35,992 training samples
4,000 validation samples
5,000 test samples


In [9]:
# Batch sizes (the validation loader reuses the training batch size).
train_batch_size = 20
test_batch_size = 20

# Training batches are drawn in random order; validation and test keep
# the dataset order.
train_sampler = RandomSampler(train_dataset)
validation_sampler = SequentialSampler(val_dataset)
prediction_sampler = SequentialSampler(prediction_data)

train_dataloader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=train_batch_size)
validation_dataloader = DataLoader(val_dataset,
                                   sampler=validation_sampler,
                                   batch_size=train_batch_size)
prediction_dataloader = DataLoader(prediction_data,
                                   sampler=prediction_sampler,
                                   batch_size=test_batch_size)

In [10]:
# Load the sequence-classification model from the same checkpoint name the
# tokenizer cell uses, then wrap it for multi-GPU execution.
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-large', # Pretrained XLM-RoBERTa checkpoint (not a 12-layer uncased BERT).
    num_labels = 3, # Number of output labels for the classifier.
                    # presumably the number of distinct `category` values -- TODO confirm
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# NOTE(review): earlier cells already called torch.cuda.device_count() and
# get_device_name(), so CUDA has likely been initialised by the time this
# variable is set; in that case the assignment has no effect on this kernel.
# If device masking is required, set it before the first torch.cuda call -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1'
# Wrap the model in DataParallel with output_device=1. The call on the last
# line moves the parameters to CUDA and, as the final expression of the cell,
# displays the module structure.
model = nn.DataParallel(model, output_device=1)
model.cuda()

DataParallel(
  (module): XLMRobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in

In [11]:
# Show a few named parameters and their shapes, grouped by position in
# the flat parameter list: the first five entries, the next sixteen, and
# the last four.
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))

row_template = "{:<55} {:>12}"
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, entries in sections:
    print(heading)
    for param_name, param_tensor in entries:
        shape_text = str(tuple(param_tensor.size()))
        print(row_template.format(param_name, shape_text))

The BERT model has 395 different named parameters.

==== Embedding Layer ====

module.roberta.embeddings.word_embeddings.weight        (250002, 1024)
module.roberta.embeddings.position_embeddings.weight     (514, 1024)
module.roberta.embeddings.token_type_embeddings.weight     (1, 1024)
module.roberta.embeddings.LayerNorm.weight                   (1024,)
module.roberta.embeddings.LayerNorm.bias                     (1024,)

==== First Transformer ====

module.roberta.encoder.layer.0.attention.self.query.weight (1024, 1024)
module.roberta.encoder.layer.0.attention.self.query.bias      (1024,)
module.roberta.encoder.layer.0.attention.self.key.weight (1024, 1024)
module.roberta.encoder.layer.0.attention.self.key.bias       (1024,)
module.roberta.encoder.layer.0.attention.self.value.weight (1024, 1024)
module.roberta.encoder.layer.0.attention.self.value.bias      (1024,)
module.roberta.encoder.layer.0.attention.output.dense.weight (1024, 1024)
module.roberta.encoder.layer.0.attention.output

In [12]:
# Number of passes over the training data.
epochs = 6

# The scheduler is given the total number of optimisation steps, i.e.
# [number of batches per epoch] x [number of epochs]. This is a count of
# batches, not of training samples.
total_steps = len(train_dataloader) * epochs

# AdamP (imported from the `adamp` package) over every model parameter.
optimizer = AdamP(
    model.parameters(),
    lr = 5e-6,
    betas = (0.9, 0.999),
    weight_decay = 1e-2,
)

# Linear learning-rate schedule with zero warmup steps, spanning total_steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

In [13]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    preds:  2-D array-like of per-class scores; the predicted class of a
            row is the index of its largest score (argmax over axis 1).
    labels: NumPy array of true class ids; it is flattened before comparing.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = (predicted == expected)
    return hits.sum() / len(expected)

def format_time(elapsed):
    """Format a duration given in seconds as an h:mm:ss string.

    The value is rounded to whole seconds with the built-in round() and
    rendered through datetime.timedelta's string form.
    """
    return '{}'.format(datetime.timedelta(seconds=int(round(elapsed))))

In [14]:
# Open the TensorBoard UI inline, reading event files from the 'logs'
# directory (the same path the training cell passes to SummaryWriter).
%tensorboard --logdir 'logs'

Reusing TensorBoard on port 6006 (pid 5796), started 3:11:53 ago. (Use '!kill 5796' to kill it.)

In [15]:
# Seed Python, NumPy and PyTorch (CPU and all GPUs) before training starts.
seed_val = 0

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []        # one summary dict is appended per epoch
valid_loss_min = np.inf    # NOTE(review): never read inside this cell
total_t0 = time.time()
writer = SummaryWriter('logs')

# A CSV with test predictions is written after every epoch. Create the target
# folder up front so a missing directory cannot abort the run after an epoch
# of training has already been spent.
os.makedirs('result', exist_ok=True)

steps_per_epoch = len(train_dataloader)

for epoch_i in tqdm.notebook.tqdm(range(0, epochs)):
    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    model.train()

    # tqdm wraps the dataloader itself (not the enumerate object) so the
    # progress bar can read its length and show a real total.
    for step, batch in enumerate(tqdm.notebook.tqdm(train_dataloader)):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, steps_per_epoch, elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss, logits = outputs[0], outputs[1]

        # The wrapped model returns a loss tensor with (presumably) one entry
        # per DataParallel replica. The entries are summed, as before, so the
        # optimisation objective is unchanged; Tensor.sum() also works when
        # the loss is a 0-d tensor, unlike the builtin sum().
        batch_loss = loss.sum()
        batch_loss_value = batch_loss.item()
        total_train_loss += batch_loss_value

        # Log this batch's loss at its own global step. The previous code
        # wrote a running total at one fixed step per epoch, which left
        # TensorBoard with a single meaningless point per epoch.
        writer.add_scalar('training loss',
                          batch_loss_value,
                          epoch_i * steps_per_epoch + step)

        batch_loss.backward()
        # Clip the gradient norm to 1.0 before the optimiser step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        scheduler.step() # Update the learning rate.

    avg_train_loss = total_train_loss / steps_per_epoch
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in tqdm.notebook.tqdm(validation_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        loss, logits = outputs[0], outputs[1]

        total_eval_loss += loss.sum().item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Per-batch accuracy; averaged over batches below.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # One validation point per epoch, placed at the last training step of
    # that epoch so it lines up with the 'training loss' curve. Averages are
    # logged instead of the running totals written previously.
    epoch_end_step = (epoch_i + 1) * steps_per_epoch
    writer.add_scalar('validation loss', avg_val_loss, epoch_end_step)
    writer.add_scalar('validation accuracy', avg_val_accuracy, epoch_end_step)

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid_Loss': avg_val_loss,
            'Valid_Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    # ------------------------------------------------------------------
    # Test-set prediction (one submission file per epoch)
    # ------------------------------------------------------------------
    print('Predicting labels for {:,} test sentences...'.format(len(test_input_ids)))

    model.eval()
    predictions = []

    for batch in tqdm.notebook.tqdm(prediction_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask = batch

        # No labels are passed here, so the first output is the logits.
        # Gradients are not tracked, which saves memory.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    print('DONE.')

    # Stack the per-batch logits and take the arg-max class for every row.
    flat_predictions = np.concatenate(predictions, axis=0)
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

    submission = pd.DataFrame(flat_predictions)
    submission.columns = ['category']
    submission.index.name = 'index'
    submission.to_csv('result/XLM_RoBERTa_Batch20(multi)_Maxlen512_5e-6(adamp)_{}.csv'.format(epoch_i+1), encoding='utf-8')

# Push any buffered events to disk so TensorBoard shows the complete run.
writer.flush()

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


  Batch    50  of  1,800.    Elapsed: 0:01:14.
  Batch   100  of  1,800.    Elapsed: 0:02:22.
  Batch   150  of  1,800.    Elapsed: 0:03:31.
  Batch   200  of  1,800.    Elapsed: 0:04:40.
  Batch   250  of  1,800.    Elapsed: 0:05:49.
  Batch   300  of  1,800.    Elapsed: 0:06:58.
  Batch   350  of  1,800.    Elapsed: 0:08:07.
  Batch   400  of  1,800.    Elapsed: 0:09:16.
  Batch   450  of  1,800.    Elapsed: 0:10:24.
  Batch   500  of  1,800.    Elapsed: 0:11:33.
  Batch   550  of  1,800.    Elapsed: 0:12:42.
  Batch   600  of  1,800.    Elapsed: 0:13:51.
  Batch   650  of  1,800.    Elapsed: 0:15:00.
  Batch   700  of  1,800.    Elapsed: 0:16:09.
  Batch   750  of  1,800.    Elapsed: 0:17:18.
  Batch   800  of  1,800.    Elapsed: 0:18:27.
  Batch   850  of  1,800.    Elapsed: 0:19:36.
  Batch   900  of  1,800.    Elapsed: 0:20:45.
  Batch   950  of  1,800.    Elapsed: 0:21:54.
  Batch 1,000  of  1,800.    Elapsed: 0:23:03.
  Batch 1,050  of  1,800.    Elapsed: 0:24:12.
  Batch 1,100

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


  Accuracy: 0.89
  Validation Loss: 0.58
  Validation took: 0:01:21
Predicting labels for 5,000 test sentences...


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


DONE.

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  Batch    50  of  1,800.    Elapsed: 0:01:09.
  Batch   100  of  1,800.    Elapsed: 0:02:18.
  Batch   150  of  1,800.    Elapsed: 0:03:27.
  Batch   200  of  1,800.    Elapsed: 0:04:36.
  Batch   250  of  1,800.    Elapsed: 0:05:44.
  Batch   300  of  1,800.    Elapsed: 0:06:53.
  Batch   350  of  1,800.    Elapsed: 0:08:02.
  Batch   400  of  1,800.    Elapsed: 0:09:11.
  Batch   450  of  1,800.    Elapsed: 0:10:20.
  Batch   500  of  1,800.    Elapsed: 0:11:29.
  Batch   550  of  1,800.    Elapsed: 0:12:37.
  Batch   600  of  1,800.    Elapsed: 0:13:46.
  Batch   650  of  1,800.    Elapsed: 0:14:55.
  Batch   700  of  1,800.    Elapsed: 0:16:04.
  Batch   750  of  1,800.    Elapsed: 0:17:13.
  Batch   800  of  1,800.    Elapsed: 0:18:21.
  Batch   850  of  1,800.    Elapsed: 0:19:30.
  Batch   900  of  1,800.    Elapsed: 0:20:39.
  Batch   950  of  1,800.    Elapsed: 0:21:48.
  Batch 1,000  of  1,800.    Elapsed: 0:22:57.
  Batch 1,050  of  1,800.    Elapsed: 0:24:07.
  Batch 1,100

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


  Accuracy: 0.89
  Validation Loss: 0.58
  Validation took: 0:01:21
Predicting labels for 5,000 test sentences...


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


DONE.

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  Batch    50  of  1,800.    Elapsed: 0:01:09.
  Batch   100  of  1,800.    Elapsed: 0:02:18.
  Batch   150  of  1,800.    Elapsed: 0:03:26.
  Batch   200  of  1,800.    Elapsed: 0:04:35.
  Batch   250  of  1,800.    Elapsed: 0:05:44.
  Batch   300  of  1,800.    Elapsed: 0:06:53.
  Batch   350  of  1,800.    Elapsed: 0:08:02.
  Batch   400  of  1,800.    Elapsed: 0:09:11.
  Batch   450  of  1,800.    Elapsed: 0:10:19.
  Batch   500  of  1,800.    Elapsed: 0:11:28.
  Batch   550  of  1,800.    Elapsed: 0:12:37.
  Batch   600  of  1,800.    Elapsed: 0:13:46.
  Batch   650  of  1,800.    Elapsed: 0:14:54.
  Batch   700  of  1,800.    Elapsed: 0:16:03.
  Batch   750  of  1,800.    Elapsed: 0:17:12.
  Batch   800  of  1,800.    Elapsed: 0:18:21.
  Batch   850  of  1,800.    Elapsed: 0:19:31.
  Batch   900  of  1,800.    Elapsed: 0:20:39.
  Batch   950  of  1,800.    Elapsed: 0:21:48.
  Batch 1,000  of  1,800.    Elapsed: 0:22:57.
  Batch 1,050  of  1,800.    Elapsed: 0:24:06.
  Batch 1,100

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


  Accuracy: 0.89
  Validation Loss: 0.65
  Validation took: 0:01:21
Predicting labels for 5,000 test sentences...


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

  Batch   350  of  1,800.    Elapsed: 0:08:02.
  Batch   400  of  1,800.    Elapsed: 0:09:11.
  Batch   450  of  1,800.    Elapsed: 0:10:20.
  Batch   500  of  1,800.    Elapsed: 0:11:29.
  Batch   550  of  1,800.    Elapsed: 0:12:38.
  Batch   600  of  1,800.    Elapsed: 0:13:46.
  Batch   650  of  1,800.    Elapsed: 0:14:55.
  Batch   700  of  1,800.    Elapsed: 0:16:04.
  Batch   750  of  1,800.    Elapsed: 0:17:12.
  Batch   800  of  1,800.    Elapsed: 0:18:21.
  Batch   850  of  1,800.    Elapsed: 0:19:30.
  Batch   900  of  1,800.    Elapsed: 0:20:39.
  Batch   950  of  1,800.    Elapsed: 0:21:48.
  Batch 1,000  of  1,800.    Elapsed: 0:22:57.
  Batch 1,050  of  1,800.    Elapsed: 0:24:05.
  Batch 1,100  of  1,800.    Elapsed: 0:25:14.
  Batch 1,150  of  1,800.    Elapsed: 0:26:23.
  Batch 1,200  of  1,800.    Elapsed: 0:27:32.
  Batch 1,250  of  1,800.    Elapsed: 0:28:41.
  Batch 1,300  of  1,800.    Elapsed: 0:29:50.
  Batch 1,350  of  1,800.    Elapsed: 0:30:58.
  Batch 1,400

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


  Accuracy: 0.90
  Validation Loss: 0.65
  Validation took: 0:01:21
Predicting labels for 5,000 test sentences...


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


DONE.

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  Batch    50  of  1,800.    Elapsed: 0:01:09.
  Batch   100  of  1,800.    Elapsed: 0:02:18.
  Batch   150  of  1,800.    Elapsed: 0:03:27.
  Batch   200  of  1,800.    Elapsed: 0:04:36.
  Batch   250  of  1,800.    Elapsed: 0:05:45.
  Batch   300  of  1,800.    Elapsed: 0:06:53.
  Batch   350  of  1,800.    Elapsed: 0:08:02.
  Batch   400  of  1,800.    Elapsed: 0:09:11.
  Batch   450  of  1,800.    Elapsed: 0:10:20.
  Batch   500  of  1,800.    Elapsed: 0:11:29.
  Batch   550  of  1,800.    Elapsed: 0:12:38.
  Batch   600  of  1,800.    Elapsed: 0:13:47.
  Batch   650  of  1,800.    Elapsed: 0:14:55.
  Batch   700  of  1,800.    Elapsed: 0:16:04.
  Batch   750  of  1,800.    Elapsed: 0:17:13.
  Batch   800  of  1,800.    Elapsed: 0:18:22.
  Batch   850  of  1,800.    Elapsed: 0:19:30.
  Batch   900  of  1,800.    Elapsed: 0:20:39.
  Batch   950  of  1,800.    Elapsed: 0:21:48.
  Batch 1,000  of  1,800.    Elapsed: 0:22:57.
  Batch 1,050  of  1,800.    Elapsed: 0:24:06.
  Batch 1,100

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


  Accuracy: 0.89
  Validation Loss: 0.76
  Validation took: 0:01:21
Predicting labels for 5,000 test sentences...


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


DONE.

Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  Batch    50  of  1,800.    Elapsed: 0:01:09.
  Batch   100  of  1,800.    Elapsed: 0:02:18.
  Batch   150  of  1,800.    Elapsed: 0:03:27.
  Batch   200  of  1,800.    Elapsed: 0:04:36.
  Batch   250  of  1,800.    Elapsed: 0:05:45.
  Batch   300  of  1,800.    Elapsed: 0:06:54.
  Batch   350  of  1,800.    Elapsed: 0:08:03.
  Batch   400  of  1,800.    Elapsed: 0:09:12.
  Batch   450  of  1,800.    Elapsed: 0:10:21.
  Batch   500  of  1,800.    Elapsed: 0:11:29.
  Batch   550  of  1,800.    Elapsed: 0:12:38.
  Batch   600  of  1,800.    Elapsed: 0:13:47.
  Batch   650  of  1,800.    Elapsed: 0:14:56.
  Batch   700  of  1,800.    Elapsed: 0:16:04.
  Batch   750  of  1,800.    Elapsed: 0:17:14.
  Batch   800  of  1,800.    Elapsed: 0:18:22.
  Batch   850  of  1,800.    Elapsed: 0:19:31.
  Batch   900  of  1,800.    Elapsed: 0:20:41.
  Batch   950  of  1,800.    Elapsed: 0:21:50.
  Batch 1,000  of  1,800.    Elapsed: 0:22:58.
  Batch 1,050  of  1,800.    Elapsed: 0:24:07.
  Batch 1,100

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


  Accuracy: 0.89
  Validation Loss: 0.80
  Validation took: 0:01:21
Predicting labels for 5,000 test sentences...


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


DONE.

