## Inference Pipeline

For inference, we instantiate the same neural modules as in training, but this time we restore their weights from the checkpoints we just saved.

In [None]:
from nemo.core import NeuralModuleFactory
from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset
from nemo.collections.nlp.nm.data_layers.text_classification_datalayer import BertTextClassificationDataLayer
from nemo.collections.nlp.nm.trainables.common.huggingface.bert_nm import BERT
from nemo.collections.nlp.nm.trainables.common.sequence_classification_nm import SequenceClassifier
from pytorch_transformers import BertTokenizer
import torch.nn.functional as f
import torch

import json
import math
import numpy as np
import pandas as pd
# Let pandas show full cell contents (long sentences) instead of truncating them.
# NOTE(review): -1 is the legacy "no limit" value; newer pandas releases expect
# None here instead — confirm against the installed pandas version.
pd.options.display.max_colwidth = -1

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
# Name of the pretrained BERT variant; every path below is derived from it.
pretrained_bert_model = 'bert-base-uncased'
#pretrained_bert_model = 'bert-large-uncased'

# Everything belonging to one model lives under logs/<model name>/.
log_dir = f'logs/{pretrained_bert_model}'
checkpoint_dir = f'{log_dir}/checkpoints'
bert_model_config_path = f'{log_dir}/{pretrained_bert_model}_config.json'
inference_log_dir = f'{log_dir}/inference'

# Directory holding the SST-2 split files (test.tsv is read from here).
data_dir = 'data/SST-2/split'

In [None]:
# List the checkpoint directory so we can see which checkpoint files are available to restore.
!ls -lh $checkpoint_dir

In [None]:
# Factory used to run inference; its log directory is the inference folder.
nf = NeuralModuleFactory(
    log_dir=inference_log_dir,
    optimization_level='O1',
)

# Tokenizer matching the selected pretrained model name.
tokenizer = BertTokenizer.from_pretrained(pretrained_bert_model)

# Encoder built from the model config file stored under the log directory.
bert = BERT(config_filename=bert_model_config_path)

# The classifier input width is read from the encoder's own parameters.
bert_hidden_size = bert.local_parameters['hidden_size']

# Two-class head that emits raw logits (log_softmax=False) with dropout disabled.
mlp = SequenceClassifier(
    hidden_size=bert_hidden_size,
    num_classes=2,
    num_layers=2,
    log_softmax=False,
    dropout=0.0,
)

In [None]:
# Restore both modules from their "EPOCH-3" checkpoint files in the checkpoint directory.
bert.restore_from(f'{checkpoint_dir}/BERT-EPOCH-3.pt')
mlp.restore_from(f'{checkpoint_dir}/SequenceClassifier-EPOCH-3.pt')

In [None]:
max_seq_length = 64

# Pick the inference batch size for the selected model. Any model other than
# bert-base-uncased gets the smaller size, so batch_size is always defined
# for the cells further down that reuse it.
if pretrained_bert_model == 'bert-base-uncased':
    batch_size = 256
else:
    batch_size = 64

# Data layer over the test split. shuffle=False keeps the rows in file order so
# the predictions can later be attached back onto the rows of test.tsv.
test_data = BertTextClassificationDataLayer(
    input_file=data_dir + '/test.tsv',
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    shuffle=False,
    num_samples=-1, # lower for dev, -1 for all dataset
    batch_size=batch_size  # use the per-model value chosen above, not a hard-coded 64
)

In [None]:
# Calling the data layer returns four outputs; the fourth is discarded here
# (the same position is named spectrum_labels later in this notebook, so it is
# presumably the labels — TODO confirm).
test_input, test_token_types, test_attn_mask, _ = test_data()

In [None]:
# Feed the three data-layer outputs into the BERT encoder.
test_embeddings = bert(
    input_ids=test_input,
    token_type_ids=test_token_types,
    attention_mask=test_attn_mask,
)

In [None]:
# Pass the encoder output through the classifier head to obtain the class logits.
test_logits = mlp(hidden_states=test_embeddings)

In [None]:
%%time
# Run inference for the requested tensor (timed by the cell magic above).
# The result is indexed by requested tensor; entry 0 is concatenated with
# torch.cat in the next cell, so it is treated as a sequence of tensors.
test_logits_tensors = nf.infer(tensors=[test_logits])

In [None]:
# Concatenate the logits returned by inference, normalise them over the class
# axis and keep the probability of class index 1 for every row.
# dim=1 is passed explicitly: calling softmax without `dim` is deprecated and
# relies on an implicit choice of axis.
test_probs = f.softmax(torch.cat(test_logits_tensors[0]), dim=1).numpy()[:, 1]

In [None]:
# Load the same tab-separated test file that was fed to the data layer.
test_df = pd.read_csv(data_dir + '/test.tsv', sep='\t')

In [None]:
# Attach the predicted probability to each row.
# Assumes test_probs has one entry per row of test_df, in file order (the data
# layer was created with shuffle=False and num_samples=-1).
test_df['prob'] = test_probs 

In [None]:
# Display the test rows together with their new 'prob' column.
test_df

In [None]:
# Save the predictions as a tab-separated file without the DataFrame index.
# NOTE(review): assumes inference_log_dir already exists on disk — confirm.
test_df.to_csv(inference_log_dir + '/test_inference.tsv', sep='\t', index=False)

In [None]:
def sample_classification(data_path):
    """Return one randomly chosen prediction formatted as ``<sentence> | <prob>``.

    Args:
        data_path: path to a tab-separated file that has ``sentence`` and
            ``prob`` columns.

    Returns:
        A string with the sampled row's sentence and probability, separated
        by `` | ``.
    """
    # DataFrame.sample() with no arguments draws exactly one random row.
    picked = pd.read_csv(data_path, sep='\t').sample()
    text = picked.sentence.values[0]
    score = picked.prob.values[0]
    return f'{text} | {score}'

In [None]:
# Show one random sentence from the saved predictions together with its probability.
sample_classification(inference_log_dir + '/test_inference.tsv')

In [None]:
# Print several random predictions; every call re-reads the file and draws
# independently, so the same row may appear more than once.
num_samples = 10
for _ in range(num_samples):
    print(sample_classification(inference_log_dir + '/test_inference.tsv'))

## BERT nails it:
the film is just a big , gorgeous , mind-blowing , breath-taking mess . | 0.2738656

a sensual performance from abbass buoys the flimsy story , but her inner journey is largely unexplored and we 're left wondering about this exotic-looking woman whose emotional depths are only hinted at . | 0.48260054

## Classify my sentence

In [None]:
def classify_sentence(nf, tokenizer, bert, mlp, sentence):
    """Classify a single sentence and print the probability of class index 1.

    The sentence is lower-cased, written to a one-row TSV file and run through
    a data layer -> BERT -> classifier pipeline.

    Args:
        nf: factory object whose ``infer`` method runs the pipeline.
        tokenizer: tokenizer handed to the data layer.
        bert: encoder module producing the hidden states.
        mlp: classifier module applied to the encoder output.
        sentence: text to classify.

    Returns:
        None. Prints ``<sentence> | <probability>``.
    """
    # Tabs or line breaks inside the text would add columns/rows to the
    # one-row TSV below, so they are replaced with plain spaces.
    sentence = sentence.lower().replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
    tmp_file = "data/tmp_sentence.tsv"
    with open(tmp_file, 'w') as tmp_tsv:
        tmp_tsv.write('sentence\tlabel\n')
        # The label column is filled with a placeholder 0; the label output of
        # the data layer is discarded below.
        tmp_tsv.write(sentence + '\t0\n')

    tmp_data = BertTextClassificationDataLayer(
        input_file=tmp_file,
        tokenizer=tokenizer,
        max_seq_length=128,
        shuffle=False,
        num_samples=-1, # lower for dev, -1 for all dataset
        batch_size=1
    )
    tmp_input, tmp_token_types, tmp_attn_mask, _ = tmp_data()
    tmp_embeddings = bert(input_ids=tmp_input,
                          token_type_ids=tmp_token_types,
                          attention_mask=tmp_attn_mask)
    tmp_logits = mlp(hidden_states=tmp_embeddings)
    # Only the logits are read afterwards, so only they are requested.
    tmp_logits_tensors = nf.infer(tensors=[tmp_logits])
    # Explicit dim=1: softmax over the class axis (implicit dim is deprecated).
    tmp_probs = f.softmax(torch.cat(tmp_logits_tensors[0]), dim=1).numpy()[:, 1]
    print(f'{sentence} | {tmp_probs[0]}')

In [None]:
# Example inputs: uncomment one of the alternatives (or write your own sentence)
# and re-run this cell to classify it.
#sentence = 'point break is the best movie of all time'
#sentence = 'the movie was a wonderful exercise in understanding the struggles of native americans'
#sentence = 'the performance of diego luna had me excited and annoyed at the same time'
sentence = 'matt damon is the only good thing about this film'
classify_sentence(nf, tokenizer, bert, mlp, sentence)

## Understanding and Visualizing BERT Embeddings

Now that we've fine-tuned our BERT model, let's see if the word embeddings have changed.

In [None]:
# Run the encoder over a small file of sentences and show the resulting
# embeddings as an image.
data_path = 'data/positive_negative.tsv'
# positive negative spectrum
spectrum_data = BertTextClassificationDataLayer(
    input_file=data_path,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    shuffle=False,
    num_samples=-1, # lower for dev, -1 for all dataset
    batch_size=batch_size,
    dataset_type=BertTextClassificationDataset
)

spectrum_input, spectrum_token_types, spectrum_attn_mask, spectrum_labels = spectrum_data()

spectrum_embeddings = bert(input_ids=spectrum_input,
                        token_type_ids=spectrum_token_types,
                        attention_mask=spectrum_attn_mask)

spectrum_embeddings_tensors = nf.infer(tensors=[spectrum_embeddings])

# [0][0] takes the first requested tensor and then its first element only.
# NOTE(review): assumes the whole file fits into a single batch — confirm.
# [:,0,:] keeps index 0 of the second axis for every row (presumably the
# first token position of each sentence — TODO confirm).
plt.figure(figsize=(100,100))
plt.imshow(spectrum_embeddings_tensors[0][0][:,0,:].numpy())

In [None]:
# Sentences are read again from the file so they can be used as point labels.
spectrum_df = pd.read_csv(data_path, delimiter='\t')

# One activation vector per sentence: index 0 along the second axis.
spectrum_activations = spectrum_embeddings_tensors[0][0][:, 0, :].numpy()

# Project the activations to 2-D with t-SNE (fixed random_state for repeatability).
tsne_spectrum = TSNE(
    n_components=2,
    perplexity=10,
    verbose=1,
    learning_rate=2,
    random_state=123,
).fit_transform(spectrum_activations)

fig = plt.figure(figsize=(10, 10))
# Rows 0-10 are drawn as red crosses and the remaining rows as blue dots
# (presumably the file lists one group of 11 sentences first — TODO confirm).
plt.plot(tsne_spectrum[:11, 0], tsne_spectrum[:11, 1], 'rx')
plt.plot(tsne_spectrum[11:, 0], tsne_spectrum[11:, 1], 'bo')
# Write each sentence slightly above its point.
for (x, y), label in zip(tsne_spectrum, spectrum_df.sentence.values.tolist()):
    plt.annotate(
        label,                       # text to draw
        (x, y),                      # point being labelled
        textcoords="offset points",  # interpret xytext as an offset from the point
        xytext=(0, 10),              # offset of the text from the point
        ha='center',                 # centre the text horizontally
    )

In [None]:
# One activation vector per sentence: index 0 along the second axis.
spectrum_activations = spectrum_embeddings_tensors[0][0][:, 0, :].numpy()

# Project the activations onto their first two principal components.
pca_spectrum = PCA(n_components=2).fit_transform(spectrum_activations)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
# Rows 0-10 are drawn as red crosses and the remaining rows as blue dots
# (presumably the file lists one group of 11 sentences first — TODO confirm).
ax.plot(pca_spectrum[:11, 0], pca_spectrum[:11, 1], 'rx')
ax.plot(pca_spectrum[11:, 0], pca_spectrum[11:, 1], 'bo')
# Write each sentence slightly above its point; ax is the current axes, so
# annotating through it targets the same plot.
for (x, y), label in zip(pca_spectrum, spectrum_df.sentence.values.tolist()):
    ax.annotate(
        label,                       # text to draw
        (x, y),                      # point being labelled
        textcoords="offset points",  # interpret xytext as an offset from the point
        xytext=(0, 10),              # offset of the text from the point
        ha='center',                 # centre the text horizontally
    )