# Text Generation with GluonNLP

In [1]:
!pip install -q gluonnlp

In [8]:
!pip install -U regex

Defaulting to user installation because normal site-packages is not writeable
Collecting regex
  Downloading regex-2020.11.13-cp37-cp37m-manylinux2014_x86_64.whl (719 kB)
[K     |████████████████████████████████| 719 kB 12.9 MB/s eta 0:00:01
[?25hInstalling collected packages: regex
Successfully installed regex-2020.11.13


In [3]:
# Imports
import numpy as np
import mxnet as mx
import gluonnlp as nlp

# https://github.com/dmlc/gluon-nlp
import text_generation.model

In [4]:
# Versions of the packages used in this Jupyter notebook
%reload_ext watermark
%watermark -a "Data Science Academy" --iversions

mxnet    1.7.0
json     2.0.9
autopep8 1.4.4
numpy    1.18.1
gluonnlp 0.10.0
Data Science Academy


In [5]:
# Run on the CPU (a GPU is not required for this case study)
ctx = mx.cpu()

In [6]:
# Load the pre-trained text-generation model together with its vocabulary
model, vocab = text_generation.model.get_model(name = 'gpt2_117m',
                                               dataset_name = 'openai_webtext',
                                               pretrained = True,
                                               ctx = ctx)

Vocab file is not found. Downloading.
Downloading /home/lucas/.mxnet/models/2492485986095506788/2492485986095506788_openai_webtext-f917dc78.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/openai_webtext-f917dc78.zip...
Downloading /home/lucas/.mxnet/models/gpt2_117m_openai_webtext-26416f2e.zipa24a24d8-1f33-42b1-98ba-27e7d4956595 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/gpt2_117m_openai_webtext-26416f2e.zip...


In [9]:
# Create the tokenizer
tokenizer = nlp.data.GPT2BPETokenizer()

BPE rank file is not found. Downloading.
Downloading /home/lucas/.mxnet/models/1605787627.4260728openai_webtext_bpe_ranks-396d4d8e.json from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/openai_webtext_bpe_ranks-396d4d8e.zip...


In [10]:
# Create the detokenizer as well (used to display the generated text)
detokenizer = nlp.data.GPT2BPEDetokenizer()

In [11]:
# Look up the vocabulary entry of the end-of-text token and show the token itself
eos_id = vocab[vocab.eos_token]
print(vocab.eos_token)

<|endoftext|>


## Sequence Sampler

In [53]:
# This string is used as the starting point (prompt) for text generation
bos_str = 'School Teacher and students'

In [54]:
# Make sure the prompt starts with a space (prepend one only when it is missing)
bos_str = bos_str if bos_str.startswith(' ') else ' ' + bos_str

In [55]:
# Tokenize the string
bos_tokens = tokenizer(bos_str)

In [56]:
# Look up the vocabulary entry for each prompt token (presumably the token ids,
# given the name and their later use as an int32 array), then show the tokens
bos_ids = vocab[bos_tokens]
print(bos_tokens)

['ĠSchool', 'ĠTeacher', 'Ġand', 'Ġstudents']


In [57]:
# Decoder class
class GPT2Decoder(text_generation.model.LMDecoder):
    """Callable adapter that runs one step of the wrapped network (``self.net``)."""

    def __call__(self, inputs, states):
        """Run the wrapped network on ``inputs`` and return its reshaped output.

        Parameters
        ----------
        inputs : NDArray
            Decoder input for the current step; an extra axis is inserted at
            position 1 before it is passed to the network.
        states : object
            State forwarded unchanged to ``self.net``.

        Returns
        -------
        tuple
            ``(out, new_states)`` where ``out`` is index 0 along axis 1 of the
            network output, reshaped to ``(inputs.shape[0], -1)``, and
            ``new_states`` is the state returned by the network.
        """
        # Insert an axis at position 1 before calling the network
        step_inputs = inputs.expand_dims(axis = 1)
        batch_size = step_inputs.shape[0]

        # Forward pass through the wrapped network
        net_out, next_states = self.net(step_inputs, states)

        # Keep only index 0 along axis 1 and flatten everything after the first axis
        first_slice = mx.nd.slice_axis(net_out, axis = 1, begin = 0, end = 1)
        return first_slice.reshape((batch_size, -1)), next_states

In [58]:
# Instantiate the decoder around the pre-trained model
decoder = GPT2Decoder(model)

In [59]:
# Function that builds the initial input and state
def get_initial_input_state(decoder, bos_ids, temperature):
    """Feed the prompt through the network and sample the first input token.

    Parameters
    ----------
    decoder : object
        Decoder whose ``net`` attribute is called on the prompt.
    bos_ids : list of int
        Prompt ids; wrapped in an outer list to form an int32 array.
    temperature : float
        Divisor applied to the last-position network output before the softmax.

    Returns
    -------
    tuple
        ``(inputs, begin_states)``: the int32 token sampled from the
        temperature-scaled distribution, and the state returned by the network.

    Notes
    -----
    Reads the module-level ``ctx`` to decide where the prompt array is placed.
    """
    # Run the whole prompt through the network, starting from no state
    prompt = mx.nd.array([bos_ids], dtype = np.int32, ctx = ctx)
    net_out, begin_states = decoder.net(prompt, None)

    # Only the output at the last position is used to pick the next token
    last_step = net_out[:, -1, :]

    # Temperature-scaled probabilities, then one multinomial draw
    probs = (last_step / temperature).softmax(axis = 1)
    first_token = mx.nd.sample_multinomial(probs, dtype = np.int32)

    return first_token, begin_states

In [60]:
# Sampling hyperparameters
beam_size = 2
temperature = 0.97
num_results = 2
# Leave room for the prompt tokens within a 256-token budget
max_len = 256 - len(bos_tokens)

In [61]:
# Create the sequence sampler from the decoder and the hyperparameters above
sampler = nlp.model.SequenceSampler(beam_size = beam_size,
                                    decoder = decoder,
                                    eos_id = eos_id,
                                    max_length = max_len,
                                    temperature = temperature)

In [62]:
# Text-generation function
def generate(decoder, bos_ids, temperature, sampler, num_results, vocab):
    """Sample continuations of the prompt and print each one with its score.

    Parameters
    ----------
    decoder : object
        Decoder forwarded to ``get_initial_input_state``.
    bos_ids : list of int
        Vocabulary ids of the prompt tokens. The printed prompt prefix is
        rebuilt from these ids, so it always matches the prompt that was
        actually fed to the model.
    temperature : float
        Temperature forwarded to ``get_initial_input_state``.
    sampler : callable
        Called as ``sampler(inputs, begin_states)``; must return
        ``(samples, scores, valid_lengths)``.
    num_results : int
        Maximum number of sequences to print. Capped at the number of
        sequences the sampler actually returned.
    vocab : object
        Vocabulary exposing ``idx_to_token``.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    Uses the module-level ``detokenizer`` to turn tokens back into text.
    """
    # Initial input token and state
    inputs, begin_states = get_initial_input_state(decoder, bos_ids, temperature)

    # Samples, scores and valid lengths
    samples, scores, valid_lengths = sampler(inputs, begin_states)

    # Keep only the first entry along the leading axis and convert to numpy
    samples = samples[0].asnumpy()
    scores = scores[0].asnumpy()
    valid_lengths = valid_lengths[0].asnumpy()

    # Rebuild the prompt tokens from the ids passed in, instead of reading a
    # global that may be stale when cells are executed out of order.
    # (Assumes every prompt token round-trips through the vocabulary.)
    prompt_tokens = [vocab.idx_to_token[int(idx)] for idx in bos_ids]

    # Never index past the sequences the sampler actually produced
    num_to_show = min(num_results, len(samples))

    # Result
    print('\nResultado Gerado:\n')
    for i in range(num_to_show):

        # Tokens of the i-th sample, truncated to its valid length
        length = int(valid_lengths[i])
        generated_tokens = [vocab.idx_to_token[int(ele)] for ele in samples[i][:length]]

        # Append the generated tokens (skipping the first one) to the prompt
        tokens = prompt_tokens + generated_tokens[1:]

        # Undo the tokenization to show the result as plain text
        print([detokenizer(tokens).strip(), scores[i]])

In [63]:
# Run the text generator with the sequence sampler
generate(decoder, bos_ids, temperature, sampler, num_results, vocab)


Resultado Gerado:

['School Teacher and students color. So dominant was that agenda for her that knowing she is still a Monument to Green was almost impossible in 1995. Peter and Anna were students at arts college and they were especially sad mom and say mom couples," said the boy from Fort Collins So efforts to the fourteen year old, a notch among others that day to rediscover his hate and of NNipancity community \'nn\'r all. No ing aeganaimes moments on1.\n\n\nI long journey out and learn soonted atlevend Burst sharing the ratio, Vikings globered a marriage to weak religion. An element of Nood meaning that fear for Mountain and dont uot aya of loch of motion under pulses tillNutus, Noodu. But gent cause nidenukul to wrongtde wors mor ageo utall. . Nor city and workaround because monk . God.\n faith among not ration himself in poor de certirers nisi tiis whenuka on nuke ginia uld limit rather Nkr ulole 2\nSee unsrew by in AI their lener drummer abf n ]. chonson with any madinda lazed

## Beam Search Sampler

In [50]:
# Create the scorer that is handed to the beam-search sampler
# NOTE(review): alpha / K look like length-penalty settings, and from_logits = False
# presumably means the decoder output is not already log-probabilities — confirm in the GluonNLP docs
scorer = nlp.model.BeamSearchScorer(alpha = 0, K = 5, from_logits = False)

In [51]:
# Create the beam-search sampler, reusing the decoder, end-of-text id and length limit
beam_sampler = nlp.model.BeamSearchSampler(beam_size = 4,
                                           decoder = decoder,
                                           eos_id = eos_id,
                                           scorer = scorer,
                                           max_length = max_len)

In [52]:
# Generate text with the beam-search sampler
generate(decoder, bos_ids, temperature, beam_sampler, num_results, vocab)


Resultado Gerado:

['Soccer, basketball and baseball the same time.\n\n"I think it\'s a great opportunity for us to be able to compete in the same way we compete in the NBA," he said.\n\n"I think it\'s a great opportunity for us to be able to compete in the same way we compete in the NBA."\n\n\n"I think it\'s a great opportunity for us to be able to be able to compete in the same way we compete in the NBA."\n\n\n\n"I think it\'s a great opportunity to be able to be able to be able to compete in the same way we compete in the same way we compete in the NBA."\n\n\n\n"I think it\'s great opportunity to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able