# Embedding

Em vez de substituir cada palavra por um número, iremos identificar as palavras como vetores em um espaço de n-dimensões.

O processo de embedding consiste em representar palavras (ou conjuntos de palavras associadas) como vetores em um espaço multidimensional.

Nesta semana do curso, será usado o dataset do IMDB para construirmos um classificador de críticas de filmes.

In [22]:
# Instalando os dataset do tensorflow
#!pip install --upgrade pip
#!pip install -q tensorflow-datasets

# 0.0 Imports

In [82]:
import tensorflow as tf
#import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import re
import io

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#print(tf.__version__)

## 0.1 loading Data

In [24]:
# Load the IMDB reviews CSV into a DataFrame.
# NOTE(review): absolute local Windows path - adjust when running elsewhere.
imdb_csv_path = r'D:\Cursos_DS\NLP_coursera\Data\IMDB_reviews\IMDB Dataset.csv'
imdb = pd.read_csv(imdb_csv_path)

# 1.0 Data Visualization

In [25]:
# Number of rows (reviews) in the dataset
len(imdb)

50000

## 1.1 Data Types

In [26]:
# Summarize the DataFrame: column names, non-null counts and dtypes
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [29]:
# Count how many reviews carry each sentiment label (class balance check)
imdb['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# 2.0 Data Preparation

## 2.1 Apply function

In [50]:
# Remove as tags de HTML
def _remove_tags_html(text):
    text = re.sub('<[^<]+?>', '', str(text))
    return text


# Remove e-mail
def _remove_email(text):
    text = ' '.join([w for w in text.split() if '@' not in w])
    return text

# Remove os Emojis
def _remove_emoji(string):
    emoji_pattern = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
                "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def _sub_char_especial_normal(text):
    # Substitui caracteres especiais por letras com acentos

    # Letra minúsculas
    text = re.sub(r"&aacute;", "á", text)
    text = re.sub(r"&atilde;", "ã", text)
    text = re.sub(r"&agrave;", "à", text)
    text = re.sub(r"&acirc;", "â", text)
    text = re.sub(r"&aring;", "a", text)
    text = re.sub(r"&auml;", "a", text)
    text = re.sub(r"&aelig;", "ae", text)
    text = re.sub(r"&eacute;", "é", text)
    text = re.sub(r"&ecirc;", "ê", text)
    text = re.sub(r"&egrave;", "è", text)
    text = re.sub(r"&euml;", "e", text)
    text = re.sub(r"&iacute;", "í", text)
    text = re.sub(r"&icirc;", "î", text)
    text = re.sub(r"&iuml;", "i", text)
    text = re.sub(r"&oacute;", "ó", text)
    text = re.sub(r"&otilde;", "õ", text)
    text = re.sub(r"&ocirc;", "ô", text)
    text = re.sub(r"&ograve;", "ò", text)
    text = re.sub(r"&uacute;", "ú", text)
    text = re.sub(r"&ucirc;", "û", text)
    text = re.sub(r"&ugrave;", "ù", text)
    text = re.sub(r"&uuml;", "u", text)
    text = re.sub(r"&ccedil;", "ç", text)
    text = re.sub(r"&ntilde;", "não", text)

    # Letras maiúsculas
    text = re.sub(r"&Aacute;", "Á", text)
    text = re.sub(r"&Atilde;","Ã", text) # Letra maiúscula
    text = re.sub(r"&Agrave;", "À", text)
    text = re.sub(r"&Acirc;", "Â", text)
    text = re.sub(r"&Aring;", "A", text)
    text = re.sub(r"&Auml;", "A", text)
    text = re.sub(r"&Aelig;", "AE", text)
    text = re.sub(r"&Eacute;", "É", text)
    text = re.sub(r"&Ecirc;", "Ê", text)
    text = re.sub(r"&Egrave;", "È", text)
    text = re.sub(r"&Euml;", "E", text)
    text = re.sub(r"&Iacute;", "Í", text)
    text = re.sub(r"&Icirc;", "Î", text)
    text = re.sub(r"&Iuml;", "I", text)
    text = re.sub(r"&Oacute;", "Ó", text)
    text = re.sub(r"&Otilde;", "Õ", text)
    text = re.sub(r"&Ocirc;", "Ô", text)
    text = re.sub(r"&Ograve;", "Ò", text)
    text = re.sub(r"&Uacute;", "Ú", text)
    text = re.sub(r"&Ucirc;", "Û", text)
    text = re.sub(r"&Ugrave;", "Ù", text)
    text = re.sub(r"&Uuml;", "U", text)
    text = re.sub(r"&Ccedil;", "Ç", text)
    text = re.sub(r"&Ntilde;", "Não", text)
    text = re.sub(r"&Yacute;", "Y", text)

    #any special char to be replaced
    text = re.sub(r"&lt;", "<", text)
    text = re.sub(r"&gt;", ">", text)
    text = re.sub(r"&amp;", "&", text)
    text = re.sub(r"&reg;", "&", text) 
    text = re.sub(r"&copy;", "&", text) 
    text = re.sub(r"&quot;","'", text)
    text = re.sub(r"&lsquo;","'", text)
    text = re.sub(r"&rsquo;","'", text)
    return text

# Termos em code HTML
def _remove_char_html(text):
    text = re.sub(r"&ndash;","", text)
    text = re.sub(r"&ordf;","", text)
    text = re.sub(r"&nbsp;","", text)
    text = re.sub(r"&cent;","", text)
    text = re.sub(r"&pound;","", text)
    text = re.sub(r"&sect;","", text)
    text = re.sub(r"&copy;","", text)
    text = re.sub(r"&laquo;","", text)
    text = re.sub(r"&raquo;","", text)
    text = re.sub(r"&reg;","", text)
    text = re.sub(r"&deg;","", text)
    text = re.sub(r"&plusmn;","", text)
    text = re.sub(r"&para;","", text)
    text = re.sub(r"&middot;","", text)
    text = re.sub(r"&frac12;","", text)
    text = re.sub(r"&mdash;","", text)
    text = re.sub(r"&lsquo;","", text)
    text = re.sub(r"&rsquo;","", text)
    text = re.sub(r"&sbquo;","", text)
    text = re.sub(r"&ldquo;","", text)
    text = re.sub(r"&rdquo;","", text)
    text = re.sub(r"&bdquo;","", text)
    text = re.sub(r"&dagger;","", text)
    text = re.sub(r"&Dagger;","", text)
    text = re.sub(r"&bull;","", text)
    text = re.sub(r"&hellip;","", text)
    text = re.sub(r"&prime;","", text)
    text = re.sub(r"&Prime;","", text)
    text = re.sub(r"&euro;","", text)
    text = re.sub(r"&trade;","", text)
    text = re.sub(r"&asymp;","", text)
    text = re.sub(r"&ne;","", text)
    text = re.sub(r"&le;","", text)
    text = re.sub(r"&ge;","", text)
    text = re.sub(r"&lt;","", text)
    text = re.sub(r"&gt;","", text)
    return text

SyntaxError: EOL while scanning string literal (209614605.py, line 103)

In [37]:
def transform_text(text):
    """Run the review-cleaning pipeline on ``text`` and return the result.

    The steps run in this order: HTML tag removal, removal of tokens
    containing ``@``, emoji removal, decoding of accented/special entities,
    and finally deletion of the remaining listed HTML entities.
    """
    cleaning_steps = (
        _remove_tags_html,
        _remove_email,
        _remove_emoji,
        _sub_char_especial_normal,
        _remove_char_html,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [38]:
# Make sure every review is a string, then clean each one and store the
# cleaned text in its own column (the raw review is kept).
imdb['review'] = imdb['review'].map(str)
imdb['review_transform'] = imdb['review'].map(transform_text)

## 2.2 Map labels

In [39]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
dictionary = dict(positive=1, negative=0)

imdb['sentiment_num'] = imdb['sentiment'].map(dictionary)

In [40]:
# Preview the first rows, including the new cleaned-text and numeric-label columns
imdb.head()

Unnamed: 0,review,sentiment,sentiment_num,review_transform
0,One of the other reviewers has mentioned that ...,positive,1,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,1,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...,positive,1,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,0,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,"Petter Mattei's ""Love in the Time of Money"" is..."


## 2.3 Train and test

In [66]:
# Positional split: first 25000 rows for training, the rest for testing.
# NOTE(review): no shuffling is done here - this relies on the CSV order
# already mixing both classes; confirm.
model_columns = ['review_transform', 'sentiment_num']
model_frame = imdb[model_columns]
train_data = model_frame.iloc[:25000]
test_data = model_frame.iloc[25000:]

In [67]:
# Pull the cleaned reviews (forced to str) and their numeric labels out of
# the DataFrames into plain Python lists.
training_sentences = [str(review) for review in train_data['review_transform']]
training_labels = list(train_data['sentiment_num'])

testing_sentences = [str(review) for review in test_data['review_transform']]
testing_labels = list(test_data['sentiment_num'])


In [68]:
# Keras expects array-like labels, so turn the label lists into NumPy arrays.
training_labels_final = np.asarray(training_labels)
testing_labels_final = np.asarray(testing_labels)

# 3.0 Preprocessing

## 3.1 Configuration

In [69]:
# Hyper-parameters shared by the tokenizer, the padding step and the models.
vocab_size = 10_000    # tokenizer num_words and Embedding input size
embeding_dim = 16      # size of each embedding vector
max_lenght = 1_000     # every review is padded/truncated to this many tokens
trunc_type = 'post'    # passed to pad_sequences as `truncating`
oov_tok = 'XXXXXX'     # passed to the tokenizer as `oov_token`

## 3.2 Tokenization

In [70]:
# Build the vocabulary from the training reviews only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Turn each training review into a list of word indices, then pad/truncate
# every list to max_lenght tokens.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_lenght, truncating=trunc_type)

In [71]:
# Encode the test reviews with the tokenizer fitted on the training set.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
# Use the same truncation side as the training data: without `truncating`,
# pad_sequences falls back to 'pre' and long test reviews would keep their
# end while long training reviews keep their beginning.
testing_padded = pad_sequences(testing_sequences,
                               maxlen=max_lenght,
                               truncating=trunc_type)

# 4.0 Model - Rede Neural

## 4.1 Construindo rede neural - Model_1

In [72]:
# Model 1: embedding -> flatten -> small dense layer -> sigmoid output.
model = tf.keras.Sequential()
# This is where the magic really happens: each word index becomes a vector.
model.add(tf.keras.layers.Embedding(vocab_size, embeding_dim, input_length=max_lenght))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [73]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [74]:
# Print the layers, output shapes and parameter counts of model 1
model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1000, 16)          160000    
                                                                 
 flatten_1 (Flatten)         (None, 16000)             0         
                                                                 
 dense_6 (Dense)             (None, 6)                 96006     
                                                                 
 dense_7 (Dense)             (None, 1)                 7         
                                                                 
Total params: 256,013
Trainable params: 256,013
Non-trainable params: 0
_________________________________________________________________


### 4.1.2 Model fit

In [75]:
# Number of passes over the training data, used by both model fits below
num_epochs = 10


In [76]:
# Train model 1; the test split is used as the validation set.
validation_pair = (testing_padded, testing_labels_final)
history = model.fit(
    x=padded,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=validation_pair,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## 4.2 Construindo rede Neural - Model_2

In [62]:
# Model 2: same as model 1, but the embeddings are averaged over the sequence
# (GlobalAveragePooling1D) instead of being flattened.
model_2 = tf.keras.Sequential()
# This is where the magic really happens: each word index becomes a vector.
model_2.add(tf.keras.layers.Embedding(vocab_size, embeding_dim, input_length=max_lenght))
model_2.add(tf.keras.layers.GlobalAveragePooling1D())
model_2.add(tf.keras.layers.Dense(6, activation='relu'))
model_2.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [63]:
# Same training configuration as model 1.
model_2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [64]:
# Print the layers, output shapes and parameter counts of model 2
model_2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1000, 16)          160000    
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_4 (Dense)             (None, 6)                 102       
                                                                 
 dense_5 (Dense)             (None, 1)                 7         
                                                                 
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [65]:
# Train model 2 with the same data, epochs and validation set as model 1.
model_2.fit(
    x=padded,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f7e2ba5130>

# 5.0 Embeddings

In [77]:
# Fetch the learned weights of the embedding layer (the first layer of model 1)
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # Shape: (Vocab_size, embedding_dim)
# 10000 words and 16 dimensions

(10000, 16)


In [78]:
# To label each embedding vector with its word, invert the word index so
# that it maps integer index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

### 5.2 Escrevendo os vetores em formatos de metadados

In [83]:
# Open the two output files: vecs.tsv receives the vector values and
# meta.tsv the matching words. Both are closed after the writing loop.
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

In [84]:
# Write one line per word: the word to meta.tsv and its tab-separated
# embedding values to vecs.tsv. Index 0 is skipped (the loop starts at 1).
# The upper bound is clamped so a vocabulary smaller than vocab_size does not
# raise KeyError (assumes word indices are contiguous from 1 - TODO confirm).
last_word_num = min(vocab_size, len(reverse_word_index) + 1)
try:
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + '\n')
        out_v.write('\t'.join(map(str, embeddings)) + '\n')
finally:
    # Close both files even if the loop fails part-way through.
    out_m.close()
    out_v.close()

O Embedding Projector do TensorFlow (projector.tensorflow.org) lê esse tipo de arquivo e usa os dados para plotar os vetores das palavras em um gráfico 3D.