<a href="https://colab.research.google.com/github/Kaiziferr/NLP_Workshop/blob/master/BERT/01_tokenizador_BERT_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [6]:
# Install BERT
!pip install bert-for-tf2
# Install sentencepiece permmite llamar correctamente a BERT
!pip install sentencepiece



In [7]:
import tensorflow as tf
# Modulos comunidad => PErmite descargar los pesos con los que google entreno a BERT
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

# **Preprocesado de datos**

---
## Carga de los datos


In [8]:
# ficheros desde Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
cols = ['sentiment', 'id', 'date', 'query', 'user', 'text']
data = pd.read_csv('/content/drive/MyDrive/IA/BERT/training.csv', header = None, names = cols, engine = 'python', encoding='latin1')

In [10]:
data.head

<bound method NDFrame.head of          sentiment  ...                                               text
0                0  ...  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1                0  ...  is upset that he can't update his Facebook by ...
2                0  ...  @Kenichan I dived many times for the ball. Man...
3                0  ...    my whole body feels itchy and like its on fire 
4                0  ...  @nationwideclass no, it's not behaving at all....
...            ...  ...                                                ...
1599995          4  ...  Just woke up. Having no school is the best fee...
1599996          4  ...  TheWDB.com - Very cool to hear old Walt interv...
1599997          4  ...  Are you ready for your MoJo Makeover? Ask me f...
1599998          4  ...  Happy 38th Birthday to my boo of alll time!!! ...
1599999          4  ...  happy #charitytuesday @theNSPCC @SparksCharity...

[1600000 rows x 6 columns]>

In [11]:
data.drop(['id', 'date', 'query', 'user'], axis = 1, inplace = True)

In [12]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## **Preprocessing**

In [13]:
def clean_tweet(tweet):
  soup = BeautifulSoup(tweet, 'lxml').get_text()
  # Deleted @
  soup = re.sub(r'@[a-zA-Z0-9]+', ' ', soup)
  # Deleted URL
  soup = re.sub(r'https?://[A-Za-z0-9./]+', ' ', soup)
  # keep only letters
  soup = re.sub(r"[^a-zA-Z.!?']", " ", soup)
  # Add space
  soup = re.sub(r" +", ' ',soup)
  return soup

In [14]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [108]:
data_clean[0:5]

[" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D",
 "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!",
 ' I dived many times for the ball. Managed to save The rest go out of bounds',
 'my whole body feels itchy and like its on fire ',
 " no it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "]

In [16]:
# En la http://help.sentiment140.com/for-students se indica que en la columna sentimiento los valores 0 son negativos y los 4 positivos, por lo tanto
# se remplaza el valor 4 por 1
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

## **Tokenizador** (Primer capa)

---



Se aplica la capa de BERT, para tener acceso a los meta datos para el tokenizador (como el tamaño del vocabulario)

In [17]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
# Capa Bert para procesar, se trae la arquitectura, parametros, pesos
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=False)

In [18]:
# Obtener el diccionario completo, hace parte de BERT
vocab = bert_layer.resolved_object.vocab_file.asset_path.numpy()
vocab

b'/tmp/tfhub_modules/03d6fb3ce1605ad9e5e9ed5346b2fb9623ef4d3d/assets/vocab.txt'

In [19]:
# Minusculas convertido a minusculas
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

In [20]:
tokenizer = FullTokenizer(vocab, do_lower_case)

In [21]:
# Tokens por palabra
tokenizer.tokenize("My dog lovers strawberries.")

['my', 'dog', 'lovers', 'straw', '##berries', '.']

In [22]:
# Tokens por numeros
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog lovers strawberries."))

[2026, 3899, 10205, 13137, 20968, 1012]

In [23]:
# De numeros a letras
tokenizer.convert_ids_to_tokens([2026, 3899, 10205, 13137, 20968, 1012])

['my', 'dog', 'lovers', 'straw', '##berries', '.']

In [24]:
def encode_sentence(sent):
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [25]:
data_input = [encode_sentence(sentence) for sentence in data_clean]

## **Creación Dataset**

---



Se debe crear el padded batches (Por lo que se rellena las frases para cada lote de forma independiente), de esta forma se debe añadir el minimo de números de tokens de padding posible. Para eso, se debe ordenar las frases por logitud, aplicando padded_batches y luego se mezclan

In [27]:
# sentencia, la etiqueta, dimensión
data_with_len = [[sent, data_labels[i], len(sent)] for i, sent in enumerate(data_input)]

In [28]:
# Aleatorización
random.shuffle(data_with_len)

In [29]:
# Ordenar por dimension
data_with_len.sort(key=lambda x: x[2])

In [30]:
# Eliminar elementos con poca dimension
data_all = [(sent_lab[0], sent_lab[1]) for sent_lab in data_with_len if sent_lab[2] > 7]

In [31]:
# Generador "arreglar" las oración que no posena la misma dimensión, la ideas que posean la misma dimension
#  data_all la lista con el dato y la etiqueta. output_types es el tipo de dato de salida
all_dataset = tf.data.Dataset.from_generator(lambda: data_all, output_types=(tf.int32, tf.int32))

In [32]:
# Frase con 8 palabras
# Es un sentimiento numpy=0
# There are two tensort, uno de 8 dimensiones y el otro es unidimensional
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([ 4067,  2017,  2005,  1996,  2128,  2102, 28394,  2102],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [33]:
# Hay que definir  padding y asu vez se debe definir el batch size
# El entrenamiento se hara en blouqes de 32 frases
# Tamaño del padding padded_shapes = ((None, ), ()
# El primer elemento indicara la dimensión de las frases
# El segundo elemento indica la dimensión de las etiquetas () Indica que se deje la dimensión como se encuentre
BATCH_SIZE = 32
all_batche = all_dataset.padded_batch(BATCH_SIZE, padded_shapes = ((None, ), ()))

In [34]:
# Cada bloque tiene 32 frases
# Cada frase es de tamaño 8
# El arreglo de salida indica que las estiquetas estan en desorden, por lo tanto reduce el sesgo en los registros en los posteriores procedimientos
next(iter(all_batche))


(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 4067,  2017,  2005,  1996,  2128,  2102, 28394,  2102],
        [ 2316, 10698,  7929,  2156,  2017,  2012,  4830,  8218],
        [ 2008,  1005,  1055,  2200,  3835,  1997,  2017,  4283],
        [ 2293,  1996,  6876,  2520,  2047,  2000, 10474,  1060],
        [ 1045,  2228,  2027,  2024, 14763,  2005,  1037, 23775],
        [ 2067,  2000,  2147,  2651,  1012,  1012,  1012,  1012],
        [ 2166,  2003,  2074,  1037,  4605,  1997, 24188,  5134],
        [ 2633,  8271,  2039,  2013,  2115, 16571,  8840,  2140],
        [ 2064,  1005,  1056,  2644,  3241,  1005, 10094,  1046],
        [ 2003,  2938,  1999,  6370,  2465,   999,  2061, 11771],
        [ 9852, 18411,  2860,  3407,  5798,   999,   999,   999],
        [ 3153,  2035,  2305,  1012,  1012,  1012,  2293,  2009],
        [ 2183,  2000,  5438,  1996,  5194,  1999,  1037,  9587],
        [ 2025,  5458,   999,  5580,  1045,  2106,  2673,  2220],
        [ 6069,  3046,  5329

In [35]:
# Crear un conjunto de datos de entrnamiento y de test
NB_BATCHES = math.ceil(len(data_all)/BATCH_SIZE)
# 10% Entrenar
NB_BATCHES_TEST = NB_BATCHES // 10
# Aleatorizar los lotes, para que no esten sesgado
all_batche.shuffle(NB_BATCHES)
# Tomo el 10%
test_dataset = all_batche.take(NB_BATCHES_TEST)
# El 80%
train_dataset = all_batche.skip(NB_BATCHES_TEST)

# **MODEL**

---



In [76]:
class DCNN(tf.keras.Model):

  #  vocab_size     : Tamaño del vocabulario dado por el tokenizador
  #  emb_dim        : Capa de incrustraciones
  #  nb_filters     : Numero de filtros
  #  FFN_units      : Numero de neuronas de la capa oculta
  #  nb_classes     : Tipo de salid binario 
  #  dropout_rate   :
  #  training       : Indicar la fase de entrenamiento
  #  name           : nombre al modelo




  def __init__(self, vocab_size, emb_dim = 128, nb_filters = 50, FFN_units = 512, nb_classes = 2, dropout_rate = 0.1, training = False, name = "dcnn"):
    super(DCNN, self).__init__(name = name)

    # Capa de incrustración
    self.embedding = layers.Embedding(vocab_size, emb_dim)
    # Capas de convolución
    # Como el stride es 1 por defecto, no importa el valor del padding
    # Operación convolucional unidimensional
    # Anhura la define la dimensión del embedding
    # Analizan dos palabras consecutivas
    self.bigram = layers.Conv1D(filters=nb_filters, kernel_size = 2, padding='valid', activation='relu')
    # Analizan tres palabras consecutivas
    self.trigram = layers.Conv1D(filters=nb_filters, kernel_size = 3, padding='valid', activation='relu')
    # Analizan cuatro palabras consecutivas
    self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size = 4, padding='valid', activation='relu')

    # Se queda con el maximo de todos los valores (bigram, trigram, fourgram) 
    self.pool = layers.GlobalMaxPool1D()

    self.dense_1 = layers.Dense(FFN_units, activation = 'relu')
    self.dropout = layers.Dropout(rate = dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1, activation = "sigmoid")
    else:
      self.last_dense = layers.Dense(units=1, activation = "softmax")

  def call(self, inputs, training):
    X = self.embedding(inputs)
    X_1 = self.bigram(X) 
    X_1 = self.pool(X_1)
    X_2 = self.trigram(X)
    X_2 = self.pool(X_2)
    X_3 = self.fourgram(X)
    X_3 = self.pool(X_3)

    merged = tf.concat([X_1, X_2, X_3], axis = -1)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training)
    output = self.last_dense(merged)
    
    return output

# **Entrenamiento**

---



In [37]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [38]:
model = DCNN(vocab_size=VOCAB_SIZE, emb_dim=EMB_DIM, nb_filters=NB_FILTERS, FFN_units=FFN_UNITS, nb_classes=NB_CLASSES, dropout_rate=DROPOUT_RATE)

In [39]:
if NB_CLASSES == 2:
  model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
else:
  model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [40]:
checkpoin_path = "/content/drive/MyDrive/IA/BERT/checkpoin/"
ckpt = tf.train.Checkpoint(Dccn=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoin_path, max_to_keep = 1)
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Ultimo checkpoint restaurado!!')

In [41]:
class MyCustomCallback(tf.keras.callbacks.Callback):

  def on_epoch_end(self, epoch, logs = None):
    ckpt_manager.save()
    print(f'Checkpoint guardado en {checkpoin_path}')

In [42]:
model.fit(train_dataset, epochs=NB_EPOCHS, callbacks=[MyCustomCallback()])

Epoch 1/5
  37196/Unknown - 742s 19ms/step - loss: 0.4298 - accuracy: 0.8019Checkpoint guardado en /content/drive/MyDrive/IA/BERT/checkpoin/
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0316788f90>

# **Evaluación**

---



In [43]:
results = model.evaluate(test_dataset)
print(results)

[0.4424828290939331, 0.8314829468727112]


In [96]:
def get_prediction(sentence):
  token = encode_sentence(sentence)
  # Añade una dimensión adicional
  inputs = tf.expand_dims(token, 0)
  output = model(inputs, training = False)
  sentiment = math.floor(output*2)
  if sentiment == 0:
    print(f"Salida del modelo: {output}\n Sentimiento predicho es negativo")
  elif sentiment == 1:
    print(f"Salida del modelo: {output}\n Sentimiento predicho es positivo")

In [107]:
get_prediction("Crap, crap and totally crap. Did I mention this film was totally crap? Well, it's totally crap")

Salida del modelo: [[0.35293695]]
 Sentimiento predicho es negativo
