<a href="https://colab.research.google.com/github/Kaiziferr/NLP_Workshop/blob/master/BERT/02_embedding_BERT_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [None]:
# Install the BERT package for TensorFlow 2
!pip install bert-for-tf2
# Install sentencepiece, which allows BERT to be called correctly
!pip install sentencepiece



In [None]:
import tensorflow as tf
# TensorFlow Hub (community modules): used to download the weights Google trained BERT with
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

# **Preprocesado de datos**

---
## Carga de los datos


In [None]:
# Mount Google Drive so the files stored there can be read
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Column names for the CSV, which is read without a header row
cols = ['sentiment', 'id', 'date', 'query', 'user', 'text']
# Load the training file from the mounted Drive folder
data = pd.read_csv(
    '/content/drive/MyDrive/IA/BERT/training.csv',
    names=cols,
    header=None,
    encoding='latin1',
    engine='python',
)

In [None]:
# Preview the first rows of the raw data (head must be called; a bare
# `data.head` only displays the bound-method repr)
data.head()

<bound method NDFrame.head of          sentiment  ...                                               text
0                0  ...  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1                0  ...  is upset that he can't update his Facebook by ...
2                0  ...  @Kenichan I dived many times for the ball. Man...
3                0  ...    my whole body feels itchy and like its on fire 
4                0  ...  @nationwideclass no, it's not behaving at all....
...            ...  ...                                                ...
1599995          4  ...  Just woke up. Having no school is the best fee...
1599996          4  ...  TheWDB.com - Very cool to hear old Walt interv...
1599997          4  ...  Are you ready for your MoJo Makeover? Ask me f...
1599998          4  ...  Happy 38th Birthday to my boo of alll time!!! ...
1599999          4  ...  happy #charitytuesday @theNSPCC @SparksCharity...

[1600000 rows x 6 columns]>

In [None]:
# Keep only the label and the tweet text.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: an in-place drop raises KeyError once the columns are gone.
data = data.drop(columns=['id', 'date', 'query', 'user'], errors='ignore')

In [None]:
# Preview the first rows after removing the unused columns
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## **Preprocessing**

In [None]:
def clean_tweet(tweet):
  """Return the tweet text reduced to letters and basic punctuation.

  Steps, in order: extract the plain text from any HTML markup, remove
  @mentions, remove http(s) URLs, replace every character that is not an
  ASCII letter or one of . ! ? ' with a space, and collapse runs of spaces.

  Args:
    tweet: raw tweet text (str).

  Returns:
    The cleaned text (str). A single leading/trailing space may remain.
  """
  text = BeautifulSoup(tweet, 'lxml').get_text()
  # Remove @mentions. The underscore is part of the pattern so that a handle
  # such as "@some_user" is removed whole instead of leaving "user" behind.
  text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)
  # Remove URLs up to the next whitespace, so that URLs containing characters
  # such as - _ ? = & are dropped whole instead of leaving fragments.
  text = re.sub(r'https?://\S+', ' ', text)
  # Keep only letters and the punctuation . ! ? '
  text = re.sub(r"[^a-zA-Z.!?']", " ", text)
  # Collapse consecutive spaces into a single one
  text = re.sub(r" +", ' ', text)
  return text

In [None]:
# Clean every tweet of the text column
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Show the first five cleaned tweets
data_clean[:5]

[" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D",
 "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!",
 ' I dived many times for the ball. Managed to save The rest go out of bounds',
 'my whole body feels itchy and like its on fire ',
 " no it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "]

In [None]:
# Per http://help.sentiment140.com/for-students, the sentiment column uses 0 for
# negative and 4 for positive tweets, so 4 is replaced by 1.
# Work on an explicit copy: `.values` may share memory with `data` (silently
# rewriting the DataFrame column) or be read-only under pandas copy-on-write.
data_labels = data.sentiment.to_numpy(copy=True)
data_labels[data_labels == 4] = 1

## **Tokenizador** (Primera capa)

---



Se carga la capa de BERT para tener acceso a los metadatos que necesita el tokenizador (como el fichero de vocabulario).

In [None]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
# BERT layer from TF Hub (brings the architecture, parameters and weights), loaded with trainable=False
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=False)

# Vocabulary file and lower-casing flag read from the loaded layer, used to build the tokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
def encode_sentence(sent):
  """Tokenize `sent` and wrap the tokens between "[CLS]" and "[SEP]"."""
  return ["[CLS]", *tokenizer.tokenize(sent), "[SEP]"]

In [None]:
# Tokenize every cleaned tweet
data_inputs = list(map(encode_sentence, data_clean))

## **Creación Dataset**

---



Se deben crear *padded batches*: las frases se rellenan de forma independiente en cada lote, de modo que se añada el mínimo número posible de tokens de padding. Para ello, las frases se ordenan por longitud, se aplica `padded_batch` y luego se mezclan los lotes.

In [None]:
# Numeric tokens
def get_ids(tokens):
  """Return the ids that the module-level `tokenizer` assigns to `tokens`."""
  return tokenizer.convert_tokens_to_ids(tokens)

# Mask for the padding tokens: 0 where the token is "[PAD]", 1 everywhere else
def get_mask(tokens):
  """Return an integer array flagging the tokens that are not "[PAD]"."""
  not_padding = np.char.not_equal(tokens, "[PAD]")
  return not_padding.astype(int)

# Zeros mark the first fragment; after each "[SEP]" the id switches (0 -> 1 -> 0 ...)
def get_segments(tokens):
  """Return one segment id (0 or 1) per token.

  A "[SEP]" token still belongs to the fragment it closes; the id flips for
  the tokens that follow it.
  """
  segment_ids = []
  seps_seen = 0
  for token in tokens:
    # The parity of the separators seen so far is the current segment id
    segment_ids.append(seps_seen % 2)
    if token == "[SEP]":
      seps_seen += 1
  return segment_ids

In [None]:
# One entry per tweet: [tokens, label, number of tokens]
data_with_len = [[sent, label, len(sent)] for sent, label in zip(data_inputs, data_labels)]

In [None]:
# Randomize the order of the entries.
# A fixed seed makes the order (and therefore the later train/test split)
# identical across sessions; this matters because checkpoints are restored
# between sessions and the test batches must stay unseen by the model.
random.Random(42).shuffle(data_with_len)

In [None]:
# Sort by number of tokens, from the shortest to the longest
data_with_len.sort(key=lambda entry: entry[2])

In [None]:
# Discard the entries with 7 tokens or fewer and, for the rest, build
# ([ids, mask, segments], label) from the sentence tokens
data_all = [
    ([get_ids(tokens), get_mask(tokens), get_segments(tokens)], label)
    for tokens, label, n_tokens in data_with_len
    if n_tokens > 7
]

In [None]:
# A generator-backed dataset is used because the sentences do not all have the same length
# data_all is the list of (inputs, label) pairs; output_types is the dtype of each output
all_dataset = tf.data.Dataset.from_generator(lambda: data_all, output_types=(tf.int32, tf.int32))

In [None]:
# The padding has to be defined together with the batch size
# Training is done in blocks of 32 sentences
# padded_shapes = ((3, None), ()):
#   - first element: shape of the inputs; 3 rows (ids, mask, segments) and a token axis left as None
#   - second element: shape of the labels; () leaves the label as it is
# padding_values = (0, 0): value used to fill the padded positions
BATCH_SIZE = 32
all_batche = all_dataset.padded_batch(BATCH_SIZE, padded_shapes = ((3, None), ()), padding_values=(0, 0))

In [None]:
# Each block has 32 sentences
# In this first block every sentence has 8 tokens
# The output shows the labels in mixed order, which reduces bias in the later steps
next(iter(all_batche))


(<tf.Tensor: shape=(32, 3, 8), dtype=int32, numpy=
 array([[[  101, 10514,  2361,  2283,  3902,   999,  1029,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  2851,  4299,  2071,  3637,  2070,  2062,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  1045,  3335,  2017, 16216,  2319,   999,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  2216, 12665,  2033,  1037,  2210,   999,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],
         [    0,     0,     0,     0,     0,     0,     0,     0]],
 
        [[  101,  1045,  6639,  2007,  2008,  3720,  2205,   102],
         [    1,     1,     1,     1,     1,     1,     1,     1],

In [None]:
# Create the training and test datasets
NB_BATCHES = math.ceil(len(data_all)/BATCH_SIZE)
# 10% of the batches are held out for testing
NB_BATCHES_TEST = NB_BATCHES // 10
# Shuffle the batches so that the split is not biased by sentence length.
# Dataset.shuffle returns a new dataset, so its result must be kept.
# reshuffle_each_iteration=False (with a fixed seed) keeps the same order on
# every iteration; otherwise take()/skip() would select different batches each
# time and the train and test sets would overlap.
shuffled_batches = all_batche.shuffle(NB_BATCHES, seed=42, reshuffle_each_iteration=False)
# First 10% of the shuffled batches: test
test_dataset = shuffled_batches.take(NB_BATCHES_TEST)
# Remaining 90%: training
train_dataset = shuffled_batches.skip(NB_BATCHES_TEST)

In [None]:
# Example sentence, tokenized and wrapped between "[CLS]" and "[SEP]"
my_sent = encode_sentence("Roses are red.")

In [None]:
# Show the tokens of the example sentence
print(my_sent)

['[CLS]', 'roses', 'are', 'red', '.', '[SEP]']


In [None]:
# Segment ids of the example sentence, cast to float32 and with a new leading axis
tf.expand_dims(tf.cast(get_segments(my_sent), tf.float32),0)

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=array([[0., 0., 0., 0., 0., 0.]], dtype=float32)>

In [None]:
# Run the BERT layer on the example sentence (ids, mask and segments, each with a new leading axis)
# The first output is the information used for classification
# The second output is the per-word (token-level) representation
bert_layer([tf.expand_dims(tf.cast(get_ids(my_sent), tf.int32), 0),
            tf.expand_dims(tf.cast(get_mask(my_sent), tf.int32), 0),
            tf.expand_dims(tf.cast(get_segments(my_sent), tf.int32),0)])

[<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[-9.27935421e-01, -4.10335243e-01, -9.65754986e-01,
          9.07317698e-01,  8.12913716e-01, -1.74174413e-01,
          9.11234379e-01,  3.41952085e-01, -8.74521196e-01,
         -9.99989390e-01, -7.78409779e-01,  9.69385147e-01,
          9.86160517e-01,  6.36963248e-01,  9.48631287e-01,
         -7.51192927e-01, -4.58339483e-01, -7.08104432e-01,
          4.62098330e-01, -6.57926798e-01,  7.60414362e-01,
          9.99994695e-01, -3.96861076e-01,  3.44166100e-01,
          6.16488576e-01,  9.94400024e-01, -7.76633620e-01,
          9.38316405e-01,  9.59452212e-01,  7.32879162e-01,
         -6.93436623e-01,  2.93080419e-01, -9.93785441e-01,
         -1.64551854e-01, -9.67019558e-01, -9.95549619e-01,
          5.32935262e-01, -6.88060999e-01,  1.34716183e-02,
          2.98195966e-02, -9.18356478e-01,  4.20526266e-01,
          9.99988914e-01,  2.52676159e-01,  6.06235325e-01,
         -3.50750089e-01, -1.00000000e+00,  4.975

# **MODEL**

---



In [None]:
class DCNNBERTEmbedding(tf.keras.Model):
  """CNN text classifier built on top of a non-trainable BERT layer.

  The inputs go through the BERT layer; its second (per-token) output feeds
  three parallel 1-D convolutions with window sizes 2, 3 and 4, each followed
  by global max pooling. The pooled features are concatenated and passed
  through a dense layer, dropout and a final classification layer.

  Args:
    nb_filters: number of filters of each convolution.
    FFN_units: number of units of the hidden dense layer.
    nb_classes: number of classes. 2 gives a single sigmoid output; any other
      value gives a softmax over `nb_classes` units.
    dropout_rate: rate of the dropout applied after the hidden dense layer.
    name: name of the model.
  """

  def __init__(self,  nb_filters = 50, FFN_units = 512, nb_classes = 2, dropout_rate = 0.1, name = "dcnn"):
    super(DCNNBERTEmbedding, self).__init__(name = name)

    # BERT layer from TF Hub, loaded with trainable=False
    self.bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=False)

    # One-dimensional convolutions over the token axis, without padding ('valid')
    # Windows of two consecutive tokens
    self.bigram = layers.Conv1D(filters=nb_filters, kernel_size = 2, padding='valid', activation='relu')
    # Windows of three consecutive tokens
    self.trigram = layers.Conv1D(filters=nb_filters, kernel_size = 3, padding='valid', activation='relu')
    # Windows of four consecutive tokens
    self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size = 4, padding='valid', activation='relu')

    # Keeps the maximum of each filter over all positions (shared by the three branches)
    self.pool = layers.GlobalMaxPool1D()

    self.dense_1 = layers.Dense(FFN_units, activation = 'relu')
    self.dropout = layers.Dropout(rate = dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1, activation = "sigmoid")
    else:
      # One unit per class: a softmax over a single unit would always output 1
      self.last_dense = layers.Dense(units=nb_classes, activation = "softmax")

  def embed_with_bert(self, all_tokens):
    """Return the per-token output of the BERT layer.

    Args:
      all_tokens: tensor indexed as [batch, row, token], where rows 0, 1 and 2
        are passed to BERT as its first, second and third input.

    Returns:
      The second output of the BERT layer (the first one is discarded).
    """
    _, embs = self.bert_layer([all_tokens[:, 0, :],
                               all_tokens[:, 1, :],
                               all_tokens[:, 2, :]])
    return embs

  def call(self, inputs, training = None):
    """Compute the model output for a batch of inputs.

    Args:
      inputs: batch in the layout expected by `embed_with_bert`.
      training: forwarded to the dropout layer.

    Returns:
      The output of the final dense layer.
    """
    X = self.embed_with_bert(inputs)
    X_1 = self.bigram(X)
    X_1 = self.pool(X_1)
    X_2 = self.trigram(X)
    X_2 = self.pool(X_2)
    X_3 = self.fourgram(X)
    X_3 = self.pool(X_3)

    # Join the pooled features of the three branches along the feature axis
    merged = tf.concat([X_1, X_2, X_3], axis = -1)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training = training)
    output = self.last_dense(merged)

    return output

# **Entrenamiento**

---



In [None]:
# Number of filters of each convolution
NB_FILTERS = 100
# Number of units of the hidden dense layer
FFN_UNITS = 256
# Number of classes (2 selects the binary setup)
NB_CLASSES = 2

# Dropout rate
DROPOUT_RATE = 0.2

# Number of training epochs
NB_EPOCHS = 5

In [None]:
# Build the model with the hyperparameters defined above
model = DCNNBERTEmbedding(
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [None]:
# Binary setup uses binary cross-entropy; otherwise sparse categorical cross-entropy
loss_name = 'binary_crossentropy' if NB_CLASSES == 2 else 'sparse_categorical_crossentropy'
model.compile(loss=loss_name, optimizer='adam', metrics=['accuracy'])

In [None]:
# Folder where the checkpoints are written; only the most recent one is kept
checkpoin_path = "/content/drive/MyDrive/IA/BERT/checkpoin2/"
ckpt = tf.train.Checkpoint(Dccn=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoin_path, max_to_keep=1)

# Restore the latest checkpoint when there is one
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
  ckpt.restore(latest_checkpoint)
  print('Ultimo checkpoint restaurado!!')

In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
  """Keras callback that saves a checkpoint at the end of every epoch."""

  def on_epoch_end(self, epoch, logs=None):
    """Save through the module-level `ckpt_manager` and print the folder used."""
    ckpt_manager.save()
    message = f'Checkpoint guardado en {checkpoin_path}'
    print(message)

In [None]:
# Train on the training batches; the callback saves a checkpoint after every epoch
model.fit(train_dataset, epochs=NB_EPOCHS, callbacks=[MyCustomCallback()])

Epoch 1/5


KeyboardInterrupt: ignored

# **Evaluación**

---



In [None]:
# Evaluate the model on the test batches and print what model.evaluate returns
results = model.evaluate(test_dataset)
print(results)

In [None]:
def get_prediction(sentence):
  """Print the model output and the predicted sentiment for `sentence`.

  The sentence is tokenized with `encode_sentence` and converted to the same
  [ids, mask, segments] layout used to build the training data, with a leading
  batch axis. Assumes the binary setup, where the model returns one value.

  Args:
    sentence: text (str) to classify.

  Returns:
    None; the result is printed.
  """
  tokens = encode_sentence(sentence)
  # The model slices its input as [batch, row, token] with rows ids, mask and
  # segments, so the raw string tokens cannot be passed directly.
  features = tf.stack([tf.cast(get_ids(tokens), tf.int32),
                       tf.cast(get_mask(tokens), tf.int32),
                       tf.cast(get_segments(tokens), tf.int32)], axis = 0)
  # Add the batch dimension
  inputs = tf.expand_dims(features, 0)
  output = model(inputs, training = False)
  # Threshold at 0.5. A plain comparison also covers an output of exactly 1.0,
  # for which floor(output * 2) would be 2 and nothing would be printed.
  probability = float(tf.squeeze(output))
  if probability < 0.5:
    print(f"Salida del modelo: {output}\n Sentimiento predicho es negativo")
  else:
    print(f"Salida del modelo: {output}\n Sentimiento predicho es positivo")

In [None]:
# Example: predict the sentiment of a sample review
get_prediction("Crap, crap and totally crap. Did I mention this film was totally crap? Well, it's totally crap")