[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MaxMitre/Aplicaciones-Financieras/blob/main/Semana11/Work2vec_BERT.ipynb)

# Introducción:

## Comandos auxiliares

- Para instalación normal

  - !pip install [nombre_paquete]

- Actualizar

  - !pip install [nombre_paquete] --upgrade

- Forzar el reinstalado
  - !pip install --force-reinstall [nombre_paquete]

- Instalar utilizando APT
  - !apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2



In [None]:
# Optional (disabled): force-reinstall TensorFlow. The "==2.8" on the next line
# looks like a version suffix meant to be appended to the package name -- TODO confirm.
# !pip install --force-reinstall tensorflow
# ==2.8

In [None]:
!pip install tensorflow-text

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
# Install the pinned cuDNN build (8.1.0.77 for CUDA 11.2) through apt;
# --allow-change-held-packages lets apt replace a version that is marked as held.
!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2

# Dependencias

In [None]:
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

from tensorflow.keras import layers
import bert

import re

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset can be read from it below.
from google.colab import drive
drive.mount('/content/drive')

# Carga y exploración de datos

In [None]:
# Read the job-postings CSV from the mounted Drive folder.
csv_path = '/content/drive/MyDrive/Cruso-ApsFinancieras/semana11/DataAnalyst.csv'
reviews = pd.read_csv(csv_path, encoding='utf-8')

In [None]:
# Display the DataFrame as loaded from the CSV.
reviews

# Limpieza de datos

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes `reviews`
# an independent DataFrame, so the column assignments performed in later cells
# do not trigger pandas' SettingWithCopyWarning.
reviews = reviews[['Job Title', 'Job Description', 'Salary Estimate']].copy()

In [None]:
# Turn multi-line descriptions into a single line. The vectorised .str accessor
# does a literal (non-regex) replacement and leaves missing values as NaN
# instead of raising on non-string entries.
reviews['Job Description'] = reviews['Job Description'].str.replace('\n', ', ', regex=False)

In [None]:
# Display the DataFrame after column selection and newline replacement.
reviews

# Obtención del valor numérico de salario (quasi-promedio)

In [None]:
# Count how often each distinct 'Salary Estimate' string occurs.
reviews['Salary Estimate'].value_counts()

Función auxiliar para limpieza del texto

In [None]:
def get_salaries(sentence):
    """Extract the numeric parts of a salary-range string.

    Every character other than a digit or a dash is removed, and the
    remainder is split on dashes.

    Args:
        sentence: Text containing a range such as "$37K-$66K (Glassdoor est.)".

    Returns:
        A list of digit strings, one per dash-separated piece, e.g.
        ['37', '66']. A piece is the empty string when nothing precedes or
        follows a dash (e.g. "-1" gives ['', '1']).
    """
    # Raw string: the original '\-' in a normal string literal is an invalid
    # escape sequence. Inside a character class a trailing '-' is literal.
    digits_and_dashes = re.sub(r'[^0-9-]', '', sentence)
    return digits_and_dashes.split('-')

In [None]:
# Ejemplo de uso de la función
cadena = get_salaries(reviews.iloc[-1,2])
cadena

Cálculo de los salarios

In [None]:
# Lower bound of the range: first piece returned by get_salaries, as an int.
reviews['minSalary'] = reviews['Salary Estimate'].map(
    lambda estimate: int(get_salaries(estimate)[0])
)

In [None]:
# Upper bound of the range: second piece returned by get_salaries, as an int.
reviews['maxSalary'] = reviews['Salary Estimate'].map(
    lambda estimate: int(get_salaries(estimate)[1])
)

In [None]:
# Display the DataFrame with the new minSalary and maxSalary columns.
reviews

In [None]:
# Midpoint of the salary range plus Gaussian jitter (mean 0, std 2), rounded to
# a whole number. A seeded generator keeps the regression targets identical
# across "Restart & Run All" executions.
rng = np.random.default_rng(42)
reviews['meanSalary'] = np.round(
    (reviews['minSalary'] + reviews['maxSalary']) / 2
    + rng.normal(0, 2, len(reviews)),
    decimals=0,
)

Datos listos

In [None]:
# Display the DataFrame including the meanSalary column.
reviews

# Datos de entrenamiento y prueba

In [None]:
# Model input: the job-description text of every posting.
X = reviews['Job Description']

In [None]:
# Display the feature Series.
X

In [None]:
# Regression target: the jittered midpoint salary computed above.
y = reviews['meanSalary']

In [None]:
# Hold out 20% of the rows for testing, with a fixed random_state for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Convertir en elementos apropiados para alimentar el modelo

In [None]:
# Display the training descriptions.
X_train

In [None]:
# Convert the training descriptions into a TensorFlow tensor via a Python list.
X_train_tensor = tf.convert_to_tensor(X_train.tolist())

In [None]:
# Display the test descriptions.
X_test

In [None]:
# Convert the test descriptions into a TensorFlow tensor via a Python list.
X_test_tensor = tf.convert_to_tensor(X_test.tolist())

# Definición del modelo

La celda completamente comentada (CELDA 1) se utiliza en lugar de CELDA 2 cuando queramos utilizar una longitud de secuencia de 128 tokens.

En CELDA 2 estaremos utilizando 512 tokens.

Más información sobre el preprocesador de textos precargado [aquí](https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3)

Más información sobre el modo en que BERT hace el encaje (embedding) [aquí](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4)

In [None]:
#     CELDA 1

## Documentation for the preprocessing and embedding models:
## https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4 

#text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='first')
#preprocessor = hub.KerasLayer(
#    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
#encoder_inputs = preprocessor(text_input) # holds 'input_type_ids', 'input_mask', 'input_word_ids'

#encoder = hub.KerasLayer(
#    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
#    trainable=True)
#outputs = encoder(encoder_inputs)
#pooled_output = outputs["pooled_output"]      # [batch_size, 768].
#sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].   seq_length is 128 by default

In [None]:
#     CELDA 2

preprocessor = hub.load(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

text_inputs = [tf.keras.layers.Input((), dtype=tf.string)]
tokenize = hub.KerasLayer(preprocessor.tokenize)
tokenized_inputs = [tokenize(segment) for segment in text_inputs]

seq_length = 512  # Your choice here.
bert_pack_inputs = hub.KerasLayer(
    preprocessor.bert_pack_inputs,
    arguments=dict(seq_length=seq_length))  # Optional argument.
encoder_inputs = bert_pack_inputs(tokenized_inputs)

encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=False)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, 512, 768].

## Paréntesis cultural: ejemplo de cómo funciona el tokenizador


In [None]:
# Model that stops after preprocessing: raw text in, packed BERT inputs out.
tokens_example = tf.keras.Model(inputs=text_inputs, outputs=encoder_inputs)

In [None]:
# Display the raw text of the training example at position 61.
X_train.iloc[61]

In [None]:
# Token ids of the same example (position 61). Only that one row is passed
# through the preprocessing model instead of tokenizing the whole training set
# just to display a single result.
tokens_example(X_train_tensor[61:62])['input_word_ids'][0]

## Encaje (Embedding) de BERT que utilizaremos

Observen el siguiente modelo, entender su salida será muy importante

In [None]:
# Embedding model: raw text in, BERT sequence_output out.
encaje = tf.keras.Model(inputs=text_inputs, outputs=sequence_output)

In [None]:
# Embed the training texts in mini-batches. Calling the model directly on the
# full tensor runs BERT over every example in a single forward pass, which
# needs far more memory than a batched predict(). The result is a NumPy array.
X_train_encajado = encaje.predict(X_train_tensor, batch_size=32)

## Fin del paréntesis cultural

## Estructuración del modelo

In [None]:
# Input of the regression head: one (512, 768) embedding matrix per sample.
# NOTE: this name shadows Python's built-in input(); it is kept because the
# following cells refer to it.
input = tf.keras.Input(shape=(512, 768), name="inputs")

In [None]:
# 1-D convolution over the token axis: 64 filters, kernel width 4.
first_conv = tf.keras.layers.Conv1D(
    filters=64,
    kernel_size=4,
    input_shape=(512, 768),
)
first_conv_out = first_conv(input)

In [None]:
# Disabled experiment: an additional Conv1D stacked on top of first_conv_out.
#other_conv = tf.keras.layers.Conv1D(64, 4)(first_conv_out)

In [None]:
# Disabled experiment: a further Conv1D stacked on top of other_conv.
#another_conv = tf.keras.layers.Conv1D(64, 4)(other_conv)

In [None]:
# Max-pool over the entire temporal axis of the convolution output, giving one
# value per filter. MaxPooling1D's positional arguments are (pool_size,
# strides), so the previous MaxPooling1D(64, 509) only pooled the first 64
# positions and ignored the rest of the sequence. The output shape is
# unchanged: (batch, 1, filters).
second_conv = tf.keras.layers.MaxPooling1D(pool_size=first_conv_out.shape[1])
second_conv_out = second_conv(first_conv_out)

In [None]:
# Flatten the pooled output into a single feature vector per sample.
flatting = tf.keras.layers.Flatten()
flatted = flatting(second_conv_out)

In [None]:
# Single linear unit: the model emits one real-valued prediction per sample.
only_dense = tf.keras.layers.Dense(units=1, activation='linear')
final_output = only_dense(flatted)

In [None]:
# Assemble the regression head: embedding matrix in, scalar prediction out.
embedding_model = tf.keras.Model(inputs=input, outputs=final_output)

## Auxiliar visual del modelo

In [None]:
# Render the architecture to model.png with layer names and tensor shapes.
# rankdir uses "TB" (top-to-bottom), the value documented for plot_model,
# rather than the undocumented "TD".
tf.keras.utils.plot_model(
    embedding_model,
    to_file="model.png",
    show_shapes=True,
    show_dtype=False,
    show_layer_names=True,
    rankdir="TB",
    dpi=180,
)

## Instrucciones de compilación

Estamos en el caso de una sola salida. ¿Qué pasaría si tomamos un problema de clasificación multiclase donde las clases sean los rangos salariales?

In [None]:
# Regression setup: minimise mean squared error with Adam and also report
# mean absolute error.
embedding_model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae'],
)

In [None]:
# Print the layer-by-layer summary of the regression head.
embedding_model.summary()

In [None]:
# Train the regression head on the precomputed training embeddings.
# NOTE(review): the names inside the commented-out validation_data argument are
# not defined in the cells visible here -- adapt them before enabling it.
embedding_model.fit(
    X_train_encajado, 
    y_train,
    #validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels), # Providing validation data sometimes helps the model
    epochs=100,
    batch_size=53
)

## Observación de resultados

In [None]:
# Embed the test texts in mini-batches (a direct encaje(...) call would push
# the whole test set through BERT in one forward pass), then predict salaries.
X_test_encajado = encaje.predict(X_test_tensor, batch_size=32)
y_pred = embedding_model.predict(X_test_encajado)

In [None]:
# Predictions on the training embeddings, for comparison with the test predictions.
y_train_pred = embedding_model.predict(X_train_encajado)

## MSE

En datos de entrenamiento

In [None]:
# Training-set MSE. The mean is taken over the training samples themselves; the
# previous version divided by len(y_pred), i.e. the number of test samples,
# which inflated the reported value.
np.mean((np.asarray(y_train) - y_train_pred.ravel()) ** 2)

En datos de prueba

In [None]:
# Test-set MSE: squared residuals, each divided by the number of test
# predictions, then summed.
squared_errors = (np.array(y_test) - y_pred.reshape(1, -1)) ** 2
(squared_errors / len(y_pred)).sum()

## Ejemplos de la salida del modelo

In [None]:
# First ten training targets.
y_train.head(10)

In [None]:
# First ten predictions on the training set.
y_train_pred[:10]

In [None]:
# First ten test targets.
y_test.head(10)

In [None]:
# First ten predictions on the test set.
y_pred[:10]