# Proyecto final

In [None]:
#Autores: Daniel Castillo, Karla Salas con ayuda del profesor Mijangos
from os import listdir
from os.path import isfile, join
#Para ver las palabras
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.model_selection import train_test_split #particiones
from nltk.corpus import stopwords #Listas de stopwords
from nltk.tokenize import word_tokenize, sent_tokenize #Tokens
import re #regex
from itertools import chain #bigramas
import numpy as np
from operator import itemgetter
import pickle # Guardar objetos
from tqdm import tqdm #Medir el progreso del entrenamiento

Clase Bengio: para la red neuronal

In [None]:
class Bengio:
    '''
    Bigram neural language model following the architecture of Bengio et al.

    The network is: embedding lookup (C) -> tanh hidden layer (U, b) ->
    softmax output layer (W, c). Given one context word it produces a
    probability distribution over the whole vocabulary.

    Args:
        bigrams (list): Training pairs ``(context, target)``. Each element
            may be a vocabulary index or a word present in ``voc``.
        voc (dict): Maps every word to its integer index.
        dim (int): Size of the embedding vectors (rows of ``C``).
        nn_hdim (int): Number of units in the hidden tanh layer.
    '''
    def __init__(self, bigrams, voc, dim, nn_hdim):
        # Fixed seed so that the random initialisation is reproducible.
        np.random.seed(0)
        self.bigrams = bigrams
        self.voc = voc
        # Embedding size
        self.dim = dim
        # Hidden layer size
        self.nn_hdim = nn_hdim
        N = len(voc)
        # Embedding matrix (dim x N); column i is the vector of word i.
        self.C = np.random.randn(dim, N) / np.sqrt(N)
        # Embedding-to-hidden weights (nn_hdim x dim)
        self.U = np.random.randn(nn_hdim, dim) / np.sqrt(dim)
        self.b = np.zeros((1, self.nn_hdim))  # hidden bias
        # Hidden-to-output weights (N x nn_hdim)
        self.W = np.random.randn(N, nn_hdim) / np.sqrt(nn_hdim)
        self.c = np.zeros((1, N))  # output bias

    def _to_index(self, token):
        '''
        Return the vocabulary index of ``token``.

        Keys of ``voc`` are resolved through the dictionary. An integer that
        is not a key is assumed to already be an index and returned as-is.

        Raises:
            KeyError: if ``token`` is neither a key of ``voc`` nor an integer.
        '''
        try:
            return self.voc[token]
        except KeyError:
            if isinstance(token, (int, np.integer)):
                return token
            raise

    def _forward_index(self, idx):
        '''
        Forward pass for a context word given by its vocabulary index.

        Returns:
            tuple: (probabilities over the vocabulary, hidden activations).
        '''
        # Embedding lookup: x <- C(w)
        x = self.C.T[idx]
        # Hidden layer: a <- tanh(Ux + b)
        a = np.tanh(np.dot(self.U, x) + self.b)[0]
        # Output scores: Wa + c
        scores = (np.dot(self.W, a) + self.c)[0]
        # Softmax; subtracting the maximum leaves the result unchanged
        # mathematically but prevents overflow inside exp().
        out = np.exp(scores - scores.max())
        self.p = out/out.sum(0)
        return self.p, a

    def train(self, its, eta):
        '''
        Train the network with per-example stochastic gradient descent.

        Args:
            its (int): Number of passes over ``self.bigrams``.
            eta (float): Learning rate.
        '''
        for _ in tqdm(range(its)):
            for ex in self.bigrams:
                ctx = self._to_index(ex[0])
                tgt = self._to_index(ex[1])
                # Forward
                f, a = self._forward_index(ctx)
                # Backward
                # Output error: softmax output minus the one-hot target.
                # Copy so that self.p is not modified through the alias.
                d_out = f.copy()
                d_out[tgt] -= 1
                # Error at the hidden layer (uses W before it is updated)
                d_tanh = (1-a**2)*np.dot(self.W.T, d_out)
                # Error at the embedding (uses U before it is updated)
                d_emb = np.dot(self.U.T, d_tanh)
                # Output layer update
                self.W -= eta*np.outer(d_out, a)
                self.c -= eta*d_out
                # Hidden layer update
                self.U -= eta*np.outer(d_tanh, self.C.T[ctx])
                self.b -= eta*d_tanh
                # Embedding update (C.T is a view, so C is modified in place)
                self.C.T[ctx] -= eta*d_emb

    def forward(self, x):
        '''
        Forward pass of the network; used for training and evaluation.

        Args:
            x (str): Context word. It must be a key of ``voc``; an integer
                vocabulary index is accepted as well.

        Returns:
            tuple: (probabilities of every word following ``x``,
            hidden layer activations).

        Raises:
            KeyError: if ``x`` is not in the vocabulary and is not an integer.
        '''
        return self._forward_index(self._to_index(x))

    def plot_words(self, ids):
        '''
        Plot the embeddings projected to two dimensions with PCA.

        The last two rows of the embedding table are left out of the plot.

        Args:
            ids (iterable): Labels used to annotate the points, in
                vocabulary index order.
        '''
        Z = PCA(2).fit_transform(self.C.T[:-2])
        plt.figure(figsize=(10,6))
        plt.scatter(Z[:,0],Z[:,1], marker='.')
        for label,x,y in zip(ids, Z[:,0], Z[:,1]):
            plt.annotate(label, xy=(x,y), xytext=(-1,1),
                         textcoords='offset points',
                         ha='center', va='bottom')
        plt.show()

    def prob_sentence(self, sentence):
        '''
        Return the probability the model assigns to a sentence.

        The probability is the product of the bigram transition
        probabilities. Words missing from the vocabulary are replaced by
        the ``'<oov>'`` token, both as context and as target.

        Args:
            sentence (list): Words of the sentence.

        Returns:
            The product of the transition probabilities (1 when the
            sentence has fewer than two words).

        Raises:
            KeyError: if an unknown word is found and ``'<oov>'`` is not in
                the vocabulary.
        '''
        oov = '<oov>'
        p = 1
        for gram1, gram2 in zip(sentence, sentence[1:]):
            context = gram1 if gram1 in self.voc else oov
            target = gram2 if gram2 in self.voc else oov
            # Transition probabilities out of the context word
            probs = self.forward(context)[0]
            # The distribution is indexed by vocabulary index, not by word
            p *= probs[self.voc[target]]

        return p

    def get_entropy(self, test_data):
        '''
        Return the average per-sentence cross entropy (in bits).

        Sentences whose probability evaluates to 0 (for instance because
        the product underflowed) and empty sentences contribute nothing to
        the sum but still count in the average.

        Args:
            test_data (list): Tokenized test sentences.

        Raises:
            ZeroDivisionError: if ``test_data`` is empty.
        '''
        H = 0.0
        for sentence in tqdm(test_data):
            # Sentence length
            M = len(sentence)
            if M == 0:
                # Nothing to score; also avoids dividing by zero below.
                continue
            # Sentence probability
            p_cad = self.prob_sentence(sentence)
            # Length-normalised negative log2 probability
            if p_cad != 0:
                H -= (1./M)*np.log2(p_cad)

        return H/len(test_data)

    def test(self, test):
        '''
        Evaluate the model on a test set.

        Args:
            test (list): Tokenized test sentences.

        Returns:
            tuple: (entropy, perplexity) where perplexity is 2**entropy.
        '''
        entropy = self.get_entropy(test)
        perplexity = 2**entropy
        return entropy, perplexity

    def save_embedings(self, path):
        '''
        Pickle a ``{word: embedding}`` dictionary to a file.

        Args:
            path (str): Destination file.
        '''
        embedings = {word: self.C.T[idx] for word, idx in self.voc.items()}

        with open(path, 'wb') as f:
            pickle.dump(embedings, f)

    def save_model(self, path):
        '''
        Pickle the whole trained model to a file.

        Args:
            path (str): Destination file.
        '''
        with open(path, 'wb') as f:
            pickle.dump(self, f)

Obtenemos modelos

In [None]:
# Load the previously trained models from disk.
# NOTE: pickle can execute arbitrary code while loading; only open files
# that were produced by this project.
with open('./modelAll/model.pkl', 'rb') as model_file:
    bengio_full = pickle.load(model_file)
with open('./model10%/model.pkl', 'rb') as model_file:
    bengio_10 = pickle.load(model_file)

Funciones para generar palabras y cadenas

In [None]:
def get_ordered_probs(word):
    """
    Return the next-word candidates for ``word``, most probable first.

    Args:
        word (str): Context word; must be in the vocabulary of ``bengio_full``.

    Returns:
        list: ``(word, probability)`` tuples sorted by descending
        probability, with the ``'<oov>'`` entry removed.
    """
    probs, _ = bengio_full.forward(word)
    # Pair every word with the probability stored at its own index instead
    # of relying on the dictionary iteration order matching the index order.
    dic_probs = {w: probs[idx] for w, idx in bengio_full.voc.items()}
    dic_probs.pop('<oov>')
    return sorted(dic_probs.items(), key=itemgetter(1), reverse=True)

#Función que genera una palabra siguiente
def next_word(string):
    """
    Pick a word to follow the given history.

    The choice is uniform among the (up to) four most probable successors
    of the last word in ``string``.

    Args:
        string (str): Whitespace-separated history; must contain at least
            one word.

    Returns:
        str: The selected next word.

    Raises:
        IndexError: if ``string`` contains no words.
    """
    # Last word of the history
    last_w = string.split()[-1]
    ranked = get_ordered_probs(last_w)
    # Never sample past the end of the candidate list (small vocabularies).
    top_k = min(4, len(ranked))
    selection = np.random.choice(range(top_k), 1, p=None)[0]

    return ranked[selection][0]

# Extra tokens filtered out in addition to NLTK's English stopwords.
more = [
    "_", "-", "'ve", "'ll", "'t", "'s", "'re", "'", "'m", "'d", "n't",
    "oh", "hey", "yeah", "okay", "mr.", "miss", "mrs.",
]
stopwords_list = [*stopwords.words('english'), *more]

def get_query_clean(text):
    '''
    Tokenize a string and clean its tokens.

    Each token is lower-cased, stripped of every character other than
    a-z, 0-9 and whitespace, and dropped when the result is empty or is in
    ``stopwords_list``. A cleaned token equal to "na" is glued to the
    previous kept token (so "gon" + "na" becomes "gonna").

    Args:
        text (str): Input string.

    Returns:
        list: Cleaned tokens.
    '''
    # NOTE(review): the stopword test runs after punctuation is stripped, so
    # entries such as "n't" or "mr." in stopwords_list can never match the
    # cleaned token — confirm whether that is intended.
    pattern = re.compile(r'[^a-z0-9\s]')
    clean = []
    for token in word_tokenize(text):
        w = pattern.sub('', token.lower())
        if w == '' or w in stopwords_list:
            continue
        if w == "na" and clean:
            # Join "gon na", "wan na", etc.
            clean[-1] += w
        else:
            # Also covers a leading "na", which has nothing to attach to.
            clean.append(w)

    return clean

#Función que genera cadena
def generate(string, max_words=10):
    """
    Generate text that continues the given prompt.

    The prompt is cleaned with ``get_query_clean`` and then extended one
    word at a time with ``next_word`` until ``'<EOS>'`` is predicted or
    ``max_words`` words have been added.

    Args:
        string (str): Prompt text.
        max_words (int): Maximum number of words to append.

    Returns:
        str: Cleaned prompt followed by the generated words, without the
        ``'<EOS>'`` marker and with every ``'<BOS>'`` replaced by ``'. '``.

    Raises:
        ValueError: if the prompt has no usable words after cleaning.
    """
    # Clean the input
    string = ' '.join(get_query_clean(string))
    if not string:
        raise ValueError('the prompt contains no usable words after cleaning')

    # Text generated so far
    str_gen = string
    for _ in range(max_words):
        # Predict the next word
        w = next_word(str_gen)
        if w == '<EOS>':
            # End of sentence: stop without adding the marker to the output.
            break
        str_gen += ' ' + w

    return str_gen.replace('<BOS>', '. ')

A probar

In [None]:
# Try the generator with a sample prompt.
generate('Stephen Strange')

Ahora entrenaremos una red neuronal recurrente con TensorFlow

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras import layers

AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
def get_txt(path):
    """
    Return the lower-cased contents of every file in a directory as one string.

    Only regular files directly inside ``path`` are read (sub-directories
    are skipped). Files are concatenated in sorted name order so that the
    result does not depend on the order in which the OS lists them.

    Args:
        path (str): Directory to read.

    Returns:
        str: Concatenated, lower-cased text; empty when there are no files.

    Raises:
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    filenames = sorted(f for f in listdir(path) if isfile(join(path, f)))
    parts = []
    for filename in filenames:
        # Read bytes and decode explicitly so the platform default encoding
        # and newline translation never affect the text.
        with open(join(path, filename), 'rb') as f:
            parts.append(f.read().decode('utf-8').lower())
    return ''.join(parts)

# Guardamos cada película en un diccionario
# cada entrada del diccionario es una lista con las peliculas leídas
# Build the corpus as one string by concatenating the text of each folder.
corpus = ''.join(
    get_txt(folder)
    for folder in (
        "../corpus/Pride & Prejudice",
        "../corpus/Marvel",
        "../corpus/Christopher Nolan",
    )
)

In [None]:
# Load the pre-tokenized corpus (a list of token lists) and flatten it into
# one space-separated string.
# NOTE: pickle can execute arbitrary code while loading; only open files
# that were produced by this project.
with open('./corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
corpus = ' '.join(x for s in corpus for x in s)
corpus

In [None]:
# Tokenize the whole corpus with tensorflow_text's UnicodeScriptTokenizer.
tokenizer = tf_text.UnicodeScriptTokenizer()
# The corpus is passed as a one-element batch, so [0] selects its token list.
movies_tokens =  tokenizer.tokenize([corpus]).to_list()[0]
# Show the first ten tokens.
movies_tokens[:10]

In [None]:
# Dataset whose elements are the individual corpus tokens.
words_ds = tf.data.Dataset.from_tensor_slices(movies_tokens)

In [None]:
# Print the first 20 tokens of the dataset.
for words in words_ds.take(20):
    print(words.numpy())

In [None]:
# Number of tokens in each model input sequence.
seq_length = 50
# Group tokens into chunks of seq_length + 1: the extra token lets the
# input and the target (shifted by one) both have seq_length elements.
words_batches = words_ds.batch(seq_length+1, 
                               drop_remainder=True)

# Show the first chunk.
for words in words_batches.take(1):
    print(words.numpy())

In [None]:
def join_strings(tokens):
    """Join a 1-D tensor of string tokens into one space-separated string."""
    return tf.strings.reduce_join(inputs=tokens, separator=' ', axis=0)

In [None]:
# Turn every token chunk back into a single string.
raw_train_ds = words_batches.map(join_strings)
batch_size = 32
# Shuffle buffer as large as the dataset.
BUFFER_SIZE = len(raw_train_ds)

# Shuffle, group into fixed-size batches and prefetch.
raw_train_ds = raw_train_ds.shuffle(BUFFER_SIZE)
raw_train_ds = raw_train_ds.batch(batch_size, drop_remainder=True)
raw_train_ds = raw_train_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Inspect one batch of joined strings.
for batch in raw_train_ds.take(1):
    print(batch)

In [None]:
# Number of distinct tokens in the corpus.
voc_size = len(set(movies_tokens))
print(voc_size)


In [None]:
# NOTE(review): hard-coded value that overrides the voc_size computed from
# movies_tokens — confirm it still matches the current corpus.
voc_size = 11994

# Layer that maps strings to sequences of integer token ids.
# standardize=None disables the layer's text standardization step.
vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=voc_size - 1,
    output_mode='int',
    # seq_length + 1 ids per example so input and target can be sliced out.
    output_sequence_length=seq_length + 1,
    #split='character'
)

# Learn the vocabulary from the training strings.
vectorize_layer.adapt(raw_train_ds)
vocab = vectorize_layer.get_vocabulary()
# Show the size of the learned vocabulary.
len(vocab)

In [None]:
# Quick check: vectorize two sample strings.
vectorize_layer(['Love you', '3 millions'])

In [None]:
def get_input_target(text):
    """
    Vectorize a batch of strings and split it into model input and target.

    Args:
        text: Batch of strings accepted by ``vectorize_layer``.

    Returns:
        tuple: (ids without the last position, ids without the first
        position), i.e. the target is the input shifted by one token.
    """
    token_ids = vectorize_layer(text)
    return token_ids[:, :-1], token_ids[:, 1:]

In [None]:
# Dataset of (input ids, target ids) pairs.
train_ds = raw_train_ds.map(get_input_target)

In [None]:
# Inspect the shapes and the first example of one training batch.
for input_batch, target_batch in train_ds.take(1):
    print(input_batch.shape, target_batch.shape)
    print(input_batch[0], target_batch[0])

Definir modelo

In [None]:
# Size of the token embedding vectors.
emb_dim = 256
# Number of units in the GRU layer.
model_dim = 1024

In [None]:
class RNN(tf.keras.Model):
    """
    Token-level language model: Embedding -> GRU -> Dense logits.

    Args:
        voc_size (int): Number of rows of the embedding table and number of
            output logits per position.
        emb_dim (int): Size of the embedding vectors.
        model_dim (int): Number of GRU units.
    """
    def __init__(self, voc_size, emb_dim, model_dim):
        # Model.__init__ takes no positional arguments; the instance must
        # not be passed explicitly.
        super().__init__()
        self.embedding = layers.Embedding(voc_size, emb_dim)
        self.gru = layers.GRU(model_dim,
                              return_sequences=True,
                              return_state=True)
        self.logits = layers.Dense(voc_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """
        Compute the logits for a batch of token id sequences.

        Args:
            inputs: Batch of token id sequences.
            states: Initial GRU state; when None the GRU's own initial
                state is used.
            return_state (bool): Also return the final GRU state.
            training (bool): Forwarded to the layers.

        Returns:
            The logits, or ``(logits, states)`` when ``return_state`` is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.logits(x, training=training)

        if return_state:
            return x, states
        else:
            return x 

# Instantiate the model with the sizes defined in the previous cells.
model = RNN(voc_size=voc_size,
            emb_dim=emb_dim,
            model_dim=model_dim)

In [None]:
# Sanity check: run one batch through the untrained model.
# The model must be fed the inputs (not the shifted targets) so that the
# predictions can be compared against the text it actually saw.
for input_batch, target_batch in train_ds.take(1):
    predictions = model(input_batch)
    print(predictions.shape, target_batch.shape)

In [None]:
# Print the model's layer summary.
model.summary()

In [None]:
# Shape of the logits for the first sequence of the batch.
predictions[0].shape

In [None]:
# Sample one token id per row of logits of the first sequence.
pred_indices = tf.random.categorical(predictions[0], num_samples=1)
# Drop the num_samples axis to get a flat vector of sampled ids.
pred_indices[:, 0]

Obtener las palabras a través de sus índices en vocab

In [None]:
# Decode the first input sequence back into words.
' '.join(vocab[token_id] for token_id in input_batch[0])

In [None]:
# Decode the sampled ids back into words.
' '.join(vocab[token_id] for token_id in pred_indices[:, 0])

# Entrenamiento

In [None]:
# The model outputs raw logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
# Running mean of the loss values fed to it.
loss_metric = tf.keras.metrics.Mean(name='loss')

In [None]:
@tf.function
def train_step(input_batch, target_batch):
    """
    Run one optimization step on a batch and record its loss.

    Args:
        input_batch: Batch of input token ids.
        target_batch: Batch of target token ids.
    """
    with tf.GradientTape() as tape:
        loss_value = loss(target_batch, model(input_batch, training=True))

    weights = model.trainable_weights
    grads = tape.gradient(loss_value, weights)
    opt.apply_gradients(zip(grads, weights))
    # Accumulate the batch loss into the running mean.
    loss_metric.update_state(loss_value)

In [None]:
# Number of passes over the training dataset.
epochs = 1

In [None]:
# Train for the configured number of epochs, reporting the mean loss of each.
for epoch in range(epochs):
    for input_batch, target_batch in train_ds:
        train_step(input_batch, target_batch)

    print(f'Epoch: {epoch} Loss: {loss_metric.result().numpy()}')
    # reset_state() replaces the deprecated reset_states(); it is available
    # in every TensorFlow release that ships layers.TextVectorization.
    loss_metric.reset_state()

# Guardamos el modelo

In [None]:
# Save in the TensorFlow SavedModel format: HDF5 (.h5) cannot store a
# subclassed model, and this directory is the path that load_model reads.
model.save('./modelTensor/model_intent1', save_format='tf')

In [None]:
# Reload the trained model from disk; the path must match the one used when saving.
model2 = tf.keras.models.load_model('./modelTensor/model_intent1')

# Generación

In [None]:
states = None
start = 'tony stark'
context = tf.constant([start])
output = [start]
# vectorize_layer pads every string to a fixed length, so only the leading
# ids are real tokens. Feed every word of the prompt on the first step (it
# is split on whitespace) and one generated word on each later step.
n_tokens = max(1, len(start.split()))

for i in range(50):
    pred_logits, states = model(vectorize_layer(context)[:, :n_tokens],
                                states=states, return_state=True)
    # Sample the next token id from the logits of the last position.
    pred_index = tf.random.categorical(pred_logits[:, -1, :],
                                       num_samples=1)

    word = vocab[pred_index[0, 0]]
    # The sampled word becomes the context of the next step.
    context = tf.constant([word])
    output.append(word)
    n_tokens = 1

' '.join(output)