# Embeddings 

In [11]:
import numpy as np
from annoy import AnnoyIndex


class PreTrainedEmbeddings(object):
    """Pre-trained word vectors backed by an Annoy nearest-neighbour index."""

    def __init__(self, word_to_index, word_vectors):
        """
        Args:
            word_to_index (dict): mapping from word to integers
            word_vectors (list of numpy arrays): vectors positioned so that
                ``word_vectors[i]`` belongs to the word mapped to ``i``
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        # The dimensionality of the index is taken from the first vector.
        embedding_dim = len(word_vectors[0])
        self.index = AnnoyIndex(embedding_dim, metric="euclidean")
        for i in self.word_to_index.values():
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(50)

    @classmethod
    def from_embedding_file(cls, embedding_file):
        """
        Instantiate from pretrained vector file.

        VECTOR FILE SHOULD BE OF THE FORM:
        word0 x0_0 x0_1 x0_2 ... x0_N
        word1 x1_0 x1_1 x1_2 ... x1_N

        Blank lines are skipped, trailing whitespace and repeated spaces
        between values are tolerated, and when a word appears more than once
        only its first vector is kept.

        Args:
            embedding_file (str): location of the file

        Returns:
            PreTrainedEmbeddings instance
        """
        word_to_index = {}
        word_vectors = []
        with open(embedding_file, encoding="utf-8") as fp:
            # Stream the file line by line instead of loading it whole.
            for line in fp:
                line = line.rstrip()
                if not line:
                    continue
                fields = line.split(" ")
                word = fields[0]
                if word in word_to_index:
                    # Re-assigning an existing word would desynchronise the
                    # indices from the positions in word_vectors.
                    continue
                vec = np.array([float(x) for x in fields[1:] if x])

                # The index always equals the position the vector is stored at.
                word_to_index[word] = len(word_vectors)
                word_vectors.append(vec)

        return cls(word_to_index, word_vectors)

    def get_embedding(self, word):
        """Return the vector stored for a word.

        Args:
            word (str)

        Returns:
            the embedding (numpy.ndarray)

        Raises:
            KeyError: if the word is not in ``word_to_index``
        """
        return self.word_vectors[self.word_to_index[word]]

    def get_closest_to_vector(self, vector, n=1):
        """Given a vector, return its n nearest neighbors.

        Args:
            vector (np.ndarray): should match the size of the vectors
                in the Annoy index
            n (int): the number of neighbors to return

        Returns:
            [str, str, ...]: words nearest to the given vector, in the order
                returned by the Annoy index
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def compute_and_print_analogy(self, word1, word2, word3):
        """Print the solutions to an analogy using word embeddings.

        Analogies are: word1 is to word2 as word3 is to _.
        One line is printed per candidate, in the form
        ``word1:word2:word3:word4:``.

        Args:
            word1 (str)
            word2 (str)
            word3 (str)
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # Simple hypothesis: the analogy is a spatial relationship.
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        # Ask for four neighbours because the input words themselves are often
        # among the nearest and are filtered out below.
        closest_words = self.get_closest_to_vector(vec4, n=4)
        existing_words = {word1, word2, word3}
        closest_words = [word for word in closest_words
                         if word not in existing_words]

        if not closest_words:
            print("No se pudo encontrar palabras vecinas para el vector")
            return
        for word4 in closest_words:
            print(f"{word1}:{word2}:{word3}:{word4}:")

In [12]:
# Load the pre-trained vectors from disk (the constructor also builds the
# Annoy index), then probe a first analogy: man is to he as woman is to ?
embeddings=PreTrainedEmbeddings.from_embedding_file("data/wiki_giga_2024_300_MFT20_vectors_seed_2024_alpha_0.75_eta_0.05_combined.txt")
embeddings.compute_and_print_analogy("man","he","woman")

man:he:woman:she:
man:he:woman:her:


In [13]:
# Analogy probe: fly is to plane as sail is to ?
embeddings.compute_and_print_analogy("fly","plane","sail")

fly:plane:sail:ship:
fly:plane:sail:vessel:
fly:plane:sail:boat:


In [14]:
# Analogy probe: cat is to kitten as dog is to ?
embeddings.compute_and_print_analogy("cat","kitten","dog")

cat:kitten:dog:puppy:
cat:kitten:dog:rottweiler:
cat:kitten:dog:hound:


In [15]:
# Analogy probe: blue is to color as dog is to ?
embeddings.compute_and_print_analogy("blue","color","dog")

blue:color:dog:pet:
blue:color:dog:taste:
blue:color:dog:cartoon:
blue:color:dog:introduces:


In [16]:
# Analogy probe: toe is to foot as finger is to ?
embeddings.compute_and_print_analogy("toe","foot","finger")

toe:foot:finger:inside:
toe:foot:finger:turned:
toe:foot:finger:apart:
toe:foot:finger:moving:


In [17]:
# Analogy probe: talk is to communicate as read is to ?
embeddings.compute_and_print_analogy("talk","communicate","read")

talk:communicate:read:instructions:
talk:communicate:read:translated:
talk:communicate:read:accurately:
talk:communicate:read:identify:


In [18]:
# Analogy probe: blue is to democrat as red is to ?
embeddings.compute_and_print_analogy("blue","democrat","red")

blue:democrat:red:democratic:
blue:democrat:red:senator:
blue:democrat:red:republican:


In [19]:
# Analogy probe: fast is to fastest as young is to ?
embeddings.compute_and_print_analogy("fast","fastest","young")

fast:fastest:young:youngest:
fast:fastest:young:sixth:
fast:fastest:young:fourth:
fast:fastest:young:fifth:


In [20]:
# Analogy probe: trans is to lesbian as gay is to ?
embeddings.compute_and_print_analogy("trans","lesbian","gay")

trans:lesbian:gay:lgbt:
trans:lesbian:gay:transgender:


In [23]:
# Analogy probe: science is to computer as ai is to ?
embeddings.compute_and_print_analogy("science","computer","ai")

science:computer:ai:user:
science:computer:ai:lets:
science:computer:ai:programmed:
science:computer:ai:iphone:


## Continuous Bag Of Words Embeddings

In [73]:
from torch.utils.data import Dataset,DataLoader
import pandas as pd
import numpy as np
class CBOWDataset(Dataset):
    """PyTorch dataset over a CBOW dataframe with train/val/test splits."""

    def __init__(self, cbow_df, vectorizer):
        """
        Args:
            cbow_df (pandas.DataFrame): the dataset; the code reads its
                ``context``, ``target`` and ``split`` columns
            vectorizer (CBOWVectorizer): vectorizer instantiated from dataset
        """
        self.cbow_df = cbow_df
        self._vectorizer = vectorizer

        # Longest context in space-separated tokens; every context is
        # vectorized to this length.
        self._max_seq_length = max(
            len(context.split(" ")) for context in cbow_df.context
        )

        self.train_df = self.cbow_df[self.cbow_df.split == 'train']
        self.val_df = self.cbow_df[self.cbow_df.split == 'val']
        self.test_df = self.cbow_df[self.cbow_df.split == 'test']

        self.train_size = len(self.train_df)
        self.validation_size = len(self.val_df)
        self.test_size = len(self.test_df)

        # split name -> (rows of that split, number of rows)
        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.validation_size),
            'test': (self.test_df, self.test_size),
        }

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, cbow_cls):
        """Load the dataset and build a new vectorizer from scratch.

        The vectorizer is built from the rows of the "train" split only.

        Args:
            cbow_cls (str): location of the dataset CSV file

        Returns:
            an instance of CBOWDataset
        """
        full_df = pd.read_csv(cbow_cls)
        training_rows = full_df[full_df.split == "train"]
        vectorizer = CBOWVectorizer.from_dataframe(training_rows)
        return cls(full_df, vectorizer)

    def get_vectorizer(self):
        """Return the vectorizer."""
        return self._vectorizer

    def set_split(self, split="train"):
        """Select the active split using the split column of the dataframe.

        Args:
            split (str): one of "train", "val", "test"
        """
        self._target_split = split
        frame, size = self._lookup_dict[split]
        self._target_df = frame
        self._target_size = size

    def __len__(self):
        """Return the number of rows in the active split."""
        return self._target_size

    def __getitem__(self, index):
        """Primary entry point used by PyTorch to fetch one data point.

        Args:
            index (int): position of the data point within the active split

        Returns:
            a dictionary holding the data point's features (x_data)
            and label (y_data)
        """
        sample = self._target_df.iloc[index]
        return {
            'x_data': self._vectorizer.vectorize(sample.context,
                                                 self._max_seq_length),
            'y_data': self._vectorizer.cbow_vocab.lookup_token(sample.target),
        }

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of full batches in the dataset.

        Args:
            batch_size (int)

        Returns:
            number of batches in the active split
        """
        return len(self) // batch_size

## Vocabulario 

In [74]:
class Vocabulary(object):
    """Process text and extract a vocabulary mapping tokens to indices."""

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            mask_token (str): token added on construction; its index is
                stored in ``mask_index``
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the vocabulary
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token
        self.mask_index = self.add_token(self._mask_token)
        # -1 marks "no UNK token"; replaced by a real index when add_unk is set.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """Return a dictionary that can be serialized."""
        return {"token_to_idx": self._token_to_idx,
                "add_unk": self._add_unk,
                "unk_token": self._unk_token,
                "mask_token": self._mask_token}

    @classmethod
    def from_serializable(cls, contents):
        """Instantiate the vocabulary from a serialized dictionary."""
        return cls(**contents)

    def add_token(self, token):
        """Update the mapping dictionaries based on the token.

        Args:
            token (str): the item to add into the vocabulary

        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token,
        or the UNK index if the token is not present.

        Args:
            token (str): the token to look up

        Returns:
            index (int): the index corresponding to the token

        Raises:
            KeyError: if the token is absent and the vocabulary was built
                with ``add_unk=False``
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index.

        Args:
            index (int): the index to look up

        Returns:
            token (str): the token corresponding to the index

        Raises:
            KeyError: if the index is not in the vocabulary
        """
        if index not in self._idx_to_token:
            # Format generically: "%d" raised TypeError for non-integer
            # indices, hiding the documented KeyError.
            raise KeyError(f"El indice ({index})no está en el vocabulario")
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self._token_to_idx)

## El vectorizador

In [75]:
from collections import Counter
import string
class CBOWVectorizer(object):
    """Vectorizer that turns a context string into vocabulary indices."""

    def __init__(self, cbow_vocab):
        """
        Args:
            cbow_vocab (Vocabulary): maps words to integers
        """
        self.cbow_vocab = cbow_vocab

    def vectorize(self, context, vector_length=-1):
        """Create a vector of token indices for the context.

        Args:
            context (str): the string of words separated by a space
            vector_length (int): length of the returned vector; a negative
                value means "exactly as long as the context". It should be
                at least the number of tokens in the context.

        Returns:
            the vectorized context (numpy.ndarray of int64): the token
            indices followed by ``mask_index`` padding up to vector_length
        """
        indices = [self.cbow_vocab.lookup_token(token)
                   for token in context.split(" ")]
        if vector_length < 0:
            vector_length = len(indices)
        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        # Pad the remainder with the mask token's index.
        out_vector[len(indices):] = self.cbow_vocab.mask_index
        return out_vector

    @classmethod
    def from_dataframe(cls, cbow_df):
        """Instantiate the vectorizer from the dataset dataframe.

        Every context token and every target of every row is added to a
        fresh vocabulary, in row order.

        Args:
            cbow_df (pandas.DataFrame): the dataset; the code reads its
                ``context`` and ``target`` columns

        Returns:
            an instance of the CBOWVectorizer
        """
        cbow_vocab = Vocabulary()
        # Walk the two columns directly; only these values are needed.
        for context, target in zip(cbow_df.context, cbow_df.target):
            for token in context.split(" "):
                cbow_vocab.add_token(token)
            cbow_vocab.add_token(target)
        return cls(cbow_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Instantiate a CBOWVectorizer from a serializable dictionary.

        Args:
            contents (dict): the serializable dictionary; its "cbow_vocab"
                entry holds the serialized vocabulary

        Returns:
            an instance of the CBOWVectorizer class
        """
        cbow_vocab = Vocabulary.from_serializable(contents["cbow_vocab"])
        return cls(cbow_vocab)

    def to_serializable(self):
        """Create the serializable dictionary for caching.

        Returns:
            contents (dict): the serializable dictionary, in the layout
            that ``from_serializable`` reads
        """
        return {"cbow_vocab": self.cbow_vocab.to_serializable()}

In [76]:
import torch.nn as nn
import torch.functional as F
import torch
import torch.optim as optim
class CBOWClassifier(nn.Module):
    """CBOW model: summed context embeddings fed to one linear layer."""

    def __init__(self, vocabulary_size, embedding_size, padding_idx=0):
        """
        Args:
            vocabulary_size (int): number of vocabulary items, controls the
                number of embeddings and prediction vector size
            embedding_size (int): size of the embeddings
            padding_idx (int): default 0; Embedding will not use this index
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size,
                                      embedding_dim=embedding_size,
                                      padding_idx=padding_idx)
        self.fc1 = nn.Linear(in_features=embedding_size,
                             out_features=vocabulary_size)

    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the classifier.

        Args:
            x_in (torch.Tensor): an input tensor of token indices;
                the embeddings are summed over dimension 1
            apply_softmax (bool): a flag for the softmax activation;
                should be False if used with the cross-entropy losses

        Returns:
            the resulting tensor, one score per vocabulary item for each
            row of the batch
        """
        x_embedded_sum = self.embedding(x_in).sum(dim=1)
        y_out = self.fc1(x_embedded_sum)
        if apply_softmax:
            # Use nn.functional: the file's `F` alias points at
            # torch.functional, which does not provide softmax.
            y_out = nn.functional.softmax(y_out, dim=1)
        return y_out

In [77]:
def make_train_state(args):
    """Build the initial bookkeeping dictionary for a training run.

    Args:
        args: object exposing a ``learning_rate`` attribute

    Returns:
        dict with early-stopping fields, the learning rate, the epoch index,
        empty per-epoch loss/accuracy histories and -1 test placeholders
    """
    state = dict(
        stop_early=False,
        early_stopping_step=0,
        early_stopping_best_val=1e8,
        learning_rate=args.learning_rate,
        epoch_index=0,
    )
    # Each history gets its own fresh list.
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history_key] = []
    state.update(test_loss=-1, test_acc=-1)
    return state
def compute_accuracy(y_pred, y_target):
    """Return the percentage of rows whose highest-scoring class is the target.

    Args:
        y_pred (torch.Tensor): scores; the predicted class is the position
            of the maximum along dimension 1
        y_target (torch.Tensor): the expected class indices

    Returns:
        float: accuracy in the range 0-100
    """
    predicted_classes = y_pred.max(dim=1).indices
    hits = (predicted_classes == y_target).sum().item()
    return hits / len(predicted_classes) * 100
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Generator that wraps the PyTorch DataLoader.

    Every tensor of each batch dictionary is moved to ``device`` before the
    batch is yielded.

    Args:
        dataset: the dataset handed to the DataLoader
        batch_size (int): number of items per batch
        shuffle (bool): passed through to the DataLoader
        drop_last (bool): passed through to the DataLoader
        device: target device for every tensor of the batch

    Yields:
        dict: same keys as the DataLoader's batch, tensors on ``device``
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)
    for batch in loader:
        yield {name: tensor.to(device) for name, tensor in batch.items()}

In [78]:
from argparse import Namespace
# Experiment configuration shared by the cells below.
args = Namespace(
    # Data and Path information
    cbow_csv="data/books/frankenstein_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch5/cbow",
    # Model hyper parameters
    embedding_size=300,
    # Training hyper parameters
    seed=1337,
    num_epochs=100,
    learning_rate=0.0001,
    batch_size=128,
    early_stopping_criteria=5,
    # Runtime options
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

# Entrenamiento

In [83]:
# Fresh bookkeeping for this run.
train_state = make_train_state(args)

# Fall back to the CPU when CUDA is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

# Dataset and vectorizer
dataset = CBOWDataset.load_dataset_and_make_vectorizer(args.cbow_csv)
vectorizer = dataset.get_vectorizer()

# Model: one output per vocabulary entry, placed on the selected device
classifier = CBOWClassifier(
    vocabulary_size=len(vectorizer.cbow_vocab),
    embedding_size=args.embedding_size,
)
classifier = classifier.to(args.device)

# Loss and optimizer
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

In [85]:
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # Iterate over the training split.
    # Setup: batch generator, loss and accuracy reset to 0, train mode on.
    dataset.set_split("train")
    batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        # The training routine is made of 5 steps.

        # Step 1: zero the gradients
        optimizer.zero_grad()
        # Step 2: compute the output
        y_pred = classifier(x_in=batch_dict["x_data"])
        # Step 3: compute the loss and fold it into the running mean
        loss = loss_func(y_pred, batch_dict["y_data"])
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # Step 4: use the loss to produce the gradients
        loss.backward()
        # Step 5: let the optimizer take the gradient step
        optimizer.step()

        # Compute the accuracy and fold it into the running mean
        acc_batch = compute_accuracy(y_pred, batch_dict["y_data"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)

    # Iterate over the validation split.
    # Setup: batch generator, loss and accuracy reset to 0, eval mode on.
    dataset.set_split("val")
    batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.eval()
    # No parameter update happens here, so skip gradient tracking.
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            # Step 1: compute the output
            y_pred = classifier(x_in=batch_dict["x_data"])
            # Step 2: compute the loss
            loss = loss_func(y_pred, batch_dict['y_data'])
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # Step 3: compute the accuracy
            acc_batch = compute_accuracy(y_pred, batch_dict['y_data'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)
    train_state["val_loss"].append(running_loss)
    train_state["val_acc"].append(running_acc)

In [86]:
# Evaluate the trained classifier on the held-out test split.
dataset.set_split("test")
batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()
print("comienza la evaluacion")
# Evaluation only: skip gradient tracking.
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        print("batch: ", batch_index)
        # Compute the output
        y_pred = classifier(x_in=batch_dict["x_data"])
        # Compute the loss and fold it into the running mean
        loss = loss_func(y_pred, batch_dict["y_data"])
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # Compute the accuracy and fold it into the running mean
        acc_batch = compute_accuracy(y_pred, batch_dict["y_data"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state["test_loss"] = running_loss
train_state["test_acc"] = running_acc

comienza la evaluacion
batch:  0
batch:  1
batch:  2
batch:  3
batch:  4
batch:  5
batch:  6
batch:  7
batch:  8
batch:  9
batch:  10
batch:  11
batch:  12
batch:  13
batch:  14
batch:  15
batch:  16
batch:  17
batch:  18
batch:  19
batch:  20
batch:  21
batch:  22
batch:  23
batch:  24
batch:  25
batch:  26
batch:  27
batch:  28
batch:  29
batch:  30
batch:  31
batch:  32
batch:  33
batch:  34
batch:  35
batch:  36
batch:  37
batch:  38
batch:  39
batch:  40
batch:  41
batch:  42
batch:  43
batch:  44
batch:  45
batch:  46
batch:  47
batch:  48
batch:  49
batch:  50
batch:  51
batch:  52
batch:  53
batch:  54
batch:  55
batch:  56
batch:  57
batch:  58
batch:  59
batch:  60
batch:  61
batch:  62
batch:  63
batch:  64
batch:  65
batch:  66
batch:  67
batch:  68
batch:  69
batch:  70
batch:  71
batch:  72
batch:  73
batch:  74
batch:  75
batch:  76
batch:  77
batch:  78
batch:  79
batch:  80
batch:  81
batch:  82
batch:  83
batch:  84
batch:  85
batch:  86
batch:  87
batch:  88
batch:  

## Este modelo solo tiene el detalle de inferir la siguiente palabra
> No tiene buen accuracy, debido a que es un libro con solo 70000 palabras, y los embeddings desde 0 se hacen con cientos de teras, y es necesario, para que un modelo