$$\textbf{PLN. Práctica 5: Bengio Model}$$
$$\textit{Y. Sarahi García Gozález}$$

<font size=4.5 color='lightblue'>

$\textit{Librerías}$

In [84]:
#tools
import os
import time
import shutil
import random
from typing import Tuple 
from argparse import Namespace #objeto que me ayude a guardar espacio de variables
import matplotlib.pyplot as plt
import string

#preprocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk import FreqDist
from nltk import FreqDist
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd

#pyTorch
import torch
from torch.utils.data import dataloader,TensorDataset
import torch.nn as nn
import torch.nn.functional as F

#scikit-learn
import sklearn
from sklearn.metrics import accuracy_score


In [85]:
from platform import python_version

# Report the interpreter and library versions used to produce this notebook.
print("Tarea realizada en MacOs. \nLas versiones de las librerías y de python utilizadas fueron:\n")
for label, version in (
    ("Python version:", python_version()),
    ("NumPy version:", np.__version__),
    ("NLTK version:", nltk.__version__),
    ("Pandas version:", pd.__version__),
    ("Scikit-learn version:", sklearn.__version__),
    ("Pythorch version:", torch.__version__),
):
    # print() joins its two arguments with a single space, as the original calls did.
    print(label, version)

Tarea realizada en MacOs. 
Las versiones de las librerías y de python utilizadas fueron:

Python version: 3.11.0
NumPy version: 1.23.5
NLTK version: 3.8.1
Pandas version: 2.1.4
Scikit-learn version: 1.3.0
Pythorch version: 2.2.1


$\textit{Preparación de texto, corpus y diccionarios}$

In [86]:
# Reproducibility: seed every random number generator the notebook relies on
# with the same constant value.
seed = 111
random.seed(seed)        # Python standard-library RNG
np.random.seed(seed)     # NumPy global RNG
torch.manual_seed(seed)  # PyTorch RNG
# Disable the cuDNN autotuner (it may pick different kernels between runs) and
# additionally request deterministic cuDNN kernels; both flags are harmless
# when no CUDA device is present.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [87]:
def _read_tweets(path):
    """Load the first column of the text file at ``path`` as a Python list."""
    # The file is parsed with the python engine, no header row and a CR+LF
    # separator; presumably this keeps each line as a single column (one tweet
    # per line) -- TODO confirm against the data files.
    table = pd.read_csv(path, sep='\r\n', engine='python', header=None)
    first_column = table.iloc[:, 0]
    return first_column.values.tolist()


# Training split as a list of tweets.
X_train = _read_tweets("/Users/ely/Documents/Maestria/segundo_semestre/cimat2023-1/lenguaje/practicas/03_practica/mex20_train.txt")
# Validation split as a list of tweets.
X_val = _read_tweets("/Users/ely/Documents/Maestria/segundo_semestre/cimat2023-1/lenguaje/practicas/03_practica/mex20_val.txt")


In [88]:
# Parameter namespace for the notebook.
# N is the order of the language model (4 -> tetragram model).
args = Namespace(N=4)

In [97]:
class NgramData():
    """Vocabulary builder and N-gram window extractor for a corpus of raw documents.

    After ``fit`` the instance exposes:
      * ``vocab``  -- set of kept lowercase words plus the three special tokens,
      * ``w2id`` / ``id2w`` -- word <-> integer id mappings,
      * ``embeddings_matrix`` -- only when an embeddings model was supplied; one
        row per vocabulary entry.
    """

    ## Constructor ##
    def __init__(self, N: int, vocab_max: int = 5000, tokenizer=None, embeddings_model=None):
        """Store the model configuration.

        Args:
            N: order of the N-gram model (each window has N-1 context ids + 1 target id).
            vocab_max: maximum vocabulary size, special tokens included.
            tokenizer: callable mapping a document string to a list of tokens;
                falls back to ``default_tokenizer`` when omitted.
            embeddings_model: optional pre-trained vectors. ``fit`` uses it through
                ``vector_size``, ``word in model`` and ``model[word]``.
        """
        # Tokenizer (a whitespace split is used when none is supplied).
        self.tokenizer = tokenizer if tokenizer else self.default_tokenizer
        # Punctuation to discard, extended with Spanish marks and the user placeholder.
        self.punct = set(string.punctuation + '¡¿«»')
        self.punct.add('@USUARIO')
        self.punct.add('@usuario')
        # Model order.
        self.N = N
        # Maximum vocabulary size.
        self.vocab_max = vocab_max
        # Special tokens. The raw string keeps the exact original value
        # ('<', backslash, 's', '>') without relying on an invalid escape sequence.
        self.UNK = "<unk>"
        self.SOS = "<s>"
        self.EOS = r"<\s>"
        # Pre-trained vectors (attribute name kept as-is for existing users of the class).
        self.embeddings_mode = embeddings_model

    ## Methods ##
    def default_tokenizer(self, doc: str) -> list:
        """Fallback tokenizer: split the document on single spaces."""
        return doc.split(" ")

    def remove_word(self, word: str) -> bool:
        """Return True when ``word`` (lowercased) is numeric or listed in ``self.punct``."""
        word = word.lower()
        return word.isnumeric() or word in self.punct

    def sortFreqDict(self, freq_dist) -> list:
        """Return the keys of ``freq_dist`` ordered from most to least frequent."""
        freq_dict = dict(freq_dist)
        return sorted(freq_dict, key=freq_dict.get, reverse=True)

    def get_vocab(self, corpus: list) -> set:
        """Return the most frequent lowercase words of ``corpus``.

        Three slots of ``vocab_max`` are reserved for the special tokens.
        """
        freq_dist = FreqDist(
            [w.lower()
             for sentence in corpus
             for w in self.tokenizer(sentence)
             if not self.remove_word(w)]
        )
        # Clamp at zero so a tiny vocab_max cannot turn into a negative slice bound.
        n_words = max(self.vocab_max - 3, 0)
        return set(self.sortFreqDict(freq_dist)[:n_words])

    def fit(self, corpus: list) -> None:
        """Extract the vocabulary from ``corpus`` and build the id mappings.

        Regular words receive ids in order of first appearance in the corpus;
        the special tokens UNK, SOS and EOS receive the three ids that follow.
        When an embeddings model was supplied, ``embeddings_matrix`` is filled
        for every id: known words are copied from the model, every other entry
        (including the special tokens) is drawn from ``np.random.rand``.
        """
        specials = (self.UNK, self.SOS, self.EOS)
        # Keep special tokens out of the regular words so that a corpus token that
        # happens to equal one of them cannot receive two different ids.
        regular_words = self.get_vocab(corpus).difference(specials)
        self.vocab = regular_words.union(specials)

        # Mapping dictionaries.
        self.w2id = {}
        self.id2w = {}
        use_embeddings = self.embeddings_mode is not None
        if use_embeddings:
            dim = self.embeddings_mode.vector_size
            self.embeddings_matrix = np.empty([len(self.vocab), dim])

        next_id = 0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_ = word.lower()
                if word_ not in regular_words or word_ in self.w2id:
                    continue
                self.w2id[word_] = next_id
                self.id2w[next_id] = word_
                if use_embeddings:
                    if word_ in self.embeddings_mode:
                        self.embeddings_matrix[next_id] = self.embeddings_mode[word_]
                    else:
                        self.embeddings_matrix[next_id] = np.random.rand(dim)
                next_id += 1

        # The special tokens are always appended, in the order UNK, SOS, EOS.
        for special in specials:
            self.w2id[special] = next_id
            self.id2w[next_id] = special
            if use_embeddings:
                # np.empty leaves these rows uninitialised, so fill them explicitly.
                self.embeddings_matrix[next_id] = np.random.rand(dim)
            next_id += 1

    def get_ngram_doc(self, doc: str) -> list:
        """Return the list of N-gram tuples of one document.

        Tokens are lowercased, out-of-vocabulary tokens become UNK, and the
        sequence is padded with N-1 SOS tokens at the start and one EOS at the end.
        """
        # list() gives replace_unk a mutable copy even if the tokenizer returns a tuple.
        doc_tokens = self.replace_unk(list(self.tokenizer(doc)))
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS] * (self.N - 1) + doc_tokens + [self.EOS]
        return list(ngrams(doc_tokens, self.N))

    def replace_unk(self, doc_tokens: list) -> list:
        """Replace, in place, every token whose lowercase form is not in the vocabulary.

        Returns the same list object that was passed in.
        """
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocab:
                doc_tokens[i] = self.UNK
        return doc_tokens

    def transform(self, corpus: list) -> Tuple[np.ndarray, np.ndarray]:
        """Convert ``corpus`` into id arrays.

        Returns:
            A pair ``(contexts, targets)``: ``contexts`` has shape
            ``(n_windows, N-1)`` and ``targets`` has shape ``(n_windows,)``,
            both of integer dtype (also when the corpus is empty).
        """
        x_ngrams = []
        y_labels = []

        for doc in corpus:
            for words_window in self.get_ngram_doc(doc):
                window_ids = [self.w2id[w] for w in words_window]
                x_ngrams.append(window_ids[:-1])
                y_labels.append(window_ids[-1])

        # An explicit dtype and shape keep the result a 2-D integer array even
        # when no window was produced (np.array([]) would be 1-D float64).
        contexts = np.array(x_ngrams, dtype=np.int64).reshape(len(x_ngrams), self.N - 1)
        targets = np.array(y_labels, dtype=np.int64)
        return contexts, targets




In [99]:
# Tokenize with NLTK's TweetTokenizer and learn the vocabulary / id mappings
# from the training tweets (model order taken from args, at most 5000 entries).
tokenizer = TweetTokenizer()
ngram_data = NgramData(N=args.N, vocab_max=5000, tokenizer=tokenizer.tokenize)
ngram_data.fit(X_train)

In [108]:
# Convert both splits (training first, then validation) into
# (context ids, target id) arrays using the fitted vocabulary.
(x_ngram_train, y_ngram_train), (x_ngram_val, y_ngram_val) = (
    ngram_data.transform(split) for split in (X_train, X_val)
)

In [112]:
# Map the first 22 training context windows back from ids to words for inspection.
decoded_windows = []
for window_ids in x_ngram_train[:22]:
    decoded_windows.append([ngram_data.id2w[token_id] for token_id in window_ids])
decoded_windows

[['<s>', '<s>', '<s>'],
 ['<s>', '<s>', '<unk>'],
 ['<s>', '<unk>', '<unk>'],
 ['<unk>', '<unk>', '<unk>'],
 ['<unk>', '<unk>', 'q'],
 ['<unk>', 'q', 'se'],
 ['q', 'se', 'puede'],
 ['se', 'puede', 'esperar'],
 ['puede', 'esperar', 'del'],
 ['esperar', 'del', 'maricon'],
 ['del', 'maricon', 'de'],
 ['maricon', 'de', 'closet'],
 ['de', 'closet', 'de'],
 ['closet', 'de', 'la'],
 ['de', 'la', 'yañez'],
 ['la', 'yañez', 'aun'],
 ['yañez', 'aun', 'recuerdo'],
 ['aun', 'recuerdo', 'esa'],
 ['recuerdo', 'esa', 'ves'],
 ['esa', 'ves', 'q'],
 ['ves', 'q', 'lo'],
 ['q', 'lo', 'vi']]

#