# Notas sobre los programas y eficiencia

[Acá](https://www.codementor.io/@satwikkansal/python-practices-for-efficient-code-performance-memory-and-usability-aze6oiq65) hay un link muy interesante sobre eficiencia en python.

Me gustaría pensar en cosas para acelerar mis programas, y para eso tendría que saber cosas de programación como eficiencia y esas cosas de informáticos.


## El corpus

Un corpus de texto es un conjunto de documentos, cada uno de los cuales es, a su vez, una secuencia de tokens. Por ejemplo:

```Python
[['This', 'is', 'the', 'pencil', 'of', 'Esther', 'Píscore', '.'],
 ['<START>','Duerma', ',', 'Don', 'Rodrigo','<END>']]
```

Siempre puedo representar al corpus como una lista de listas de strings.

Un corpus puede ser obtenido de un archivo csv, una string, un dataframe, un archivo txt, o de donde fuere, pero siempre va a tener la misma forma. En general, para obtener un corpus hay que hacer un proceso de "limpieza" de texto (tokenization, lemmatization, stemming y esas cosas) que no viene al caso ahora, pero que siempre me termina dando un objeto con esas características.

Un corpus puede ser un conjunto de ejemplos de un campo, como cuando hago text classification, o un conjunto de documentos preparados para entrenar word vectors, entre otras cosas.

Por otra parte, yo voy a querer, junto con el corpus, un conjunto de instrucciones para transformar el corpus en un conjunto de muestras vectorizadas que pueda ingresar a mi modelo para entrenarlo. Para eso necesito en el 99.9% de los casos un vocabulario, es decir, un mapeo de tokens a índices que me permite identificar, posteriormente, cada token (único) con un vector.


In [None]:
class Corpus(object):
    """Corpus stored as a list of documents, each a list of vocabulary indices.

    The constructor receives the corpus as a list of lists of string tokens,
    builds a ``Vocabulary`` from it and replaces every token by its index.
    ``Vocabulary`` is not defined in this class; it must provide
    ``from_list_corpus``, ``token_to_index``, ``index_to_token``, ``max_idx``,
    ``unk_token`` and support the ``in`` operator.
    """

    # Separator markers kept as public class attributes for compatibility;
    # the constructors below build token lists directly and do not need them.
    token_sep = '<TS>'
    doc_sep = '<DS>'

    @staticmethod
    def default_tokenizer(s):
        """Split ``s`` on single spaces and return the list of pieces."""
        return s.split(' ')

    def __init__(self, data):
        """Index ``data`` (a list of lists of tokens) with a new vocabulary."""
        self.vocabulary = Vocabulary.from_list_corpus(data)
        self.docs_num = len(data)
        self.tokens_num = sum(len(doc) for doc in data)
        self.data = [[self.vocabulary.token_to_index(tk) for tk in doc] for doc in data]

        self.max_idx = self.tokens_num

    @staticmethod
    def _as_list(item_or_items):
        """Return ``[x]`` for a single string ``x``, otherwise ``list(x)``."""
        if isinstance(item_or_items, str):
            return [item_or_items]
        return list(item_or_items)

    @classmethod
    def _from_texts(cls, texts, tokenizer):
        """Tokenize every text (one document each) and build the corpus."""
        if tokenizer is None:
            tokenizer = cls.default_tokenizer
        # list() so that tokenizers returning iterators can be traversed twice
        return cls([list(tokenizer(text)) for text in texts])

    @classmethod
    def from_binary_files(cls, filenames, decode='utf-8', tokenizer=None):
        """Build a corpus from files opened in binary mode.

        Args:
            filenames: a path or a list of paths; each file is one document.
            decode: codec used to decode the bytes of each file.
            tokenizer: callable mapping a string to its tokens; defaults to
                ``default_tokenizer``.
        """
        texts = []
        for filename in cls._as_list(filenames):
            with open(filename, 'rb') as file:
                texts.append(file.read().decode(decode))
        return cls._from_texts(texts, tokenizer)

    @classmethod
    def from_text_files(cls, filenames, tokenizer=None):
        """Build a corpus from files opened in text mode.

        Args:
            filenames: a path or a list of paths; each file is one document.
            tokenizer: callable mapping a string to its tokens; defaults to
                ``default_tokenizer``.
        """
        texts = []
        for filename in cls._as_list(filenames):
            # Text mode already yields str, so no decoding step is needed.
            with open(filename, 'r') as file:
                texts.append(file.read())
        return cls._from_texts(texts, tokenizer)

    @classmethod
    def from_strings(cls, texts, tokenizer=None):
        """Build a corpus from a string or a list of strings (one per document)."""
        return cls._from_texts(cls._as_list(texts), tokenizer)

    @classmethod
    def from_csv_file(cls, filename, fieldname, tokenizer=None, **kwargs):
        """Build a corpus from column ``fieldname`` of a CSV file.

        Extra keyword arguments are forwarded to ``pd.read_csv``. Every cell
        of the column is tokenized as one document.
        """
        ds = pd.read_csv(filename, **kwargs)[fieldname]
        return cls._from_texts(ds.tolist(), tokenizer)

    def __repr__(self):
        return "Corpus object\nNumber of docs = {}\nNumber of tokens = {}".format(self.docs_num, self.tokens_num)

    def _token_for(self, idx):
        """Map an index to its token, using ``unk_token`` for ``vocabulary.max_idx``."""
        if idx == self.vocabulary.max_idx:
            return self.vocabulary.unk_token
        return self.vocabulary.index_to_token(idx)

    def __str__(self):
        """Preview up to five documents, one per line.

        Documents longer than five tokens show their first four tokens
        followed by ``...``; a final ``...`` line marks omitted documents.
        """
        num_print_docs = min(self.docs_num, 5)
        lines = []
        for doc in self.data[:num_print_docs]:
            if len(doc) <= 5:
                lines.append(repr([self._token_for(idx) for idx in doc]))
            else:
                # Drop the closing bracket of the repr to append the ellipsis.
                lines.append(repr([self._token_for(idx) for idx in doc[:4]])[:-1] + ', ...]')
        if num_print_docs != self.docs_num:
            lines.append('...')
        return '\n'.join(lines)

    def __len__(self):
        """Return the total number of tokens in the corpus."""
        return self.tokens_num

    def __getitem__(self, tk_or_idx):
        """Return a document by position, or the positions of a token.

        An ``int`` returns the document (list of indices) at that position.
        A ``str`` returns the within-document positions at which that token
        occurs, concatenated over all documents.
        NOTE(review): the original returned every position regardless of the
        token; filtering by the token is assumed to be the intent - confirm.

        Raises:
            KeyError: if the key is neither an integer nor a string.
        """
        if isinstance(tk_or_idx, int):
            return self.data[tk_or_idx]
        if isinstance(tk_or_idx, str):
            target = self.vocabulary.token_to_index(tk_or_idx)
            return [i for doc in self.data for i, idx in enumerate(doc) if idx == target]
        raise KeyError('{} must be either integer or string'.format(tk_or_idx))

    def __iter__(self):
        """Iterate over every token of the corpus, document after document."""
        return (self.vocabulary.index_to_token(idx) for doc in self.data for idx in doc)

    def __contains__(self, key):
        """Return whether ``key`` is in the vocabulary."""
        return key in self.vocabulary

In [None]:
def vectorize_word_context(corpus, vocabulary, left_window=2, right_window=2, split_contexts=0):
    """Build (word, context) index tensors from an indexed corpus.

    Every document is padded with ``len(vocabulary)`` (used as the unknown
    index) on both sides, so each position has ``left_window`` indices before
    it and ``right_window`` after it. Samples whose word is the unknown index,
    or whose context consists only of unknown indices, are dropped.

    Args:
        corpus: iterable of documents, each a sequence of integer indices.
            The documents are not modified.
        vocabulary: sized object; only ``len(vocabulary)`` is used.
        left_window: number of context indices taken before each word.
        right_window: number of context indices taken after each word.
        split_contexts: how each context is grouped.
            ``0`` keeps one row of ``left_window + right_window`` indices per
            word; ``-1`` emits one (word, context index) pair per context
            position; a positive divisor ``k`` of the context size splits
            every context into rows of ``k`` indices, repeating the word.

    Returns:
        Tuple ``(words, contexts)`` of ``torch.long`` tensors after masking.

    Raises:
        RuntimeError: if ``split_contexts`` is negative and not ``-1``, or is
            positive and does not divide the context size.
    """
    unk_token_idx = len(vocabulary)
    context_size = left_window + right_window

    # Validate the grouping up front, before doing any work.
    if split_contexts not in (0, -1):
        if split_contexts < 0:
            raise RuntimeError('El tamaño del contexto debe ser positivo o igual a -1')
        if context_size % split_contexts != 0:
            raise RuntimeError('Los tamaños de los contextos deben ser iguales')

    left_pad = [unk_token_idx] * left_window
    right_pad = [unk_token_idx] * right_window
    word_ids = []
    context_ids = []
    for doc in corpus:
        doc = list(doc)
        # Pad a copy instead of inserting into / popping from the caller's list.
        padded = left_pad + doc + right_pad
        # Explicit index range: a negative-stop slice would be empty when
        # right_window == 0.
        for i in range(left_window, left_window + len(doc)):
            word_ids.append(padded[i])
            context_ids.append(padded[i - left_window:i] + padded[i + 1:i + right_window + 1])

    # Explicit dtype and shape keep the tensors well-formed for an empty corpus.
    words = torch.tensor(word_ids, dtype=torch.long)
    contexts = torch.tensor(context_ids, dtype=torch.long).reshape(len(word_ids), context_size)

    if split_contexts == 0:
        mask = (words != unk_token_idx) * (contexts != unk_token_idx).any(dim=1)
    elif split_contexts == -1:
        words = words.view(-1, 1).repeat(1, context_size).reshape(-1)
        contexts = contexts.reshape(-1)
        mask = (words != unk_token_idx) * (contexts != unk_token_idx)
    else:
        words = words.view(-1, 1).repeat(1, context_size // split_contexts).reshape(-1)
        contexts = contexts.reshape(words.shape[0], split_contexts)
        mask = (words != unk_token_idx) * (contexts != unk_token_idx).any(dim=1)

    return words[mask], contexts[mask]
    
import itertools
from collections import Counter

# Toy corpus of string tokens used to try out vectorize_word_context.
corpus = [['w1', 'w2', 'w3'],['w1', 'w2', 'w3', 'w1', 'w2'], ['w4', 'w4'], ['w5'],
          ['w1', 'w2', 'w2', 'w3'],['w1', 'w2', 'w3', 'w1', 'w2'], ['w4', 'w4'], ['w5']]

# Token frequencies; len(vocabulary) is the number of distinct tokens.
vocabulary = Counter(itertools.chain.from_iterable(corpus))

# vectorize_word_context builds integer tensors, so the string tokens are
# first replaced by indices in [0, len(vocabulary)).
token_to_index = {token: index for index, token in enumerate(vocabulary)}
indexed_corpus = [[token_to_index[token] for token in doc] for doc in corpus]
vectorize_word_context(indexed_corpus, vocabulary, left_window=2, right_window=2, split_contexts=0)

In [None]:
class Corpus(object):
    """Corpus stored as a single string.

    ``data`` holds the documents joined by ``doc_sep``; inside each document
    the tokens are joined by ``token_sep``. Tokens that contain either
    separator cannot be represented. An empty ``data`` string is treated as a
    corpus with no documents.
    """

    token_sep = '<TS>'
    doc_sep = '<DS>'

    @staticmethod
    def default_tokenizer(s):
        """Split ``s`` on single spaces and return the list of pieces."""
        return s.split(' ')

    def __init__(self, data):
        """Store ``data``, the separator-joined string described on the class."""
        self.data = data

    @staticmethod
    def _as_list(item_or_items):
        """Return ``[x]`` for a single string ``x``, otherwise ``list(x)``."""
        if isinstance(item_or_items, str):
            return [item_or_items]
        return list(item_or_items)

    @classmethod
    def _from_texts(cls, texts, tokenizer):
        """Tokenize every text (one document each) and join them into ``data``."""
        if tokenizer is None:
            tokenizer = cls.default_tokenizer
        return cls(cls.doc_sep.join(cls.token_sep.join(tokenizer(text)) for text in texts))

    @classmethod
    def from_binary_files(cls, filenames, decode='utf-8', tokenizer=None):
        """Build a corpus from files opened in binary mode.

        Args:
            filenames: a path or a list of paths; each file is one document.
            decode: codec used to decode the bytes of each file.
            tokenizer: callable mapping a string to its tokens; defaults to
                ``default_tokenizer``.
        """
        texts = []
        for filename in cls._as_list(filenames):
            with open(filename, 'rb') as file:
                texts.append(file.read().decode(decode))
        return cls._from_texts(texts, tokenizer)

    @classmethod
    def from_text_files(cls, filenames, tokenizer=None):
        """Build a corpus from files opened in text mode (one document each)."""
        texts = []
        for filename in cls._as_list(filenames):
            with open(filename, 'r') as file:
                texts.append(file.read())
        return cls._from_texts(texts, tokenizer)

    @classmethod
    def from_strings(cls, texts, tokenizer=None):
        """Build a corpus from a string or a list of strings (one per document)."""
        return cls._from_texts(cls._as_list(texts), tokenizer)

    @classmethod
    def from_csv_file(cls, filename, fieldname, tokenizer=None, **kwargs):
        """Build a corpus from column ``fieldname`` of a CSV file.

        Extra keyword arguments are forwarded to ``pd.read_csv``. Every cell
        of the column is tokenized as one document.
        """
        ds = pd.read_csv(filename, **kwargs)[fieldname]
        return cls._from_texts(ds.tolist(), tokenizer)

    def _doc_strings(self):
        """Return the documents as separator-joined strings."""
        if not self.data:
            return []
        return self.data.split(self.doc_sep)

    @property
    def docs_num(self):
        """Number of documents in the corpus."""
        return len(self._doc_strings())

    @property
    def tokens_num(self):
        """Total number of tokens over all documents."""
        return sum(len(doc) for doc in self)

    def __repr__(self):
        return "Corpus object\nNumber of docs = {}\nNumber of tokens = {}".format(self.docs_num, self.tokens_num)

    def __str__(self):
        """Preview up to five documents, one per line.

        Documents longer than five tokens show their first four tokens
        followed by ``...``; a final ``...`` line marks omitted documents.
        """
        docs = self._doc_strings()
        lines = []
        for doc in docs[:5]:
            tokens = doc.split(self.token_sep)
            if len(tokens) <= 5:
                lines.append(repr(tokens))
            else:
                # Drop the closing bracket of the repr to append the ellipsis.
                lines.append(repr(tokens[:4])[:-1] + ', ...]')
        if len(docs) > 5:
            lines.append('...')
        return '\n'.join(lines)

    def __len__(self):
        """Return the total number of tokens in the corpus."""
        return self.tokens_num

    def __getitem__(self, tk_or_idx):
        """Return a document by position, or the positions of a token.

        An ``int`` returns the document at that position as a list of tokens.
        A ``str`` returns the within-document positions at which that token
        occurs, concatenated over all documents.
        NOTE(review): the original returned every position regardless of the
        token; filtering by the token is assumed to be the intent - confirm.

        Raises:
            KeyError: if the key is neither an integer nor a string.
        """
        if isinstance(tk_or_idx, int):
            return self._doc_strings()[tk_or_idx].split(self.token_sep)
        if isinstance(tk_or_idx, str):
            return [i for doc in self for i, tk in enumerate(doc) if tk == tk_or_idx]
        raise KeyError('{} must be either integer or string'.format(tk_or_idx))

    def __iter__(self):
        """Iterate over the documents, each yielded as a list of tokens."""
        # Documents are split first, then the tokens inside each document.
        return (doc.split(self.token_sep) for doc in self._doc_strings())

    def __contains__(self, key):
        """Return whether ``key`` occurs as a token in any document."""
        return any(key in doc for doc in self)
    

# Load one text file as a corpus and preview the first 100 tokens of each document.
corpus = Corpus.from_text_files('../Utils/Datasets/wiki2018/parts/xaa')
# corpus.data is a single string (documents joined by doc_sep, tokens by
# token_sep), so it is split explicitly; iterating the string itself would
# walk over its characters.
for doc in corpus.data.split(Corpus.doc_sep):
    print(doc.split(Corpus.token_sep)[:100])

# Intento 2:

In [None]:
class TextClassificationDataset(Dataset):
    """Skeleton of a text-classification dataset.

    NOTE(review): ``Dataset`` is presumably ``torch.utils.data.Dataset`` -
    confirm against the file's imports.
    """

    def __init__(self):
        """Create the dataset; no state is stored yet."""

    @classmethod
    def from_csv(cls, filename, preprocessing=None, **kwargs):
        """Read a CSV file and return a new, empty dataset instance.

        Args:
            filename: path handed to ``pd.read_csv``.
            preprocessing: optional callable applied to the loaded DataFrame.
            **kwargs: forwarded to ``pd.read_csv``.

        The loaded (and optionally preprocessed) frame is currently not
        passed on to the constructor.
        """
        frame = pd.read_csv(filename, **kwargs)
        frame = frame if preprocessing is None else preprocessing(frame)
        return cls()