# T5 - Juan Luis Baldelomar Cabrera

In [1]:
import nltk 
import numpy as np
import nltk
from nltk.probability import FreqDist
from nltk import TweetTokenizer
from nltk.corpus import stopwords

In [2]:
def load_data(filename, labels_filename):
    """Read a documents file and its labels file, one entry per line.

    Both files are read whole and split on '\\n'. The empty piece that a
    final newline leaves at the end of the split is discarded; a file that
    does not end with a newline keeps its last line.

    Args:
        filename: path of the text file holding one document per line.
        labels_filename: path of the text file holding one label per line.

    Returns:
        A tuple ``(documents, labels)`` of two lists of strings. Labels are
        returned as read (strings), with no conversion and no check that
        both lists have the same length.

    Raises:
        OSError: if either file cannot be opened or read.
    """
    def read_lines(path):
        # The context manager closes the handle even if read() fails.
        with open(path, 'r') as handle:
            lines = handle.read().split('\n')
        # Only drop the trailing piece when it is the empty string produced
        # by a final newline; never throw away a real last line.
        if lines[-1] == '':
            lines.pop()
        return lines

    return read_lines(filename), read_lines(labels_filename)

In [3]:
# Load the two splits stored under data/: documents and their labels from the
# mex_train files, then from the mex_val files. The names bound here
# (documents, labels, val_documents, val_labels) are reused by later cells.
documents, labels = load_data('data/mex_train.txt', 'data/mex_train_labels.txt')
val_documents, val_labels = load_data('data/mex_val.txt', 'data/mex_val_labels.txt')

In [4]:
def get_vocabulary(tokenized_docs, n):
    """Return the ``n`` most frequent tokens across all documents.

    Args:
        tokenized_docs: iterable of documents, each an iterable of tokens.
        n: how many tokens to keep; forwarded to ``FreqDist.most_common``.

    Returns:
        List of tokens ordered from most to least frequent.
    """
    # Flatten every document into one token stream before counting.
    flat_tokens = []
    for doc in tokenized_docs:
        flat_tokens.extend(doc)

    counts = FreqDist(flat_tokens)
    ranked = counts.most_common(n)
    # most_common yields (token, count) pairs; only the token is kept.
    return [pair[0] for pair in ranked]

def word2ids(vocabulary):
    """Build the word -> id and id -> word lookup tables for a vocabulary.

    Words receive consecutive ids starting at 0 in the order they first
    appear in ``vocabulary``. Three special tokens are then appended with
    the next three ids, in this order: '<s>', '</s>', '<unk>'.

    Args:
        vocabulary: iterable of hashable words. Repeated words are counted
            once, so the ids of the special tokens can never collide with
            the id of a real word.

    Returns:
        A tuple ``(word2id, id2word)`` of two dicts.
    """
    # De-duplicate while keeping first-seen order. Without this, a repeated
    # word makes the number of distinct words smaller than the largest id
    # already handed out, and the special tokens would overwrite real words.
    words = list(dict.fromkeys(vocabulary))

    # build both dictionaries
    word2id = {word: index for index, word in enumerate(words)}
    id2word = dict(enumerate(words))

    # add special tokens right after the last word id
    first_special_id = len(words)
    for offset, token in enumerate(('<s>', '</s>', '<unk>')):
        word2id[token] = first_special_id + offset
        id2word[first_special_id + offset] = token

    return word2id, id2word

In [5]:
class NGramBuilder:
    """Convert raw text documents into n-gram contexts and next-token targets.

    ``fit`` learns the vocabulary from a collection of documents and returns
    their n-gram encoding; ``transform`` encodes further documents with the
    vocabulary learned by ``fit``. Both apply the same preprocessing:
    lowercase, tokenize, drop punctuation tokens, then the optional
    ``postprocess`` callable.

    Attributes set by ``fit``:
        N: n-gram order (each context holds N - 1 token ids).
        word2id / id2word: lookup tables including '<s>', '</s>', '<unk>'.
    """

    def __init__(self, tokenizer=None, embeddings=None, punctuation=None, postprocess=None):
        """Store the preprocessing configuration.

        Args:
            tokenizer: callable mapping a string to a list of tokens. When
                omitted, ``TweetTokenizer().tokenize`` is used.
            embeddings: stored as given; not used by any method of this class.
            punctuation: iterable of tokens to drop after tokenization, or
                None to keep every token.
            postprocess: optional callable applied to the list of tokenized
                documents; its return value replaces that list.
        """
        self.tokenizer = self.default_tokenizer() if tokenizer is None else tokenizer
        self.embeddings = embeddings
        self.word2id = None
        self.id2word = None
        # Defined up front so using the builder before fit() fails clearly.
        self.N = None
        self.punctuation = set(punctuation) if punctuation is not None else None
        self.postprocess = postprocess

    def default_tokenizer(self):
        """Return the tokenize method of a fresh ``TweetTokenizer``."""
        return TweetTokenizer().tokenize

    def get_vocabulary(self):
        """Return the set of known tokens, special tokens included."""
        return set(self.word2id.keys())

    def remove_punct(self, tokenized_documents):
        """Drop every token that belongs to the configured punctuation set.

        Returns the input unchanged when no punctuation was configured.
        """
        if self.punctuation is None:
            return tokenized_documents
        return [[token for token in doc if token not in self.punctuation]
                for doc in tokenized_documents]

    def _preprocess(self, documents):
        """Lowercase, tokenize, strip punctuation and postprocess documents.

        Shared by fit() and transform() so both see identically prepared
        tokens.
        """
        tokenized_docs = [self.tokenizer(doc.lower()) for doc in documents]
        tokenized_docs = self.remove_punct(tokenized_docs)
        if self.postprocess is not None:
            tokenized_docs = self.postprocess(tokenized_docs)
        return tokenized_docs

    def _transform(self, tokenized_docs):
        """Encode tokenized documents as (contexts, targets) id lists.

        Each document is padded with N - 1 '<s>' tokens at the start and one
        '</s>' at the end. For every position from the first real token to
        '</s>', the context is the N - 1 preceding ids and the target is the
        id at that position. Tokens missing from the vocabulary map to the
        id of '<unk>'.

        Returns:
            ``(ngram_docs, ngram_targs)``: per document, a list of contexts
            (each a list of N - 1 ids) and the parallel list of target ids.

        Raises:
            RuntimeError: if called before ``fit`` has set N and word2id.
        """
        if self.N is None or self.word2id is None:
            raise RuntimeError("NGramBuilder must be fitted before transforming documents")

        N = self.N
        # Loop-invariant values: fallback id and the left padding.
        unk_id = self.word2id.get('<unk>', 0)
        left_padding = ['<s>'] * (N - 1)

        ngram_docs, ngram_targs = [], []
        for doc in tokenized_docs:
            padded = left_padding + list(doc) + ['</s>']
            ids = [self.word2id.get(word, unk_id) for word in padded]
            # Every index from the first real token up to and including '</s>'.
            positions = range(N - 1, len(ids))
            ngram_docs.append([ids[i - (N - 1): i] for i in positions])
            ngram_targs.append([ids[i] for i in positions])

        return ngram_docs, ngram_targs

    def fit(self, documents, N, t=10000):
        """Learn the vocabulary from ``documents`` and return their encoding.

        Args:
            documents: iterable of raw text strings.
            N: n-gram order; must be at least 1.
            t: maximum number of most frequent tokens kept in the vocabulary
                (special tokens are added on top of these).

        Returns:
            The ``(ngram_docs, ngram_targs)`` pair produced by ``_transform``.

        Raises:
            ValueError: if N is smaller than 1.
        """
        if N < 1:
            raise ValueError("N must be a positive integer")
        self.N = N

        tokenized_docs = self._preprocess(documents)

        # get vocabulary and word2id and ids2word dicts
        vocabulary = get_vocabulary(tokenized_docs, t)
        self.word2id, self.id2word = word2ids(vocabulary)

        return self._transform(tokenized_docs)

    def transform(self, documents):
        """Encode raw ``documents`` with the vocabulary learned by ``fit``.

        Applies the same preprocessing as ``fit`` and leaves the vocabulary
        untouched.
        """
        return self._transform(self._preprocess(documents))

    def inverse(self, docs_as_ids):
        """Map each list of ids back to tokens via ``id2word``.

        Ids missing from ``id2word`` become None.
        """
        return [list(map(self.id2word.get, doc)) for doc in docs_as_ids]

In [59]:
# Word-level run: default tokenizer, no punctuation filter, no postprocess.
# fit() learns the vocabulary from `documents` and returns, per document,
# the 3-gram contexts and their target ids.
ngram_builder = NGramBuilder()
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=3)

In [None]:
# Display the first raw document for reference.
documents[0]

In [None]:
# Decode the target ids back to tokens and display those of the second entry.
ngram_builder.inverse(ngram_labels)[1]

In [8]:
def char_postprocess(documents):
    """Split every word of every document into its characters.

    The result is a flat list with one entry per word (document boundaries
    are not kept); each entry is the list of that word's characters.
    """
    char_sequences = []
    for doc in documents:
        for word in doc:
            char_sequences.append(list(word))
    return char_sequences

In [None]:
# Try char_postprocess on two small hand-written tokenized documents.
x = char_postprocess([['hola', 'mundo'], ['doc', 'no']])

In [9]:
# Character-level run: same pipeline, but char_postprocess is applied after
# tokenization, so the sequences passed on to vocabulary building and
# n-gram encoding are lists of characters. This rebinds ngram_builder,
# ngram_docs and ngram_labels from the word-level run.
ngram_builder = NGramBuilder(postprocess=char_postprocess)
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=3)

In [10]:
# Decode the target ids back to tokens and display those of the first entry.
ngram_builder.inverse(ngram_labels)[0]

['l', 'o', '</s>']