In [49]:
import nltk 

In [50]:
def load_data(filename, labels_filename):
    file = open(filename, 'r')
    labels_file = open(labels_filename, 'r')
    tweets = file.read()
    labels = labels_file.read()
    documents = tweets.split('\n')
    labels = labels.split('\n')
    return documents, labels

In [51]:
documents, labels = load_data('data/mex_train.txt', 'data/mex_train_labels.txt')
val_documents, val_labels = load_data('data/mex_val.txt', 'data/mex_val_labels.txt')
documents.pop(-1)
val_documents.pop(-1)
labels.pop(-1)
val_labels.pop(-1)

''

In [16]:
from nltk import TweetTokenizer
from nltk.corpus import stopwords

def process_documents(documents):
    # tokenize each document
    documents_tokenized = []
    tokenizer = TweetTokenizer()
    for doc in documents:
        documents_tokenized.append(tokenizer.tokenize(doc.lower()))
    return documents_tokenized

def remove_stop_words(documents):
    # build dictionary of stopwords
    stopwords_dict = {word:1 for word in stopwords.words('spanish')}
    non_stop_documents = []
    for doc in documents:
        ndoc = []
        for word in doc:
            if stopwords_dict.get(word) == None:
                ndoc.append(word)
        non_stop_documents.append(ndoc)
    
    return non_stop_documents

In [35]:
from nltk.probability import FreqDist
import numpy as np

class TCOR:
    def __init__(self):
        self.voc_index = {}
        self.T = 0
        
    def get_vocabulary(self, documents, T):
        # get vocabulary
        tokens = [token for doc in documents for token in doc]
        vocabulary = FreqDist(tokens)
        print(len(vocabulary.keys()))
        
        self.T = min(T, len(vocabulary.keys()))
        
        # get most common words
        limited_voc = vocabulary.most_common(self.T)
        
        # get index of words in matrix
        for i, word_count in enumerate(limited_voc):
            self.voc_index[word_count[0]] = i
    
    def build_matrix(self, documents, window_size, T=5000, voc_index=None, mode='train'):
        # get most common terms - training mode
        if mode == 'train':
            if voc_index==None:
                self.get_vocabulary(documents, T) #use most common words as vocabulary
            else:
                # use vocabulary index sent as parameter. Usefull when performing a features reduction or working with n-grams
                self.voc_index = voc_index
                self.T = len(voc_index.keys())
        
        term_matrix = self.frequency(documents, window_size)
        return term_matrix
    
    def frequency(self, documents, window_size):
        term_matrix = np.zeros((self.T, self.T))
        
        # tf scheme
        for doc in documents:
            for c, center in enumerate(doc):
                start, end = max(0, c-window_size), min(len(doc), c + window_size + 1)
                context_words = doc[start:c] + doc[c+1:end]
                context_index = [j for j in map(self.voc_index.get, context_words) if j != None]
                i  = self.voc_index.get(center)
                if i == None:
                    continue
                    
                for j in context_index:
                    term_matrix[i, j] += 1
        
        return term_matrix

In [None]:
from nltk.probability import FreqDist
import numpy as np

class RandomIndex:
    def __init__(self):
        self.voc_index = {}
        self.T = 0
        
    def get_vocabulary(self, documents, T):
        # get vocabulary
        tokens = [token for doc in documents for token in doc]
        vocabulary = FreqDist(tokens)
        print(len(vocabulary.keys()))
        
        self.T = min(T, len(vocabulary.keys()))
        
        # get most common words
        limited_voc = vocabulary.most_common(self.T)
        
        # get index of words in matrix
        for i, word_count in enumerate(limited_voc):
            self.voc_index[word_count[0]] = i
    
    def build_matrix(self, documents, window_size, K, N1, T=5000, voc_index=None, mode='train'):
        # get most common terms - training mode
        if mode == 'train':
            if voc_index==None:
                self.get_vocabulary(documents, T) #use most common words as vocabulary
            else:
                # use vocabulary index sent as parameter. Usefull when performing a features reduction or working with n-grams
                self.voc_index = voc_index
                self.T = len(voc_index.keys())
        
        random_matrix = self.init_vector(K, N1)
        term_matrix = self.frequency(documents, window_size, random_matrix)
        return term_matrix
    
    
    def init_vector(self, K, N1):
        context_matrix = np.zeros((self.T, K))
        values = [1 for _ in range(N1)] + [-1 for _ in range(N1)]
        indexes = [np.random.choice(K), size=N1*2, replace=False) for _ in range(self.T)]
        
        for i in range(self.T):
            context_matrix[i, indexes[i]] = values
        
        return context_matrix
    
    def frequency(self, documents, window_size, random_v):
        term_matrix = np.zeros((self.T, self.T))
        for i in len(term_matrix):
            term_matrix[i, np.random.choice(len(zeros), 6)] = values
        
        # tf scheme
        for doc in documents:
            for c, center in enumerate(doc):
                start, end = max(0, c-window_size), min(len(doc), c + window_size + 1)
                context_words = doc[start:c] + doc[c+1:end]
                context_index = [j for j in map(self.voc_index.get, context_words) if j != None]
                i  = self.voc_index.get(center)
                if i == None:
                    continue
                    
                for j in context_index:
                    term_matrix[i] += random_v[j]
        
        return term_matrix

In [36]:
documents = []
documents.append('hola como estas, hoy voy a comer pizza')
documents.append('hoy voy a comer pizza con mis amigos')
documents.append('hola, quiero la receta del pie de manzana')
documents.append('te gusta el pie de manzana')
documents.append('juan fue a comer pizza')

documents = process_documents(documents)

In [37]:
tcor = TCOR()

In [46]:
matrix = tcor.build_matrix(documents, 1, T=100)

24


In [None]:
from nltk import TweetTokenizer
from nltk.probability import FreqDist
import numpy as np

class BoWBuilder:
    # UTILITIES
    def get_dimensions(self):
        return self.voc_index.keys()
    
    # INIT FUNCTIONS
    def __init__(self):
        # attributes
        self.voc_index = {} 
        self.train_idf = None
        self.T = 0
        
    def get_vocabulary(self, documents, T):
        # get vocabulary
        tokens = [token for doc in documents for token in doc]
        vocabulary = FreqDist(tokens)
        
        self.T = np.min(T, len(vocabulary.keys()))
        
        # get most common words
        limited_voc = vocabulary.most_common(T)
        self.voc_index = {}
        
        # get index of words in matrix
        for i, word_count in enumerate(limited_voc):
            self.voc_index[word_count[0]] = i
    
    
    # BUILD BOW MATRIX
    def build_bow(self, documents, T=5000, voc_index=None, mode='train', weight_scheme='binary', normalize=False):
        # get most common terms - training mode
        if mode == 'train':
            if voc_index==None:
                self.get_vocabulary(documents, T) #use most common words as vocabulary
            else:
                # use vocabulary index sent as parameter. Usefull when performing a features reduction or working with n-grams
                self.voc_index = voc_index
                self.T = len(voc_index.keys())
        
        # use train_idf, testing mode
        use_train_idf = mode != 'train'
        
        # get weights for matrix
        if weight_scheme == 'tf':
            bow = self.frequency_bow(documents)
        elif weight_scheme == 'tf-idf':
            # if documents!= None, use existing idf weights (val or test mode)
            bow = self.frequency_bow(documents, use_idf=True, use_train_idf=use_train_idf)
        else:
            bow = self.binary_bow(documents)
        
        # normalize if necessary
        if normalize:
            norm = np.linalg.norm(bow, axis=1)
            # Add 1 if norm == 0 to avoid division by 0. --  Increase 1 dimension for broadcast 
            bow = bow / (norm + (norm==0 + 0.0))[:, np.newaxis]
        
        return bow
            
    # WEIGHT SCHEMES
    def binary_bow(self, documents):
        N = len(documents)
        T = self.T
        
        bow = np.zeros((N, T))
        for i, doc in enumerate(documents):
            for word in doc:
                j = self.voc_index.get(word)
                if j != None:
                    bow[i, j] = 1 
        
        return bow
    
    def frequency_bow(self, documents, use_idf=False, use_train_idf=False):
        N = len(documents)
        T = self.T
        bow = np.zeros((N, T))
        
        # tf scheme
        for i, doc in enumerate(documents):
            for word in doc:
                j = self.voc_index.get(word)
                if j != None:
                    bow[i, j] += 1 
        
        # tf-idf scheme
        if use_idf:
            if not use_train_idf:
                # calculate idf for first time (training mode)
                self.train_idf = np.sum(bow>0, axis=0)
            
            bow = np.log(bow + 1) * np.log(N/self.train_idf)
            
        return bow        

# Pruebas

In [12]:
d = {1: 2, 2:4, 3:6}

l = [1, 3, 4]

l2 = [v for v in map(d.get, l) if v != None]



In [14]:
for c, v in enumerate(l2):
    print(l2[c], v)

2 2
6 6


In [202]:
zeros = np.zeros((5,15))
values = [1 for _ in range(2)] + [-1 for _ in range(2)]


In [138]:
values

[1, 1, 1, -1, -1, -1]

In [203]:
zeros

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [198]:
indexes = [np.random.choice(len(zeros[0]), size=4, replace=False) for _ in range(5)]

In [199]:
indexes

[array([ 3,  7,  4, 13]),
 array([ 8,  1, 12, 14]),
 array([13,  0,  7,  9]),
 array([ 0, 12, 14, 13]),
 array([4, 2, 1, 5])]

In [207]:
zeros[:, indexes[:]] = values

SyntaxError: invalid syntax (3080696995.py, line 1)

In [230]:
T = 2000
K = 2048
N1 = 8
context_matrix = np.zeros((T, K))
values = [1 for _ in range(N1)] + [-1 for _ in range(N1)]
indexes = [np.random.choice(K, size=N1*2, replace=False) for _ in range(T)]
        
for i in range(T):
    context_matrix[i, indexes[i]] = values
        

In [231]:
s_ = 0
for x in context_matrix:
    for y in context_matrix:
        s_ += np.dot(x, y)

In [232]:
s_/(1000*1000)

0.032098