In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# 1.- First, clean the text and split it into sentences

In [2]:
# Read the whole essay file into one string.
# NOTE(review): no explicit encoding is given, so the platform default is used —
# confirm it matches the file (the text contains non-ASCII characters).
with open('ensayos.txt') as essay_file:
    ff = essay_file.read()

In [3]:
# Every distinct character that occurs in the text, in sorted order.
elements = sorted(set(ff))
# Non-alphanumeric characters seen: ' ','!','(',')','*','-','.',':',';','?','¡','ª','¿','’'
# r'(!|(|)|*|-|.|:|?|’|;|¡|ª|¿|,|\s)\s*'

In [4]:
# Strip unwanted sequences one after another, then cut the text at every full stop.
# NOTE(review): '\\r' is the two-character sequence backslash + 'r', not a carriage
# return — confirm this is what the input file actually contains.
cleaned_text = ff
for unwanted in ('\\r', '\n', '»', '«'):
    cleaned_text = cleaned_text.replace(unwanted, '')
sentences = cleaned_text.strip().split('.')

In [5]:
# Single characters that end the current word. Each delimiter is also kept as a
# token of its own (spaces and empty strings are dropped when `corpus` is built).
# The former '\s' and '\s*' entries were removed: they are invalid escape
# sequences in a normal string literal (a SyntaxWarning on recent Python) and,
# being longer than one character, could never match a single character anyway.
symbols = [' ','!','(',')','-','.',':','?','’',';','¡','ª','¿',',','\n']
list_of_strings = []

for sentence in sentences:
    string = []            # characters of the word currently being assembled
    list_of_elements = []  # tokens collected so far for this sentence

    for element in sentence:
        if element in symbols:
            # A delimiter closes the word in progress (possibly empty) and is
            # then stored as a separate token.
            list_of_elements.append(''.join(string))
            string = []
            list_of_elements.append(element)
        else:
            # Any other character extends the word in progress.
            string.append(element)

    # Flush whatever word was still being built when the sentence ended.
    list_of_elements.append(''.join(string))
    list_of_strings.append(list_of_elements)

In [6]:
# Keep, for every sentence, only the tokens that are neither a single space nor
# an empty string.
corpus = [
    [token for token in tokens if token not in (' ', '')]
    for tokens in list_of_strings
]

In [7]:
# Number of entries in the corpus: one token list per '.'-separated piece of the text.
len(corpus)

196

# 2.- Set of functions

### Vocabulary: all distinct tokens in the corpus

In [8]:
def wordDictionary(corpus):
    """Return the sorted vocabulary of a tokenised corpus and its size.

    Parameters
    ----------
    corpus : iterable of iterables
        Each inner iterable holds the tokens of one sentence. Tokens must be
        hashable and mutually comparable so they can be de-duplicated and sorted.

    Returns
    -------
    different_words : list
        Every distinct token found in ``corpus``, in sorted order.
    num_words : int
        Number of distinct tokens, i.e. ``len(different_words)``.
    """
    # A set comprehension de-duplicates directly instead of building a
    # throwaway list of ``None`` values just for the ``add`` side effect.
    different_words = sorted({word for sentence in corpus for word in sentence})

    return different_words, len(different_words)

### Co-occurrence matrix

In [9]:
def co_ocurrenceMatrix(corpus, window = 5):
    """Count how often each pair of words appears close together.

    Parameters
    ----------
    corpus : list of list
        Tokenised sentences; windows never cross a sentence boundary.
    window : int, default 5
        How many positions to the left and to the right of the centre word
        are treated as its context.

    Returns
    -------
    M : numpy.ndarray
        Square array with one row and one column per distinct word. Rows are
        centre words, columns are context words, and each cell holds the
        number of times that context word fell inside the centre word's window.
    word_to_idx : dict
        Maps each word to its row/column index in ``M`` (sorted vocabulary order).
    """
    vocabulary, vocab_size = wordDictionary(corpus)

    # Index lookup so a word can be turned into its row/column number directly.
    word_to_idx = {word: position for position, word in enumerate(vocabulary)}

    M = np.zeros((vocab_size, vocab_size))

    for tokens in corpus:
        sentence_length = len(tokens)
        for center, center_word in enumerate(tokens):
            row = word_to_idx[center_word]
            # Clip the window to the sentence so nothing outside it is counted.
            first = max(0, center - window)
            last = min(sentence_length, center + window + 1)
            for neighbour in range(first, last):
                # The centre word is not its own context.
                if neighbour != center:
                    M[row, word_to_idx[tokens[neighbour]]] += 1

    return M, word_to_idx

### Reduce the dimensionality of the matrix with truncated SVD

In [10]:
def reducingDim(M, dim = 2, n_iter = 10, random_state = None):
    """Project the rows of ``M`` onto ``dim`` dimensions with truncated SVD.

    Parameters
    ----------
    M : array-like
        Matrix to reduce, e.g. the co-occurrence matrix (one row per word).
    dim : int, default 2
        Number of components to keep.
    n_iter : int, default 10
        Number of iterations passed to the randomized SVD solver. Previously
        hard-coded to 10 (next to an unused ``n_iters`` local).
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``TruncatedSVD``. Pass an int to get the same
        embedding on every run; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    M_reduced_dim : numpy.ndarray
        The transformed matrix, one row per row of ``M`` and ``dim`` columns.
    """
    svd = TruncatedSVD(n_components=dim, algorithm='randomized',
                       n_iter=n_iter, random_state=random_state)
    # Kept as fit() followed by transform() so the returned values are computed
    # exactly as before.
    svd.fit(M)
    M_reduced_dim = svd.transform(M)

    return M_reduced_dim
        

### Plotting word vectors in the reduced dimensional space

In [14]:
def plotting(M_reduced_dim, word_to_idx, represent_words):
    """Scatter-plot a chosen subset of words and save the figure.

    Only the words listed in ``represent_words`` are drawn: plotting the whole
    vocabulary would make the figure unreadable.

    Parameters
    ----------
    M_reduced_dim : array-like
        Reduced word vectors; columns 0 and 1 are used as x and y.
    word_to_idx : dict
        Maps each word to its row index in ``M_reduced_dim``.
    represent_words : iterable
        Words to draw. Words missing from ``word_to_idx`` are ignored.

    Side effects
    ------------
    Creates a new matplotlib figure and, if at least one word was drawn,
    writes it to ``testplot.png``.
    """
    # Set membership keeps the per-word check cheap for long word lists.
    selected = set(represent_words)

    plt.figure(figsize=(40,25))
    plt.axis([0,20,-4,4])

    drew_something = False
    for k, v in word_to_idx.items():
        if k in selected:
            x, y = M_reduced_dim[v,0], M_reduced_dim[v,1]
            plt.scatter(x, y)
            plt.annotate(k, (x, y))
            drew_something = True

    # Save once, after every point is on the figure, instead of rewriting the
    # file after each individual word.
    if drew_something:
        plt.savefig('testplot.png')


# 3.- Application of the algorithm on the loaded corpus

In [None]:
# Sorted vocabulary of the corpus; a slice of it selects the words to plot
different_words, num_words = wordDictionary(corpus)

# Co-occurrence counts; `window` sets how many neighbours on each side are counted (default 5)
M, word_to_idx = co_ocurrenceMatrix(corpus, window=5)
# Reduced word vectors; `dim` sets the number of dimensions kept (default 2)
M_reduced_dim = reducingDim(M, dim=2)

# Plotting the selected words in the vector space
plotting(M_reduced_dim, word_to_idx, different_words[100:400])

# Rescale (normalize) the rows to make them each of unit-length
#M_lengths = np.linalg.norm(M_reduced_dim, axis=1)
#M_normalized = M_reduced_dim / M_lengths[:, np.newaxis] # broadcasting

#plotting(M_normalized,word_to_idx,different_words[100:300])