<h2>Documentos</h2>

In [81]:
import nltk
from nltk.corpus import stopwords
import math


documents = ["Human machine interface for lab abc computer applications", "A survey of user opinion of computer system response time",
             "The EPS user interface management system","System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement","The generation of random binary unordered trees",
             "The intersection graph of paths in trees","Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey", "Human machine interface for machine learning applications"]

In [82]:
#Etapa de procesamiento ¿Que se hace?
tok_docs = [] #Documentos procesados
vocab = [] #Vocabulario, muy importante!

stop_words = set(stopwords.words('english')) #Lista en Ingles
for doc in documents:
    word_tok = nltk.word_tokenize(doc)
    filtered_sentence = [] 
    for w in word_tok: 
        w = w.lower()
        if w not in stop_words: 
            filtered_sentence.append(w)
            vocab.append(w)
    tok_docs.append(filtered_sentence)
vocab = set(vocab)

In [83]:
#Vocabulario y documentos despues de procesamiento
print(vocab)
print(len(vocab))
print(tok_docs)

{'generation', 'quasi', 'error', 'unordered', 'paths', 'abc', 'widths', 'minors', 'graph', 'ordering', 'binary', 'survey', 'relation', 'measurement', 'perceived', 'random', 'human', 'response', 'management', 'computer', 'well', 'trees', 'lab', 'opinion', 'time', 'user', 'iv', 'interface', 'eps', 'machine', 'applications', 'testing', 'intersection', 'engineering', 'system', 'learning'}
36
[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'], ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'management', 'system'], ['system', 'human', 'system', 'engineering', 'testing', 'eps'], ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'], ['generation', 'random', 'binary', 'unordered', 'trees'], ['intersection', 'graph', 'paths', 'trees'], ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'survey'], ['human', 'machine', 'interface', 'machine', 'learni

In [84]:
###Esta funcion crea el diccionario TF de cada documento
def computeDocsTFDict(doc):
    """ Retorna un diccionario de frecuencias  
    con las palabras unicas del documento.
    """
    #Counts the number of times the word appears in review
    TFDict = {}
    for word in doc:
        if word in TFDict:
            TFDict[word] += 1
        else:
            TFDict[word] = 1
    #Computes tf for each word           
    for word in TFDict:
        TFDict[word] = TFDict[word] / len(doc)
    return TFDict

In [85]:
tfDict = [] #En este diccionario se va a almacenar el conteo
for doc in tok_docs:
    tfDict.append(computeDocsTFDict(doc))
tfDict[0] #El indice entre tfDict y tok_docs corresponden a los mismos documentos

{'human': 0.14285714285714285,
 'machine': 0.14285714285714285,
 'interface': 0.14285714285714285,
 'lab': 0.14285714285714285,
 'abc': 0.14285714285714285,
 'computer': 0.14285714285714285,
 'applications': 0.14285714285714285}

<h2>Calculo de IDF</h2>

In [86]:
#Para el calculo de IDF necesitamos conocer primero cuantas veces una palabra aparace en los documentos del corpus
def computeWordCountDict(tfDict):
    """ 
    Devuelve un diccionario cuyos indices son todas las palabras unicas en el conjunto de datos y cuyos valores cuentan el numero de
    documentos en las que aparece la palabra
    """
    countDict = {}

    for doc in tfDict:
        for word in doc:
            if word in countDict:
                countDict[word] += 1
            else:
                countDict[word] = 1
    return countDict

countDict = computeWordCountDict(tfDict)
countDict

{'human': 3,
 'machine': 2,
 'interface': 3,
 'lab': 1,
 'abc': 1,
 'computer': 2,
 'applications': 2,
 'survey': 2,
 'user': 3,
 'opinion': 1,
 'system': 3,
 'response': 2,
 'time': 2,
 'eps': 2,
 'management': 1,
 'engineering': 1,
 'testing': 1,
 'relation': 1,
 'perceived': 1,
 'error': 1,
 'measurement': 1,
 'generation': 1,
 'random': 1,
 'binary': 1,
 'unordered': 1,
 'trees': 3,
 'intersection': 1,
 'graph': 3,
 'paths': 1,
 'minors': 2,
 'iv': 1,
 'widths': 1,
 'well': 1,
 'quasi': 1,
 'ordering': 1,
 'learning': 1}

In [87]:
#Para el calculo de IDF necesitamos conocer primero cuantas veces una palabra aparace en los documentos del corpus
def computeIDFDict(countDict):
    """ Devuelve un diccionario cuyos indices son palabras 
        y sus valores son el idf correspondiente.
    """
    idfDict = {}
    for word in countDict:
        idfDict[word] = math.log(len(documents) / countDict[word])
    return idfDict
  
idfDict = computeIDFDict(countDict)
idfDict

{'human': 1.2039728043259361,
 'machine': 1.6094379124341003,
 'interface': 1.2039728043259361,
 'lab': 2.302585092994046,
 'abc': 2.302585092994046,
 'computer': 1.6094379124341003,
 'applications': 1.6094379124341003,
 'survey': 1.6094379124341003,
 'user': 1.2039728043259361,
 'opinion': 2.302585092994046,
 'system': 1.2039728043259361,
 'response': 1.6094379124341003,
 'time': 1.6094379124341003,
 'eps': 1.6094379124341003,
 'management': 2.302585092994046,
 'engineering': 2.302585092994046,
 'testing': 2.302585092994046,
 'relation': 2.302585092994046,
 'perceived': 2.302585092994046,
 'error': 2.302585092994046,
 'measurement': 2.302585092994046,
 'generation': 2.302585092994046,
 'random': 2.302585092994046,
 'binary': 2.302585092994046,
 'unordered': 2.302585092994046,
 'trees': 1.2039728043259361,
 'intersection': 2.302585092994046,
 'graph': 1.2039728043259361,
 'paths': 2.302585092994046,
 'minors': 1.6094379124341003,
 'iv': 2.302585092994046,
 'widths': 2.302585092994046,


<h2>Calculo de TF-IDF</h2>

In [88]:
def computeDocsTFIDFDict(TFDict,idfDict):
    """ Devuelve un diccionario cuyas claves son todas las palabras unicas en
    la revision y cuyos valores son su tfidf correspondiente.
    """
    TFIDFDict = {}
    for word in TFDict:
        TFIDFDict[word] = TFDict[word] * idfDict[word]
    return TFIDFDict

tfidfDict = [computeDocsTFIDFDict(doc,idfDict) for doc in tfDict]
tfidfDict[0] #El indice entre tfidfDict y tok_docs corresponden a los mismos documentos

{'human': 0.17199611490370514,
 'machine': 0.22991970177630003,
 'interface': 0.17199611490370514,
 'lab': 0.32894072757057796,
 'abc': 0.32894072757057796,
 'computer': 0.22991970177630003,
 'applications': 0.22991970177630003}

<h2>Vectorizacion</h2>

In [89]:
#Vamos a utilizar el vocabulario alamacenado en vocab como dimensiones
def computeTFIDFVector(doc):
    tfidfVector = [0.0] * len(vocab) #Vector del tamano del vocabulario
    # Para cada palabra unica, si esta en el documento, se almacena su valor TF-IDF.
    for i, word in enumerate(vocab):
        if word in doc:
            tfidfVector[i] = doc[word]
    return tfidfVector

tfidfVector = [computeTFIDFVector(doc) for doc in tfidfDict]
tfidfVector[0] #Vector que representa el primer documento

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.32894072757057796,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.17199611490370514,
 0.0,
 0.0,
 0.22991970177630003,
 0.0,
 0.0,
 0.32894072757057796,
 0.0,
 0.0,
 0.0,
 0.0,
 0.17199611490370514,
 0.0,
 0.22991970177630003,
 0.22991970177630003,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

<h2>Similitud Coseno</h2>

In [90]:
def dot_product(vector_x, vector_y):
    dot = 0.0
    for e_x, e_y in zip(vector_x, vector_y):
       dot += e_x * e_y
    return dot

def magnitude(vector):
    mag = 0.0
    for index in vector:
        mag += math.pow(index, 2)
    return math.sqrt(mag)

In [91]:
doc_similarity_0_1 = dot_product(tfidfVector[0], tfidfVector[1])/ magnitude(tfidfVector[0]) * magnitude(tfidfVector[1])
doc_similarity_0_1

0.04937924273636205

<h3>Tarea: Buscar los documentos mas similares entre si en nuestro corpus</h3>