*Karen Tatiana Zamudio Quintero*

<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Word2vect


In [None]:
import pandas as pd
import numpy as np
import collections
import re

!pip3 install --upgrade pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas
  Downloading pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.4.4
    Uninstalling pandas-1.4.4:
      Successfully uninstalled pandas-1.4.4
Successfully installed pandas-1.5.3


In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        One-dimensional numeric vectors of the same length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        cosine is undefined; ``0.0`` is returned instead of dividing by zero
        (which would yield NaN and a runtime warning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

### Datos

In [None]:
# Toy corpus: each list entry is one document (a short Spanish sentence).
corpus = [
    'que dia es hoy',
    'martes el dia de hoy es martes',
    'martes muchas gracias',
]

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [None]:
# Toy corpus used for the vocabulary exercise: one document per list entry.
corpus = [
    'que dia es hoy',
    'martes el dia de hoy es martes',
    'martes muchas gracias',
]

In [None]:
# Turn every document into its list of terms (split on single spaces).
bagOfWords = [str.split(document, " ") for document in corpus]
bagOfWords

[['que', 'dia', 'es', 'hoy'],
 ['martes', 'el', 'dia', 'de', 'hoy', 'es', 'martes'],
 ['martes', 'muchas', 'gracias']]

In [None]:
# Accumulate the distinct terms of the whole corpus, one document at a time.
words_set = set()

for document in corpus:
    words_set = words_set | set(document.split(' '))

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus: \n', words_set)

Number of words in the corpus: 9
The words in the corpus: 
 {'martes', 'que', 'es', 'de', 'el', 'dia', 'hoy', 'muchas', 'gracias'}


### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [None]:
# Toy corpus used for the one-hot exercise: one document per list entry.
corpus = [
    'que dia es hoy',
    'martes el dia de hoy es martes',
    'martes muchas gracias',
]

In [None]:
# Accumulate the distinct terms of the whole corpus, one document at a time.
words_set = set()

for document in corpus:
    words_set = words_set | set(document.split(' '))

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus: \n', words_set)

In [None]:
# Corpus vocabulary: every distinct term, sorted so the column order is
# reproducible from run to run.
vocabulary = sorted({term for doc in corpus for term in doc.split()})

# Resolve each term's column once instead of scanning the list for every token.
term_to_column = {term: j for j, term in enumerate(vocabulary)}

# One-hot (binary presence) encoding: one row per document, one column per term.
one_hot_array = np.zeros((len(corpus), len(vocabulary)), dtype=np.int32)
for i, doc in enumerate(corpus):
    for term in doc.split():
        one_hot_array[i, term_to_column[term]] = 1

one_hot_array

array([[0, 1, 0, 1, 0, 1, 0, 0, 1],
       [1, 1, 1, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 0]], dtype=int32)

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [None]:
# Toy corpus used for the term-frequency exercise: one document per list entry.
corpus = [
    'que dia es hoy',
    'martes el dia de hoy es martes',
    'martes muchas gracias',
]

In [None]:
# Accumulate the distinct terms of the whole corpus, one document at a time.
words_set = set()

for document in corpus:
    words_set = words_set | set(document.split(' '))

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus: \n', words_set)

Number of words in the corpus: 9
The words in the corpus: 
 {'martes', 'que', 'es', 'de', 'el', 'dia', 'hoy', 'muchas', 'gracias'}


In [None]:
n_docs = len(corpus)          # number of documents in the corpus
n_words_set = len(words_set)  # number of distinct terms in the corpus

# A set has no defined order and recent pandas versions refuse it as
# `columns`, so freeze it into a list before building the frame.
tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))

# Compute Term Frequency (TF): count how often each term occurs per document.
for i in range(n_docs):
    words = corpus[i].split(' ')
    for w in words:
        # Single .loc write; the chained form `tf[w][i] = ...` may end up
        # modifying a temporary copy instead of `tf`.
        tf.loc[i, w] += 1

tf

Unnamed: 0,martes,que,es,de,el,dia,hoy,muchas,gracias
0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,2.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TFIDF

In [None]:
# Toy corpus used for the TF-IDF exercise: one document per list entry.
corpus = [
    'que dia es hoy',
    'martes el dia de hoy es martes',
    'martes muchas gracias',
]

In [None]:
# Accumulate the distinct terms of the whole corpus, one document at a time.
words_set = set()

for document in corpus:
    words_set = words_set | set(document.split(' '))

print('Número de palabras en el corpus:', len(words_set))
print('Palabras en el corpus: \n', list(words_set))

Número de palabras en el corpus: 9
Palabras en el corpus: 
 ['martes', 'que', 'es', 'de', 'el', 'dia', 'hoy', 'muchas', 'gracias']


In [None]:
n_docs = len(corpus)          # number of documents in the corpus
n_words_set = len(words_set)  # number of distinct terms in the corpus

# A set has no defined order and recent pandas versions refuse it as
# `columns`, so freeze it into a list before building the frame.
tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))

# Compute Term Frequency (TF): count how often each term occurs per document.
for i in range(n_docs):
    words = corpus[i].split(' ')  # words in the document
    for w in words:
        # Single .loc write; the chained form `tf[w][i] = ...` may end up
        # modifying a temporary copy instead of `tf`.
        tf.loc[i, w] += 1

tf

Unnamed: 0,martes,que,es,de,el,dia,hoy,muchas,gracias
0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,2.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [None]:
print("IDF of: ")

idf = {}

for w in words_set:
    # k = number of documents in the corpus that contain this term.
    # Split on ' ' exactly as words_set was built, so every term is found in
    # at least one document and the division below never hits k == 0.
    k = sum(1 for doc in corpus if w in doc.split(' '))

    idf[w] = np.log10(n_docs / k)

    print(f'{w:>15}: {idf[w]:>10}')

IDF of: 
         martes: 0.17609125905568124
            que: 0.47712125471966244
             es: 0.17609125905568124
             de: 0.47712125471966244
             el: 0.47712125471966244
            dia: 0.17609125905568124
            hoy: 0.17609125905568124
         muchas: 0.47712125471966244
        gracias: 0.47712125471966244


In [None]:
# TF-IDF: scale each term's frequency column by that term's IDF weight.
tf_idf = tf.copy()

for w in words_set:
    # Whole-column assignment; the chained form `tf_idf[w][i] = ...` may end
    # up writing to a temporary copy instead of `tf_idf`.
    tf_idf[w] = tf[w] * idf[w]

tf_idf

Unnamed: 0,martes,que,es,de,el,dia,hoy,muchas,gracias
0,0.0,0.477121,0.176091,0.0,0.0,0.176091,0.176091,0.0,0.0
1,0.352183,0.0,0.176091,0.477121,0.477121,0.176091,0.176091,0.0,0.0
2,0.176091,0.0,0.0,0.0,0.0,0.0,0.0,0.477121,0.477121


### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        One-dimensional numeric vectors of the same length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        cosine is undefined; ``0.0`` is returned instead of dividing by zero
        (which would yield NaN and a runtime warning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

In [None]:
def compare_documents(corpus, idx):
  """Rank the documents of ``corpus`` by TF-IDF cosine similarity to ``corpus[idx]``.

  Each document is split on whitespace, turned into a TF-IDF vector
  (raw term count times ``log10(n_docs / document_frequency)``) and compared
  with the vector of the document at position ``idx``.

  Parameters
  ----------
  corpus : sequence of str
      The documents to compare.
  idx : int
      Position in ``corpus`` of the reference document.

  Returns
  -------
  numpy.ndarray
      The documents of ``corpus`` ordered from most to least similar to
      ``corpus[idx]``. Documents with equal similarity keep their original
      relative order.

  Raises
  ------
  IndexError
      If ``idx`` is not a valid position in ``corpus``.
  """
  # Tokenise once, always on arbitrary whitespace. Mixing split(' ') and
  # split() would create an empty-string "term" for double spaces that is
  # never found again, giving a document frequency of 0 and a division by zero.
  tokenized = [doc.split() for doc in corpus]
  vocabulary = sorted({term for terms in tokenized for term in terms})
  column_of = {term: j for j, term in enumerate(vocabulary)}
  n_doc = len(corpus)

  # Term frequency: rows are documents, columns follow the sorted vocabulary.
  tf = np.zeros((n_doc, len(vocabulary)))
  for i, terms in enumerate(tokenized):
    for term in terms:
      tf[i, column_of[term]] += 1

  # Document frequency is >= 1 for every column because the vocabulary is
  # built from these same tokens.
  doc_freq = (tf > 0).sum(axis=0)
  idf = np.log10(n_doc / doc_freq)
  tf_idf = tf * idf

  # Cosine similarity of every row against the reference row.
  norms = np.linalg.norm(tf_idf, axis=1)
  denom = norms * norms[idx]
  dots = tf_idf @ tf_idf[idx]

  # A zero-norm vector (e.g. every term occurs in all documents, so idf == 0)
  # has no defined cosine; score it 0 instead of producing NaN.
  similarity = np.zeros(n_doc)
  valid = denom > 0
  similarity[valid] = dots[valid] / denom[valid]

  # Stable sort so ties are returned in their original corpus order.
  order = np.argsort(-similarity, kind='stable')
  return np.array(corpus)[order]



In [None]:
# Toy corpus used for the document-comparison exercise: one document per entry.
corpus = [
    'que dia es hoy',
    'martes el dia de hoy es martes',
    'martes muchas gracias',
]

In [None]:
# Rank every document by TF-IDF cosine similarity to the first one (index 0).
compare_documents(corpus, 0)

array(['que dia es hoy', 'martes el dia de hoy es martes',
       'martes muchas gracias'], dtype='<U30')