## __Text mining y Procesamiento de Lenguaje Natural (NLP)__

__Profesor__: Anthony D. Cho

__Tema__: Análisis semántico

__Método__: Latent Semantic Analysis (Word Embedding)

***

__Dependencias__

```{python}
    python -m pip install nltk gensim spacy scikit-learn matplotlib pandas
    python -m spacy download en_core_web_sm
    python -m spacy download es_core_news_sm
```

## Librerías

In [None]:
import re
from glob import glob
import matplotlib.pyplot as plt
from pandas import DataFrame

from string import punctuation
from spacy.lang.es.stop_words import STOP_WORDS
from spacy import load

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

from sklearn.manifold import MDS

## Language model instance: loads the 'es_core_news_sm' spaCy pipeline, which
## the preprocessing step calls to tokenize and lemmatize each document
nlp = load('es_core_news_sm')

## Carga de documentos

In [None]:
## Locate every file of interest: files named doc*.txt that sit exactly one
## directory below the current working directory
path_docs = glob('*/doc*.txt')

## Parallel lists: raw text of each document and its id (file name without
## extension), stored in the same order as path_docs
corpus, doc_id = [], []

## Document loading
if path_docs:
    for file in path_docs:

        ## Read the whole text; the context manager closes the file handle
        ## even when reading or decoding fails
        with open(file, 'r', encoding='utf-8') as handle:
            text = handle.read()

        ## Store the text
        corpus.append(text)

        ## The id is the base name up to its first dot. Splitting on both
        ## '\' and '/' keeps the directory out of the id whichever path
        ## separator the operating system uses
        base_name = re.split(r'[\\/]', file)[-1]
        doc_id.append(base_name.split('.')[0])
else:
    print('No corpus have found.')

In [None]:
## Show the collected document ids (cell output)
doc_id

#### Preprocesamiento

In [None]:
## Text cleaning: cleanTexts receives one list of lemmatized tokens per
## document, in the same order as corpus
cleanTexts = []

## Loop-invariant helpers, built once instead of on every iteration
unwanted_chars = re.compile(r'["¿°\d+]')  # quotes, '¿', '°', digits and '+'
strip_punctuation = str.maketrans('', '', punctuation)
whitespace_run = re.compile(r'\s+')

for doc in corpus:

    ## Remove numbers and punctuation
    doc = unwanted_chars.sub('', doc)
    doc = doc.translate(strip_punctuation)

    ## Normalization (lowercase) and stop-word removal
    documento = nlp(doc.lower())
    tokens = [word.text for word in documento if word.text not in STOP_WORDS]
    doc = whitespace_run.sub(' ', ' '.join(tokens))

    ## Lemmatization: parse the filtered text again and keep each lemma
    documento = nlp(doc)
    lemmas = [word.lemma_ for word in documento]

    ## Store the processed content. split() with no argument collapses
    ## whitespace runs and drops leading/trailing whitespace, so no
    ## empty-string token is handed to the embedding model
    cleanTexts.append(' '.join(lemmas).split())

## Show the processed content
for line in cleanTexts:
    print(line)
    

## Modelo

In [None]:
## Model instance: fit Word2Vec embeddings on the tokenized documents
model = Word2Vec(sentences=cleanTexts, ## <- list of documents as token lists
                 vector_size=4, ## <- Embedding size
                 window=2, ## <- context window size
                 min_count=1) ## <- minimum term frequency; 1 keeps every term

In [None]:
## Extract the vocabulary list from the trained model's keyed vectors
vocabulary = model.wv.index_to_key
print(vocabulary)

In [None]:
## Latent-space representations (embedding vectors) of two terms; both terms
## are expected to be present in the vocabulary
print(model.wv['laptop'])
print(model.wv['razer'])

In [None]:
## Cosine similarity between the vectors of two terms
model.wv.similarity('laptop', 'silicio')

In [None]:
## Query the terms most similar to the given one
model.wv.most_similar('laptop')

In [None]:
## Terms similar to 'laptop' and dissimilar to 'carlos'. The keys are passed
## as lists, the documented argument form, so the query does not depend on
## the library accepting a bare string for `negative`
model.wv.most_similar(positive=['laptop'], negative=['carlos'])

In [None]:
def vectorsVisualizer(vocabulary, vectors):
    """
        DESCRIPTION:
            Draw a labelled 2D scatter plot of term vectors. Only the first
            two coordinates of every vector are plotted.

        INPUT:
            @param vocabulary: terms used as point labels; vocabulary[i]
                               labels vectors[i]
            @type vocabulary: list

            @param vectors: latent space matrix representation, one row
                            per term
            @type vectors: numpy.ndarray

        OUTPUT:
            Scattered graphic, displayed with plt.show()

    """

    ## Collect the (x, y) pair of every vector before any drawing starts
    points = [(vector[0], vector[1]) for vector in vectors]

    ## Display-space instance
    plt.figure(figsize=(12, 12))
    plt.title('Vector representation')

    ## One scatter call per term, each followed by its text label
    for index, (x_coord, y_coord) in enumerate(points):
        plt.scatter(x_coord, y_coord)
        plt.annotate(
            text=vocabulary[index],
            xy=(x_coord, y_coord),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom',
        )

    ## Axis labels, layout adjustment and rendering
    plt.xlabel('latent feature 1')
    plt.ylabel('latent feature 2')
    plt.tight_layout()
    plt.show()

In [None]:
## Terms visualizer: transform the embedding matrix with MDS (random_state is
## fixed so the layout is repeatable), then plot at most the first 50 rows
mds_2D = MDS(normalized_stress='auto', random_state=10).fit_transform(model.wv.vectors)
vectorsVisualizer(vocabulary, mds_2D[:50])

In [None]:
## Vocabulary representation in the latent space: the embedding matrix as a
## table indexed by term
vocabulary_latent = DataFrame(data=model.wv.vectors, index=vocabulary)
vocabulary_latent

In [None]:
## NOTE: despite its name, this holds 1 - cosine similarity for every pair of
## term vectors, i.e. a dissimilarity (larger value = less similar)
similitudes = 1-cosine_similarity(model.wv.vectors)
## Show the top-left 20x20 corner of that matrix as an image
plt.matshow(similitudes[:20, :20])
plt.show()