<img src='https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQ-VfNtOyJbsaxu43Kztf_cv1mgBG6ZIQZEVw&usqp=CAU'>

# Procesamiento de Lenguage Natural

## Taller #8: Modelado de temas
`Fecha de entrega: 🎃 Octubre 31, 2020. (Antes del inicio de la próxima clase).`

`Modo de entrega: Subir link de GitHub al aula virtual.`

In [18]:
import re
import pandas as pd 
from pprint import pprint

from nltk.corpus import stopwords
stopwords = stopwords.words('spanish')

import pyLDAvis.gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [19]:
# Cargar datos
path = 'archivos/reviews_vidjew_es.csv'
data = pd.read_csv(path)
data.head()

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,es_0825565,product_es_0370490,reviewer_es_0174781,3,"Buen. Buena calidad, y buena presentación.",Contenta,es,jewelry
1,es_0227934,product_es_0354224,reviewer_es_0411613,3,"Un producto a perfecto, para salir de casa con...",Versatilidad,es,video_games
2,es_0468601,product_es_0665460,reviewer_es_0348315,1,No funciona con Nintendo Switch. No hay forma ...,Decepción absoluta,es,video_games
3,es_0814494,product_es_0692692,reviewer_es_0951508,5,"Recomendado, los utilizo para pc y no me dan n...",Auriculares Pecham ps4,es,video_games
4,es_0206329,product_es_0728826,reviewer_es_0493255,4,El cable funciona bien podria ser un poco mas ...,Perfecto,es,video_games


###  `[12 pts]` Punto 1: Hacer pre-procesamiento del texto

In [20]:
def pre_procesado(texto):
    texto = texto.lower()
    texto = re.sub(r"[\W\d_]+", " ", texto)
    texto = [palabra for palabra in texto.split() if palabra not in stopwords]
    return texto

data['pp'] = data['review_body'].apply(lambda texto: pre_procesado(texto))

data.head()

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category,pp
0,es_0825565,product_es_0370490,reviewer_es_0174781,3,"Buen. Buena calidad, y buena presentación.",Contenta,es,jewelry,"[buen, buena, calidad, buena, presentación]"
1,es_0227934,product_es_0354224,reviewer_es_0411613,3,"Un producto a perfecto, para salir de casa con...",Versatilidad,es,video_games,"[producto, perfecto, salir, casa, nintendo, sw..."
2,es_0468601,product_es_0665460,reviewer_es_0348315,1,No funciona con Nintendo Switch. No hay forma ...,Decepción absoluta,es,video_games,"[funciona, nintendo, switch, forma, emparejarl..."
3,es_0814494,product_es_0692692,reviewer_es_0951508,5,"Recomendado, los utilizo para pc y no me dan n...",Auriculares Pecham ps4,es,video_games,"[recomendado, utilizo, pc, dan, ningún, proble..."
4,es_0206329,product_es_0728826,reviewer_es_0493255,4,El cable funciona bien podria ser un poco mas ...,Perfecto,es,video_games,"[cable, funciona, bien, podria, ser, mas, larg..."


###  `[13 pts]` Punto 2: Modelo de LDA

In [21]:
# Crear una representación de los documentos en forma de diccionario
dictionary = Dictionary(data['pp'].values)

# Filtrar palabras muy frecuentes o infrecuentes
dictionary.filter_extremes(no_below=5, no_above=0.5)

corpus = [dictionary.doc2bow(text) for text in data['pp'].values]

#Train the topic model
model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=6, passes=50)

###  `[25 pts]` Punto 3: Visualización de LDA

In [29]:
lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)
# pyLDAvis.save_html(lda_display, 'lda.html')

# Más

In [23]:
model.print_topics(num_words=10)

[(0,
  '0.037*"producto" + 0.033*"pulsera" + 0.032*"perfecto" + 0.026*"precio" + 0.022*"buen" + 0.020*"ve" + 0.019*"regalo" + 0.016*"tiempo" + 0.016*"bonita" + 0.015*"llegó"'),
 (1,
  '0.048*"calidad" + 0.039*"precio" + 0.038*"buena" + 0.029*"mando" + 0.026*"si" + 0.017*"ps" + 0.015*"cumple" + 0.013*"original" + 0.012*"consola" + 0.012*"solo"'),
 (2,
  '0.042*"bonito" + 0.036*"calidad" + 0.034*"bastante" + 0.026*"bien" + 0.017*"foto" + 0.016*"colgante" + 0.016*"mas" + 0.016*"parece" + 0.016*"cierre" + 0.015*"pequeño"'),
 (3,
  '0.064*"juego" + 0.018*"bien" + 0.017*"producto" + 0.014*"llegado" + 0.014*"viene" + 0.012*"aunque" + 0.012*"amazon" + 0.012*"español" + 0.012*"precio" + 0.012*"si"'),
 (4,
  '0.067*"bien" + 0.024*"si" + 0.021*"día" + 0.016*"producto" + 0.016*"juego" + 0.016*"regalo" + 0.013*"nunca" + 0.012*"quedan" + 0.012*"después" + 0.010*"ser"'),
 (5,
  '0.038*"bonitos" + 0.023*"pendientes" + 0.023*"grandes" + 0.021*"plata" + 0.018*"solo" + 0.018*"bien" + 0.016*"tal" + 0.016*

In [24]:
def get_doc_top_n(text_processed_lemma, n):
    d = dictionary.doc2bow(text_processed_lemma)
    topics = model.get_document_topics(d)
    try:
        return topics[n][1]
    except:
        return None

In [25]:
for t in range(0,7):
    top_name = f"topic_{t}"
    data[top_name] = data['pp'].apply(lambda doc: get_doc_top_n(doc, t))

In [26]:
def get_doc_top_n(text_processed, n):
    d = dictionary.doc2bow(text_processed)
    topics = dict(model.get_document_topics(d))
    try:
        return topic[n]
    except:
        return None

In [27]:
data

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category,pp,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
0,es_0825565,product_es_0370490,reviewer_es_0174781,3,"Buen. Buena calidad, y buena presentación.",Contenta,es,jewelry,"[buen, buena, calidad, buena, presentación]",0.028049,0.860583,0.027880,0.027846,0.027778,0.027863,
1,es_0227934,product_es_0354224,reviewer_es_0411613,3,"Un producto a perfecto, para salir de casa con...",Versatilidad,es,video_games,"[producto, perfecto, salir, casa, nintendo, sw...",0.479464,0.408639,0.027850,0.027961,0.027999,0.028054,
2,es_0468601,product_es_0665460,reviewer_es_0348315,1,No funciona con Nintendo Switch. No hay forma ...,Decepción absoluta,es,video_games,"[funciona, nintendo, switch, forma, emparejarl...",0.023832,0.723744,0.023883,0.024054,0.024001,0.180467,
3,es_0814494,product_es_0692692,reviewer_es_0951508,5,"Recomendado, los utilizo para pc y no me dan n...",Auriculares Pecham ps4,es,video_games,"[recomendado, utilizo, pc, dan, ningún, proble...",0.020834,0.020949,0.020907,0.601088,0.315257,0.021068,
4,es_0206329,product_es_0728826,reviewer_es_0493255,4,El cable funciona bien podria ser un poco mas ...,Perfecto,es,video_games,"[cable, funciona, bien, podria, ser, mas, larg...",0.014195,0.014007,0.013976,0.013970,0.929849,0.013994,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,es_0427672,product_es_0899883,reviewer_es_0474735,2,En la foto parece que la cadena sea más gruesa...,Cadena muy fina,es,jewelry,"[foto, parece, cadena, gruesa, fina, precio, p...",0.018940,0.018761,0.018800,0.906249,0.018642,0.018607,
996,es_0059540,product_es_0702564,reviewer_es_0754753,3,"Bien por su precio, la cadena demasiado fina.",E,es,jewelry,"[bien, precio, cadena, demasiado, fina]",0.027934,0.028122,0.493163,0.394512,0.028016,0.028056,
997,es_0303349,product_es_0678671,reviewer_es_0172313,5,Muy bien!! Ahora tengo organizados los pendien...,Bonito,es,jewelry,"[bien, ahora, organizados, pendientes]",0.041912,0.041877,0.041923,0.041978,0.042729,0.789636,
998,es_0659961,product_es_0294943,reviewer_es_0554554,4,"Las tapas para los joystics no están nada mal,...",No está mal pero sólo he probado las fundas.,es,video_games,"[tapas, joystics, mal, ayudan, montón, joystic...",0.012850,0.012966,0.012878,0.012919,0.935504,0.012873,


In [11]:
data.pp

0            [buen, buena, calidad, buena, presentación]
1      [producto, perfecto, salir, casa, nintendo, sw...
2      [funciona, nintendo, switch, forma, emparejarl...
3      [recomendado, utilizo, pc, dan, ningún, proble...
4      [cable, funciona, bien, podria, ser, mas, larg...
                             ...                        
995    [foto, parece, cadena, gruesa, fina, precio, p...
996              [bien, precio, cadena, demasiado, fina]
997               [bien, ahora, organizados, pendientes]
998    [tapas, joystics, mal, ayudan, montón, joystic...
999    [primer, impulso, ponerlo, venta, probarlo, hi...
Name: pp, Length: 1000, dtype: object

In [12]:
model.get_topics()

array([[1.59440664e-04, 1.61700416e-04, 1.59070740e-04, ...,
        4.92493995e-03, 2.06658826e-03, 1.61756980e-04],
       [2.56243087e-02, 3.39241028e-02, 5.49567938e-02, ...,
        9.49938694e-05, 9.50454123e-05, 6.63611921e-04],
       [5.09111676e-03, 1.52298962e-04, 1.03981812e-02, ...,
        1.29483786e-04, 2.43460247e-03, 3.20318947e-03],
       [5.32492669e-03, 1.10978144e-04, 1.16083422e-04, ...,
        1.08567787e-04, 1.09326102e-04, 1.09529785e-04],
       [1.41403929e-04, 1.41708908e-04, 2.34563015e-02, ...,
        1.41381766e-04, 1.45675862e-04, 1.41606011e-04],
       [3.91358184e-03, 7.79179297e-03, 1.27858566e-02, ...,
        1.32865534e-04, 1.72285561e-03, 1.32866146e-04]], dtype=float32)

In [32]:
d = dictionary.doc2bow(["juego", "perfecto", "jugar", "rápido"])
topics = model.get_document_topics(d, per_word_topics=True)
topics

([(0, 0.3236995),
  (1, 0.033649843),
  (2, 0.033614073),
  (3, 0.3578529),
  (4, 0.2175755),
  (5, 0.03360821)],
 [(6, [0, 3]), (11, [4, 3]), (56, [3, 4]), (121, [0, 3, 4])],
 [(6, [(0, 0.79229325), (3, 0.20720637)]),
  (11, [(3, 0.32381338), (4, 0.67282236)]),
  (56, [(3, 0.89078563), (4, 0.10911727)]),
  (121, [(0, 0.65971786), (3, 0.19803786), (4, 0.14145988)])])

In [33]:
d = dictionary.doc2bow(["arete", "calidad", "pulsera"])
topics = model.get_document_topics(d, per_word_topics=True)
topics

([(0, 0.7190304),
  (1, 0.057244446),
  (2, 0.056747116),
  (3, 0.055791214),
  (4, 0.055555988),
  (5, 0.05563088)],
 [(2, [0]), (128, [0])],
 [(2, [(0, 0.9905914)]), (128, [(0, 0.999933)])])

In [15]:
dictionary.doc2bow(["arete", "calidad", "pulsera"])

[(2, 1), (128, 1)]

In [34]:
dictionary.token2id['juego']

56