In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import gensim

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import strip_accents_unicode

import re

import nltk
from nltk.tokenize import RegexpTokenizer

from gsdmm.gsdmm import MovieGroupProcess

In [11]:
def holdout_text(series,train_pct = 0.9):
    '''Input : Series de dados
       Output : train,test a partir da serie de dados completamente aleatoria'''

    if(type(series) == list):
        series = pd.Series(series)
        
    num_linhas = len(series)
    series = series.sample(num_linhas)#.reset_index(drop = True)

    train_pct = train_pct

    train = series.iloc[:round(num_linhas*train_pct)]
    test = series.iloc[round(num_linhas*train_pct):]
    
    return train,test

#### Carregando Dataset Americanas

In [12]:
df = pd.read_csv("B2W-Reviews01.csv",sep = ';')

In [13]:
#dados de texto
text_data = df['review_text']
#text_title = df['review_title']

#stopwords
spwrds = nltk.corpus.stopwords.words('portuguese')
spwrds =[strip_accents_unicode(sw) for sw in spwrds]

def fast_preprocessing(text_data,spwrds):
    pp_text = [strip_accents_unicode(phrases) for phrases in text_data]
    pp_text = [phrases.lower() for phrases in pp_text]

    #regex_numbers 
    pp_text = [re.sub(r'\d+','<NUM>', phrases) for phrases in pp_text]

    #strip para ter certeza de espacos
    #pp_text = [phrases.strip() for phrases in pp_text]

    #tokenizacao
    rgxtoken = RegexpTokenizer('\w+')
    pp_text = [rgxtoken.tokenize(phrases) for phrases in pp_text]

    sentence = list()
    corpus = list()

    for phrases in pp_text:
        for word in phrases:
            if(word not in spwrds):
                sentence.append(word)

        corpus.append(sentence)
        sentence = list()

    pp_text = corpus
    return pp_text



In [14]:
train,test =  holdout_text(text_data,train_pct=0.90)

In [15]:
train = fast_preprocessing(train,spwrds)
test = fast_preprocessing(test,spwrds)

In [96]:
from gensim import corpora
from gensim import models
from gensim.models import CoherenceModel

class LDA_Modelling:
    def __init__(self, param_lda):
        
        self.param_lda = param_lda
        
        
    def train(self,text_data):
        '''Funcao que tem o objetivo de retornar o modelo lda treinado dado os parâmetros
            Input : text_data : text na forma de sentencas tokenizadas
                    param_lda : dicionário de parâmetros do LDA'''
        
        #Instancia dicitonario e cria representacao bag of words
        self.dictionary = corpora.Dictionary(text_data)
        corpus_train = [self.dictionary.doc2bow(text) for text in text_data]

        # Constroi o modelo
        self.lda_model = models.LdaModel(corpus=corpus_train ,id2word=self.dictionary,iterations=200,
                                         **self.param_lda,minimum_probability=0)

    def show_topics(self):
        print("LDA Model:")

        for idx in range(self.param_lda['num_topics']):
            # Printa os 10 tópicos mais representativos
            print("Topic #%s:" % idx, self.lda_model.print_topic(idx, 10))

            
    def transform(self,some_text):
        if(type(some_text) == str):
            bow = self.dictionary.doc2bow(some_text)
        
        else:
            bow = [self.dictionary.doc2bow(sentences) for sentences in some_text]
        
        return self.lda_model[bow]

    
    def calculate_coeherence(self,text_data):
        coherence_model_lda = CoherenceModel(model=self.lda_model,
                                             texts=text_data, dictionary=self.dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

In [97]:
lda = LDA_Modelling(param_lda = {'num_topics' : 25})

In [98]:
lda.train(train[:1000])

In [99]:
lda.calculate_coeherence(train[:1000])

  if not i.flags.writeable or i.dtype not in (np.int32, np.int64):
  if not j.flags.writeable or j.dtype not in (np.int32, np.int64):



Coherence Score:  0.27957020466414745


In [82]:
transformed = lda.transform(test)

In [None]:
coherence_model_lda = CoherenceModel(model=self.lda_model,
                                             texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

In [83]:
topic_scores = list()
for topic in transformed:
    topic_scores.append(list(dict(topic).values()))

In [84]:
pd.DataFrame(topic_scores)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.006667,0.457642,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,...,0.006667,0.006667,0.006667,0.006667,0.389023,0.006667,0.006667,0.006667,0.006667,0.006667
1,0.004445,0.146817,0.004445,0.004445,0.004445,0.004445,0.004445,0.004445,0.004445,0.004445,...,0.004445,0.004445,0.121237,0.004445,0.004445,0.213084,0.004445,0.128878,0.004445,0.004445
2,0.005001,0.005001,0.131043,0.005001,0.290070,0.005001,0.005001,0.468864,0.005001,0.005001,...,0.005001,0.005001,0.005001,0.005001,0.005001,0.005001,0.005001,0.005001,0.005001,0.005001
3,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,...,0.006667,0.006667,0.006667,0.006667,0.006667,0.182570,0.006667,0.497431,0.006667,0.006667
4,0.172804,0.006672,0.006672,0.006672,0.006672,0.006672,0.006672,0.006672,0.006672,0.006672,...,0.006672,0.006672,0.006672,0.006672,0.006672,0.006672,0.006672,0.465823,0.006672,0.006672
5,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.840000,0.006667,...,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667
6,0.198119,0.002107,0.002107,0.002107,0.338985,0.076934,0.002107,0.002107,0.071367,0.078300,...,0.002107,0.055105,0.036975,0.002107,0.055673,0.002107,0.002107,0.002107,0.002107,0.054826
7,0.005718,0.571260,0.005718,0.005718,0.005718,0.005718,0.005718,0.005718,0.005718,0.005718,...,0.005718,0.005718,0.005718,0.005718,0.005718,0.005718,0.005718,0.005718,0.005718,0.005718
8,0.004445,0.004445,0.004445,0.004445,0.249180,0.004445,0.004445,0.004445,0.004445,0.004445,...,0.004445,0.004445,0.004445,0.004445,0.004445,0.004445,0.648597,0.004445,0.004445,0.004445
9,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667,0.311481,0.006667,0.356605,...,0.006667,0.006667,0.006667,0.185241,0.006667,0.006667,0.006667,0.006667,0.006667,0.006667


In [62]:
lda.show_topics()

LDA Model:
Topic #0: 0.049*"produto" + 0.042*"qualidade" + 0.036*"foto" + 0.034*"lindo" + 0.032*"cor" + 0.026*"leve" + 0.026*"anuncio" + 0.023*"acabamento" + 0.021*"bem" + 0.021*"ficou"
Topic #1: 0.103*"produto" + 0.095*"expectativas" + 0.066*"atendeu" + 0.054*"atende" + 0.048*"bom" + 0.044*"gostei" + 0.039*"todas" + 0.038*"necessidades" + 0.031*"superou" + 0.029*"esperava"
Topic #2: 0.140*"facil" + 0.056*"bastante" + 0.053*"pratico" + 0.030*"bem" + 0.027*"usei" + 0.025*"obrigado" + 0.023*"gostei" + 0.022*"montar" + 0.019*"manuseio" + 0.014*"limpar"
Topic #3: 0.048*"valor" + 0.042*"presente" + 0.039*"comprei" + 0.036*"tanto" + 0.036*"frete" + 0.033*"filho" + 0.027*"adorou" + 0.023*"r" + 0.021*"p" + 0.020*"unica"
Topic #4: 0.153*"americanas" + 0.070*"sempre" + 0.060*"parabens" + 0.056*"lojas" + 0.052*"entrega" + 0.047*"produtos" + 0.042*"loja" + 0.022*"compro" + 0.021*"comprar" + 0.015*"expectativa"
Topic #5: 0.042*"uso" + 0.037*"marca" + 0.035*"comprei" + 0.024*"pra" + 0.022*"celular" 

In [73]:
%%time
from gensim import models, corpora
NUM_TOPICS = 50
dictionary = corpora.Dictionary(train)

corpus_train = [dictionary.doc2bow(text) for text in train]
 
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus_train, num_topics=NUM_TOPICS, id2word=dictionary,iterations=200)
 
# Build the LSI model
#lsi_model = models.LsiModel(corpus=corpus_train, num_topics=NUM_TOPICS, id2word=dictionary)

CPU times: user 1min 1s, sys: 1.12 s, total: 1min 3s
Wall time: 1min 1s


In [99]:
bow = dictionary.doc2bow(test.reset_index(drop = True)[0])
 
#print(lsi_model[bow])
 
print(lda_model[bow]) 

[(0, 0.13416106), (22, 0.16999689), (29, 0.16933051), (45, 0.37317678)]


In [64]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
print("=" * 20)
 
# print("LSI Model:")
 
# for idx in range(NUM_TOPICS):
#     # Print the first 10 most representative topics
#     print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
# print("=" * 20)

LDA Model:
Topic #0: 0.056*"ninguem" + 0.053*"barato" + 0.029*"pior" + 0.028*"atraso" + 0.026*"porcaria" + 0.024*"passou" + 0.024*"absurdo" + 0.024*"faco" + 0.024*"diferenca" + 0.023*"telefone"
Topic #1: 0.076*"foto" + 0.071*"cor" + 0.067*"outro" + 0.054*"modelo" + 0.052*"diferente" + 0.039*"veio" + 0.037*"comprei" + 0.037*"igual" + 0.036*"produto" + 0.034*"totalmente"
Topic #2: 0.084*"jogo" + 0.074*"valor" + 0.058*"pequeno" + 0.057*"preco" + 0.039*"servico" + 0.039*"frete" + 0.038*"esquenta" + 0.035*"caro" + 0.031*"principalmente" + 0.029*"jogos"
Topic #3: 0.104*"pena" + 0.099*"vale" + 0.067*"todas" + 0.066*"imagem" + 0.053*"som" + 0.041*"linda" + 0.039*"top" + 0.035*"filha" + 0.032*"valeu" + 0.032*"perfeita"
Topic #4: 0.110*"cumpre" + 0.098*"realmente" + 0.085*"promete" + 0.042*"funcao" + 0.040*"produto" + 0.032*"processador" + 0.030*"moderno" + 0.028*"bem" + 0.028*"funcoes" + 0.027*"consegue"
Topic #5: 0.184*"prazo" + 0.128*"antes" + 0.111*"chegou" + 0.108*"produto" + 0.071*"entrega

In [65]:
%%time
tfidf = TfidfVectorizer(stop_words = spwrds,min_df = 10)
tfidf.fit(train)

#train = tfidf.transform(train)
#test = tfidf.transform(test)

AttributeError: 'list' object has no attribute 'lower'

In [24]:
mgp = MovieGroupProcess(K=100, alpha=0.1, beta=0.5, n_iters=20)

mgp.fit([[elements] for elements in train][:1000],
        len(tfidf.vocabulary_.keys()))

AttributeError: 'TfidfVectorizer' object has no attribute 'vocabulary_'

In [None]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)
# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)
# Show the top 5 words in term frequency for each cluster 
top_words(mgp.cluster_word_distribution, top_index, 5)

In [266]:
mgp.cluster_word_distribution

[{'comprei ele, gastei com chip, recarga e nao funcionou. fica com a luz acesa e nao reconhece o chip. ligo e so da na caixa postal. nao funciona. decepcionado.': 1,
  'produto e bom, mas nao se engane achando que seja um ar condicionado': 1,
  'pela imagem ilustrativa achei que viria junto com o painel na embalagem, o suporte pra tv ate <NUM> polegadas! na minha opiniao em geral seria o correto esse item !': 1,
  'o produto nao reconheceu o chip. quanto as funcoes alguns funciona normalmente e outros nao.': 1,
  'embora tenha utilizado muito pouco, mas me parece ser um otimo produto. a entrega foi rapida. chegou tudo certinho. recomendo.': 1,
  'eu nao recebi meu produto.como voces querem que eu avalie minha compras.': 1,
  'excelente! eu recomendo!obrigado!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!': 1,
  'nao foi entregue minha mercadoria ,faz mais de trinta dias que estou esperando': 1,
  'a imagem para os canais e boa, mas para jogos e filmes on line e muito 

In [74]:
doc_count = np.array(mgp.cluster_doc_count)

In [75]:
# Show the top 5 words in term frequency for each cluster 
top_words(mgp.cluster_word_distribution, top_index, 5)

NameError: name 'top_words' is not defined

### Gibbs Sampling Dirichlet Mixture Model (GSDMM)

## NLP for Hackers

In [4]:
nltk.download('brown')

[nltk_data] Downloading package brown to /home/saraiva/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [5]:
from nltk.corpus import brown
 
data = []
 
for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)
 
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

500


In [6]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')
 
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text
 
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in data:
    tokenized_data.append(clean_text(text))

In [7]:
# Build a Dictionary - association word to numeric id

 
# Transform the collection of texts to a numerical form


[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [8]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
print("=" * 20)
 
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
print("=" * 20)

LDA Model:
Topic #0: 0.005*"one" + 0.005*"would" + 0.004*"new" + 0.003*"could" + 0.003*"may" + 0.003*"said" + 0.003*"like" + 0.003*"first" + 0.002*"time" + 0.002*"man"
Topic #1: 0.006*"one" + 0.004*"could" + 0.003*"would" + 0.003*"said" + 0.003*"new" + 0.003*"like" + 0.002*"man" + 0.002*"also" + 0.002*"time" + 0.002*"first"
Topic #2: 0.004*"would" + 0.004*"one" + 0.003*"said" + 0.003*"time" + 0.003*"could" + 0.003*"man" + 0.003*"made" + 0.003*"may" + 0.002*"like" + 0.002*"new"
Topic #3: 0.008*"one" + 0.005*"would" + 0.003*"two" + 0.003*"new" + 0.003*"time" + 0.003*"said" + 0.002*"could" + 0.002*"made" + 0.002*"may" + 0.002*"man"
Topic #4: 0.005*"would" + 0.004*"one" + 0.003*"man" + 0.003*"could" + 0.003*"said" + 0.003*"two" + 0.002*"time" + 0.002*"first" + 0.002*"new" + 0.002*"may"
Topic #5: 0.006*"one" + 0.005*"would" + 0.003*"two" + 0.003*"time" + 0.003*"new" + 0.003*"could" + 0.003*"may" + 0.003*"said" + 0.003*"first" + 0.002*"even"
Topic #6: 0.007*"would" + 0.006*"one" + 0.004*"sai

In [9]:
text = "The economy is working better than ever"
bow = dictionary.doc2bow(clean_text(text))
 
print(lsi_model[bow])
 
print(lda_model[bow]) 

[(0, 0.0916150728946004), (1, -0.00882024847199351), (2, 0.015605852717606932), (3, -0.041374661887419825), (4, -0.016859229611194546), (5, -0.013309474618264558), (6, 0.029697399656179686), (7, -0.019693876663212286), (8, -0.05831659293107751), (9, -0.02626133750787087)]
[(0, 0.020014111), (1, 0.020013556), (2, 0.020012224), (3, 0.020012833), (4, 0.020013317), (5, 0.020015026), (6, 0.020013805), (7, 0.020013424), (8, 0.81987816), (9, 0.020013575)]


In [10]:
from gensim import similarities
 
lda_index = similarities.MatrixSimilarity(lda_model[corpus])
 
# Let's perform some queries
similarities = lda_index[lda_model[bow]]
# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
 
# Top most similar documents:
print(similarities[:10])
# [(104, 0.87591344), (178, 0.86124849), (31, 0.8604598), (77, 0.84932965), (85, 0.84843522), (135, 0.84421808), (215, 0.84184396), (353, 0.84038532), (254, 0.83498049), (13, 0.82832891)]
 
# Let's see what's the most similar document
document_id, similarity = similarities[0]
print(data[document_id][:1000])

[(49, 0.9983145), (29, 0.9979101), (162, 0.997615), (391, 0.9975316), (346, 0.99749357), (231, 0.99743426), (460, 0.99743426), (38, 0.99732935), (113, 0.99732935), (114, 0.99732935)]
The study of the St. Louis area's economic prospects prepared for the Construction Industry Joint Conference confirms and reinforces both the findings of the Metropolitan St. Louis Survey of 1957 and the easily observed picture of the Missouri-Illinois countryside . St. Louis sits in the center of a relatively slow-growing and in some places stagnant mid-continent region . Slackened regional demand for St. Louis goods and services reflects the region's relative lack of purchasing power . Not all St. Louis industries , of course , have a market area confined to the immediate neighborhood . But for those which do , the slow growth of the area has a retarding effect on the metropolitan core . The city has a stake in stimulating growth and purchasing power throughout outstate Missouri and Southern Illinois . G

In [11]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
 
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())
 


[0.02500004 0.02500009 0.77496819 0.02500002 0.02500901 0.02500015
 0.02500156 0.02500411 0.02500587 0.02501094] 1.0
