In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from time import time
import re

n_features = 10000
n_components = 10
n_top_word = 7

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

print("Loading dataset...")

dataset = pd.read_csv('https://raw.githubusercontent.com/Matheusadler/Sentiment-Analysis-Application/master/CSV%20Cleaner/g1_clean%20CLA.csv', sep=',', encoding='UTF-8')

data_samples = dataset.iloc[1:,0]

n_samples = len(data_samples)

#Loading StopWords
from io import open
with open("preprocessamento\essenciais\stopwords_pt_nltk.txt","r",encoding='utf-8') as a:
    StopWords = a.readlines()
    StopWords = [w.replace('\n', '') for w in StopWords]


print("Dataset loaded!")

clean_comments = []
for w in range(len(dataset.Comment)):
  comment = dataset['Comment'].iloc[w]

  # remove special characters and digits
  comment  = re.sub("(\\d|\\W)+|\w*\d\w*"," ",comment )
  comment = ' '.join(s for s in comment.split() if (not any(c.isdigit() for c in s)) and len(s) > 2)
  clean_comments.append(comment)

clean_comments[0:5]

print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=StopWords)
tfidf = tfidf_vectorizer.fit_transform(clean_comments)

print("tf-idf features extracted!")

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=StopWords)
tf = tf_vectorizer.fit_transform(clean_comments)

print("tf features for LDA extraction is completed!")

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

print("Fiting LSA model")

lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)

lsa.fit(tf)

print("\nTopics in LSA model:")

print_top_words(lsa, tf_feature_names, n_top_word)

  and should_run_async(code)
  with open("preprocessamento\essenciais\stopwords_pt_nltk.txt","r",encoding='utf-8') as a:
  comment  = re.sub("(\\d|\\W)+|\w*\d\w*"," ",comment )


Loading dataset...
Dataset loaded!
Extracting tf-idf features for NMF...
tf-idf features extracted!
Extracting tf features for LDA...
tf features for LDA extraction is completed!
Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=186 and n_features=10000...
done in 0.052s.

Topics in NMF model (Frobenius norm):
Topic #0: vai corolla povo carro coisa ainda gente
Topic #1: bmw cla motor mercedes preço opinião demais
Topic #2: mundo classe total fiasco estrelinha resultado mercedes
Topic #3: tração audi dianteira melhor ainda traseira gente
Topic #4: anda etc acho fusion boa azera conjunto
Topic #5: cla chega versão meio mercedes alto vai
Topic #6: anos ocorrida tragédia após cla lançado lançamento
Topic #7: mil preço comprar melhor deve caro mercedes
Topic #8: lucro querem imposto porte ganhar custo alem
Topic #9: carro mercedes desse igual carros custa todo

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=186 and n_feat

  'stop_words.' % sorted(inconsistent))


done in 0.190s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: vai carro custa povo pior governo coisa
Topic #1: bmw caro preço cla motor mercedes todos
Topic #2: mundo todo classe mercedes ver menos fiasco
Topic #3: audi tração ainda dianteira gente muita melhor
Topic #4: anda conjunto frente opinião mecânica ponto carros
Topic #5: verdade cla falem difícil chega boa meio
Topic #6: cla anos lançado nunca agora ocorrida lançamento
Topic #7: mil comprar preço melhor preços vou uns
Topic #8: civic querem lucro ganhar compra custo comprar
Topic #9: carro mercedes desse carros etc pobre igual

Fitting LDA models with tf features, n_samples=186 and n_features=10000...
done in 0.258s.

Topics in LDA model:
Topic #0: comprar dinheiro cadenza azera motor equipado ambos
Topic #1: povo civic governo coisa difícil ver gustavo
Topic #2: carro igual mercedes desse mim benz pro
Topic #3: carro mil tração comprar deve preços devia
Topic #4: corolla lucro carro vai grande co

In [7]:
pyLDAvis.sklearn.prepare(lda, tfidf, tf_vectorizer, sort_topics=False, mds = 'tsne')

  and should_run_async(code)
