In [15]:
# Import libraries
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from time import time

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Defining parameters
n_features = 10000
n_components = 15
n_top_word = 7

In [17]:
# Function to print topics
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [18]:
# Loading Dataset
print("Loading dataset...")
dataset = pd.read_csv('comments-ita-positive.csv', engine='python')
data_samples = dataset.iloc[1:,0]
n_samples = len(data_samples)
dataset.head()

Loading dataset...


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Author,Published,Comment,Likes,Dislikes,Article,Keyword,Year,Month,Day,Sentiment
0,2,2,'Rodrigo Braga,2016-05-06T13:43:52,kkkkkk posso ir levo super nintendo prometo,20,1,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Positive
1,3,3,'Tucanofdp,2016-05-06T14:12:40,voc ser bem vindo rodrigo n s sobreviventes rd...,15,0,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Positive
2,5,5,'Marcele Veiga,2016-05-06T11:18:17,engano m ximo hierarquia celeste ser humano po...,2,33,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Positive
3,11,11,'Comentador,2016-05-06T10:22:59,pois ficou parecendo mistura v rios personagen...,12,0,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Positive
4,24,24,'LisÃƒÆ’Ã‚Â©rgico D,2011-11-04T10:47:28,previsto nasa maior atividade solar ltimos tempos,0,0,Nasa registra mancha solar gigante,NASA;;;;;;;;;;;;;;;;;',2011,11,4,Positive


In [21]:
# Use TF-IDF features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tfidf = tfidf_vectorizer.fit_transform(dataset['Comment'])

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [22]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tfidf = tfidf_vectorizer.fit_transform(dataset['Comment'])

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [23]:
# Use tf features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tf = tf_vectorizer.fit_transform(dataset['Comment'])

print("tf features for LDA extraction is completed!")

Extracting tf features for LDA...
tf features for LDA extraction is completed!


In [24]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=10183 and n_features=10000...
done in 2.913s.

Topics in NMF model (Frobenius norm):
Topic #0: brasil aqui precisa mundo todos governo pt
Topic #1: ser deve pode humano deveria vida pois
Topic #2: pra fazer aqui ir ver dinheiro dar
Topic #3: lenga ticos pol poder ser vai nada
Topic #4: deus voc existe todos vida aqui universo
Topic #5: es milh bilh vida anos lares condi
Topic #6: vai pagar dinheiro agora ter dar nada
Topic #7: est todos tudo at voc onde agora
Topic #8: rio sal coment voc rios blico dinheiro
Topic #9: bem tudo ainda ter pois assim melhor
Topic #10: povo pol ticos brasileiro tica tico dinheiro
Topic #11: avi fab usar direito uso pagar ca
Topic #12: ncia ci vel tecnologia vida ria ainda
Topic #13: nao nada vc pais sao ja ai
Topic #14: pa ses mundo ter outros melhor vel





In [25]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=10183 and n_features=10000...
done in 8.998s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: es pessoas brasil pr ser aqui assim
Topic #1: ser deve pode humano deveria poderia devem
Topic #2: pra fazer bom dar aqui ver cara
Topic #3: pol brasil ticos pa poder tica tico
Topic #4: deus voc ns parab existe todos tudo
Topic #5: es vida terra anos planeta universo vel
Topic #6: vai pagar dinheiro agora ter valor sair
Topic #7: est onde rio bom ser rios agora
Topic #8: rio fab lia direito ter ir rios
Topic #9: bem ainda ter feito tudo espa melhor
Topic #10: povo vamos todos renan brasileiro cara tudo
Topic #11: avi fab brasil ca super piloto militar
Topic #12: ncia ci ainda tecnologia presidente favor senado
Topic #13: nao vc nada rio pais ja quer
Topic #14: pa mundo melhor nada israel guerra vel



In [26]:
# Fit the LDA model
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

Fitting LDA models with tf features, n_samples=10183 and n_features=10000...
done in 18.462s.

Topics in LDA model:
Topic #0: reforma tico regi trump es rela forte
Topic #1: pa ses paga impostos rio ncia tica
Topic #2: avi fab ca federal aeronave es governo
Topic #3: super ministro rep calheiros tria cio lula
Topic #4: ser rio deus est dinheiro pol es
Topic #5: ser pra brasil vai povo es ter
Topic #6: serve intelig pilantra certos movimento senhora mau
Topic #7: israel bom direito nome henrique imprensa presidente
Topic #8: sat helic investigar pau la ptero lite
Topic #9: viagem ajudar publico deputados bem fab junto
Topic #10: novidade comercial venezuelanos calma gosto kkkk sra
Topic #11: passagens aeronaves bolso pago piloto oficiais brincadeira
Topic #12: ser renan pt voc fab ns parab
Topic #13: sabia festa stico gostei salario fant simplesmente
Topic #14: lia bras queremos deputado favor hospital prefeito



In [27]:
# Fit the LSA model
print("Fiting LSA model")

lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)

lsa.fit(tf)

print("\nTopics in LSA model:")

print_top_words(lsa, tf_feature_names, n_top_word)

Fiting LSA model

Topics in LSA model:
Topic #0: ser es est vai pra brasil rio
Topic #1: ser deve pode humano deveria lenga ateu
Topic #2: pra vai nao fazer vc nada dinheiro
Topic #3: lenga pol vai ticos brasil poder povo
Topic #4: deus est ncia lenga bem voc rio
Topic #5: lenga es pra deus ncia vida avi
Topic #6: pra brasil lenga pa pol ser ticos
Topic #7: deus brasil vai nao es ncia vida
Topic #8: rio fab bem avi ncia pol ter
Topic #9: deus povo pol ticos rio voc brasileiro
Topic #10: povo bem deus nao pol avi brasileiro
Topic #11: avi fab brasil deus lenga est ca
Topic #12: ncia povo ci avi brasil ainda pol
Topic #13: nao rio povo lenga nada vc ncia
Topic #14: rio bem brasil deus pra povo dinheiro

