In [1]:
# Import libraries
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from time import time

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Defining parameters
n_features = 10000
n_components = 15
n_top_word = 7

In [3]:
# Function to print topics
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [4]:
# Loading Dataset
print("Loading dataset...")
dataset = pd.read_csv('comments-ita-negative.csv', engine='python')
data_samples = dataset.iloc[1:,0]
n_samples = len(data_samples)
dataset.head()

Loading dataset...


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Author,Published,Comment,Likes,Dislikes,Article,Keyword,Year,Month,Day,Sentiment
0,4,4,'Wellington Oliveira,2016-05-06T11:05:33,ets visitam terra nada s pr prios humanos futu...,39,11,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Negative
1,10,10,'Comentador,2016-05-06T12:35:12,j est construindo rob matar todas pessoas fome...,17,1,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Negative
2,13,13,'Gabriel Zanon,10 Set 2013 13:06:37,anailton joeberger verdade cara quer pagar pal...,0,0,Nasa lanÃƒÆ’Ã‚Â§a perfil no Instagram,NASA;;;;;;;;;;;;;;;;;',2013,10,18,Negative
3,27,27,'Sara Pavan,2011-11-04T11:02:23,sim chegando falta dias virada ano nisso acho ...,0,0,Nasa registra mancha solar gigante,NASA;;;;;;;;;;;;;;;;;',2011,11,4,Negative
4,30,30,'Willian F,2011-11-07T15:22:14,pessoas confundem manchas solares sinais fim m...,0,0,Nasa registra mancha solar gigante,NASA;;;;;;;;;;;;;;;;;',2011,11,7,Negative


In [5]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tfidf = tfidf_vectorizer.fit_transform(dataset['Comment'])

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [6]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tfidf = tfidf_vectorizer.fit_transform(dataset['Comment'])

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [7]:
# Use tf features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tf = tf_vectorizer.fit_transform(dataset['Comment'])

print("tf features for LDA extraction is completed!")

Extracting tf features for LDA...
tf features for LDA extraction is completed!


In [8]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=11992 and n_features=10000...
done in 3.430s.

Topics in NMF model (Frobenius norm):
Topic #0: es milh bilh ladr nada manifesta condi
Topic #1: povo brasileiro vamos ruas congresso contra fazer
Topic #2: pra ter fazer ir gente ver tudo
Topic #3: est pois poder sendo governo porque fazendo
Topic #4: brasil pt pais aqui vergonha ca tudo
Topic #5: avi fab piloto ainda usar renan uso
Topic #6: ncia rio ter ser tudo at mundo
Topic #7: pa corrup ses outros onde aqui eua
Topic #8: vai ser agora nada pagar fazer acabar
Topic #9: pol ticos todos tico tica corruptos poder
Topic #10: nao nada so porque ja sao ser
Topic #11: cara vergonha pau ainda ser renan falta
Topic #12: terra deus planeta lua aqui vida plana
Topic #13: dinheiro blico devolver pagar publico crime governo
Topic #14: voc ria aqui porque sabe quer onde





In [9]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=11992 and n_features=10000...
done in 9.309s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: es dia pessoas nada contra todos cada
Topic #1: povo vamos brasileiro ruas congresso todos renan
Topic #2: pra nao vc ter ai ver pode
Topic #3: deus est israel pois ncia espa fim
Topic #4: brasil pt pa pena eua aqui piada
Topic #5: est pra avi lia governo fazendo fab
Topic #6: avi fab vai piloto agora pagar ser
Topic #7: rio vel ter fab ser cio ncia
Topic #8: vai pa ses pr lixo mundo ria
Topic #9: pol ticos tico ncia vel tica todos
Topic #10: es nada nao tudo ladr dinheiro devolver
Topic #11: cara vergonha ncia pau ainda falta deveria
Topic #12: terra planeta lua aqui marte homem ainda
Topic #13: dinheiro blico pr governo enquanto fazer avi
Topic #14: voc tudo ria morreu aqui rios piloto



In [10]:
# Fit the LDA model
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

Fitting LDA models with tf features, n_samples=11992 and n_features=10000...
done in 20.617s.

Topics in LDA model:
Topic #0: fab armadas militar morrer militares servi sal
Topic #1: lia fam mar matar deputados stf senadores
Topic #2: pol ticos rio cara ladr pau ruas
Topic #3: brasil vai pra povo dinheiro pa governo
Topic #4: israel sr mentira drogas soldados palestinos destruir
Topic #5: piloto avi morreu aeronave voo publico errado
Topic #6: nao est ter deus pode coisa contra
Topic #7: terra congresso estar pra hora deve tempo
Topic #8: globo caiu canalha empresa cabelo pilotos burro
Topic #9: erro fim helic ptero pessoal noticia decis
Topic #10: usar derrubar aeronaves novos fazenda perde fronteira
Topic #11: es est ncia vel avi falta pa
Topic #12: vergonha povo cara vamos renan acidente brasil
Topic #13: pena nome devolve porta jogo tava venezuelanos
Topic #14: ns parab uso ia tico concurso fotos



In [11]:
# Fit the LSA model
print("Fiting LSA model")

lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)

lsa.fit(tf)

print("\nTopics in LSA model:")

print_top_words(lsa, tf_feature_names, n_top_word)

Fiting LSA model

Topics in LSA model:
Topic #0: est es povo brasil pra vai pa
Topic #1: povo pol vai dinheiro nao cara brasil
Topic #2: pra vai nao terra ter deus ser
Topic #3: es avi fab brasil pra pol milh
Topic #4: brasil pa nao aqui mundo rio ses
Topic #5: est pra avi brasil fab pol ticos
Topic #6: vai avi fab cara dinheiro est vergonha
Topic #7: pa rio ncia avi pol fab dinheiro
Topic #8: vai pa pra es pol voc corrup
Topic #9: pa nao est povo nada es contra
Topic #10: dinheiro nao cara es vergonha pol tudo
Topic #11: nao pol ticos dinheiro todos fab avi
Topic #12: ncia rio dinheiro ci contra voc brasil
Topic #13: dinheiro terra avi brasil blico povo pa
Topic #14: voc dinheiro tudo ter porque ria aqui

