In [1]:
# Import libraries
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from time import time

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Defining parameters
n_features = 10000
n_components = 15
n_top_word = 7

In [3]:
# Function to print topics
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [4]:
# Loading Dataset
print("Loading dataset...")
dataset = pd.read_csv('comments-ita-neutral.csv', engine='python')
data_samples = dataset.iloc[1:,0]
n_samples = len(data_samples)
dataset.head()

Loading dataset...


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Author,Published,Comment,Likes,Dislikes,Article,Keyword,Year,Month,Day,Sentiment
0,0,0,'Paulo Fernandes,2016-05-06T14:14:23,kg equipamentos armadura kg bateria,17,1,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Neutral
1,1,1,'Mirage Tf,2016-05-06T16:54:39,v fazer igual outro funciona luz solar assim a...,4,0,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Neutral
2,6,6,'Paulo Fernandes,2016-05-06T14:12:10,criou at novo idioma hein portugueis kkkkk,8,0,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Neutral
3,7,7,'Natanlp,2016-05-06T13:02:49,reator ark peito timo marketing,20,1,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Neutral
4,8,8,'Paulo Fernandes,2016-05-06T14:08:59,enquanto s marketing precisa cordinhas pra n c...,1,0,Nasa apresenta robÃƒÆ’Ã‚Â´ humanoide \'astrona...,NASA;;;;;;;;;;;;;;;;;',2016,5,6,Neutral


In [8]:
dataset = dataset.dropna(axis=0)

In [9]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tfidf = tfidf_vectorizer.fit_transform(dataset['Comment'])

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [10]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tfidf = tfidf_vectorizer.fit_transform(dataset['Comment'])

print("tf-idf features extracted!")

Extracting tf-idf features for NMF...
tf-idf features extracted!


In [11]:
# Use tf features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words=stopwords.words('portuguese'))
tf = tf_vectorizer.fit_transform(dataset['Comment'])

print("tf features for LDA extraction is completed!")

Extracting tf features for LDA...
tf features for LDA extraction is completed!


In [12]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=19821 and n_features=10000...
done in 4.198s.

Topics in NMF model (Frobenius norm):
Topic #0: sai ja caro entra agora desse querem
Topic #1: ser pode vergonha brasileiro deve deveria cara
Topic #2: lula papuda dilma pt fhc familia corruptos
Topic #3: hipocresia rodrigo pura concordo vc zuero elon
Topic #4: renan safado calheiros alves cara collor gente
Topic #5: vai dilma cuidar ei sus pagar dinheiro
Topic #6: pra presidente silva fazer dinheiro ir la
Topic #7: es bilh milh ladr elei pr pa
Topic #8: est pa onde porque ainda pt todos
Topic #9: brasil pa aqui vamos pt dinheiro ca
Topic #10: povo brasileiro dinheiro ladr vamos cara ruas
Topic #11: rio ter nada ncia nao tudo deus
Topic #12: avi fab dinheiro pagar piloto rea ca
Topic #13: pol ticos tico tica pa todos corruptos
Topic #14: voc hoje sim direitos pa ir cidad





In [13]:
# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_word)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=19821 and n_features=10000...
done in 14.723s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: ja espa sai la querem cabe sera
Topic #1: ser mundo pode at ter pr anos
Topic #2: lula dilma pt papuda viva fhc corruptos
Topic #3: vc concordo hipocresia pura rodrigo tudo acha
Topic #4: renan safado calheiros gente ladr alves vamos
Topic #5: vai ter ser dilma pode pagar ficar
Topic #6: pra presidente fazer ta ver ir kkkkkk
Topic #7: es avi milh ca bilh ladr elei
Topic #8: est ria porque onde ainda mat pt
Topic #9: brasil pa pt aqui nao pais lan
Topic #10: povo pol ticos dinheiro vamos todos brasileiro
Topic #11: deus rio nada nasa lua vida tudo
Topic #12: avi fab nao dinheiro piloto tamb pagar
Topic #13: ncia vel rios pol pa coment boa
Topic #14: voc cara parab ns sim brasileiro povo



In [14]:
# Fit the LDA model
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_word)

Fitting LDA models with tf features, n_samples=19821 and n_features=10000...
done in 24.462s.

Topics in LDA model:
Topic #0: assim piloto ser est voc vez todos
Topic #1: brasil rio presidente pagar dilma vai quer
Topic #2: lula congresso militar sao bandido cima paz
Topic #3: vel ria rea guerra apenas caso mat
Topic #4: avi ai ver parab ns senado sai
Topic #5: pt povo sempre so kkkk at kkkkk
Topic #6: ter ca ainda vai outra pode kkk
Topic #7: coisa boa concordo safado kkkkkk anos viva
Topic #8: nao vergonha brasileiro tico estado ser fica
Topic #9: es pol pra agora renan ncia ticos
Topic #10: vai rios pais ar come tudo acabar
Topic #11: est nada deus nunca vc governo tica
Topic #12: pra aqui outros sabe pa dinheiro pois
Topic #13: fab pa fez copa norte farra federal
Topic #14: cara vamos faz desse verdade ladr brasil



In [15]:
# Fit the LSA model
print("Fiting LSA model")

lsa = TruncatedSVD(n_components=n_components, n_iter=40, tol=0.01)

lsa.fit(tf)

print("\nTopics in LSA model:")

print_top_words(lsa, tf_feature_names, n_top_word)

Fiting LSA model

Topics in LSA model:
Topic #0: sai pra vai ja renan agora povo
Topic #1: pra es est vai brasil ser avi
Topic #2: lula papuda renan safado dilma fhc familia
Topic #3: hipocresia vc concordo pura rodrigo lula papuda
Topic #4: renan safado calheiros alves collor henrique mara
Topic #5: vai dilma cuidar sus ei ser pagar
Topic #6: pra vai presidente silva dilma cuidar ei
Topic #7: es avi fab bilh milh dilma ladr
Topic #8: ser brasil brasileiro pode vergonha povo nao
Topic #9: ser es avi est voc pode ncia
Topic #10: povo pol ticos brasileiro dinheiro tica tico
Topic #11: ser est pol brasil povo brasileiro presidente
Topic #12: es voc pa dilma cuidar presidente ei
Topic #13: pol ticos voc avi fab ncia dilma
Topic #14: voc povo avi brasileiro fab dilma ei

