<a href="https://colab.research.google.com/github/HedersonSantos/Noticias/blob/main/clusterizacaoNoticias_w2v.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

import nltk, re
from nltk import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import AgglomerativeClustering

# Obtendo dataset de notícias tratadas em preProcessamento_noticias

In [2]:
!rm news.*
!wget https://raw.githubusercontent.com/HedersonSantos/Noticias/main/news.zip
!unzip news.zip
%ls

rm: cannot remove 'news.*': No such file or directory
--2021-06-20 18:34:30--  https://raw.githubusercontent.com/HedersonSantos/Noticias/main/news.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11333903 (11M) [application/zip]
Saving to: ‘news.zip’


2021-06-20 18:34:31 (18.4 MB/s) - ‘news.zip’ saved [11333903/11333903]

Archive:  news.zip
  inflating: news.csv                
news.csv  news.zip  [0m[01;34msample_data[0m/


# Funções para processamento de Linguagem Natural

In [39]:
def download_pt_stopWords():
  '''Download the NLTK resources used by the text helpers of this notebook.'''
  recursos = (
    'stopwords',  # stop-word lists
    'rslp',       # stemming
    'punkt',      # tokenization
  )
  for recurso in recursos:
    nltk.download(recurso)
        
def removeStopWords( texto, excluirWords:list=None):
  '''Remove Portuguese stop words from a text.

  The text is split on whitespace and a token is dropped when its lowercase
  form is in NLTK's Portuguese stop-word list or in ``excluirWords``.
  Surviving tokens keep their original casing and are joined with single
  spaces.

  Args:
    texto: text to filter.
    excluirWords: optional extra words to drop. They are compared against the
      lowercased token, so entries should be lowercase.

  Returns:
    The filtered text as a single string.
  '''
  # A set gives constant-time membership tests; the former list was scanned
  # from the start for every token of the text.
  naoQueridas = set(nltk.corpus.stopwords.words('portuguese'))
  if excluirWords is not None:
    naoQueridas.update(excluirWords)
  return " ".join(p for p in texto.split() if p.lower() not in naoQueridas)

def aplicaStemming( texto):
  '''Replace every whitespace-separated word of ``texto`` by its stem.

  Stems come from NLTK's RSLPStemmer and are joined with single spaces.
  '''
  radicalizador = nltk.stem.RSLPStemmer()
  radicais = [radicalizador.stem(termo) for termo in texto.split()]
  return " ".join(radicais)

def transformaTextoMinuscula(texto):
  '''Lowercase ``texto`` piece by piece.

  The text is split on single spaces (not on arbitrary whitespace), so the
  original spacing is preserved when the pieces are joined back.
  '''
  return " ".join(pedaco.lower() for pedaco in texto.split(" "))


def removeCaracteresNaoDesejados(texto, pontuacao=False):
  '''Strip URLs and every character outside the accepted alphabet.

  Steps, in order:
    1. remove anything starting with ``http`` up to the next whitespace;
    2. remove anything starting with ``www.`` up to the next whitespace;
    3. keep only ASCII letters, the accented Portuguese letters listed in the
       pattern, the punctuation ``! , : . ;`` and spaces (digits, newlines
       and other symbols are dropped);
    4. when ``pontuacao`` is true, additionally remove the remaining
       punctuation.

  Args:
    texto: text to clean.
    pontuacao: pass True to also strip punctuation (default keeps it).

  Returns:
    The cleaned text.
  '''
  textoLimpo = re.sub(r"http\S+", "", texto)
  # Stop at the first whitespace: the previous greedy "www\..+\..+" pattern
  # erased everything from "www." to the end of the line.
  textoLimpo = re.sub(r"www\.\S+", "", textoLimpo)
  # õ/Õ were missing from the whitelist, which turned e.g. "ações" into "açes".
  textoLimpo = re.sub(r"[^a-zA-ZáÁéÉíÍóÓúÚãÃõÕàÀâÂêÊôÔçÇ!,:.; ]", "", textoLimpo)
  if pontuacao:
    textoLimpo = re.sub(r'[^\w\s]', '',textoLimpo)
  return textoLimpo

def retornaVetorizacao(X,pct_min=1, pct_max=1, excluirSW:list=None):
  '''Build the bag-of-words count matrix for a collection of texts.

  Args:
    X: iterable of texts.
    pct_min: forwarded to CountVectorizer as ``min_df``.
    pct_max: forwarded to CountVectorizer as ``max_df``.
      NOTE(review): scikit-learn reads an integer ``max_df`` as an absolute
      document count, so the default of 1 keeps only terms found in a single
      document — confirm that this is intended (1.0 would mean "no limit").
    excluirSW: forwarded to CountVectorizer as ``stop_words``.

  Returns:
    ``[matriz_sparsa, vocabulario]`` where ``vocabulario`` is the fitted
    CountVectorizer itself.
  '''
  count_vect = CountVectorizer(min_df=pct_min, max_df=pct_max, lowercase=True,stop_words=excluirSW)
  # fit_transform already fits the vectorizer; the former extra fit(X) call
  # repeated the whole pass only to get the same object back.
  matriz_sparsa = count_vect.fit_transform(X)
  vocabulario = count_vect
  return [matriz_sparsa,vocabulario]

def retornaMatriztfIdf( V):
    '''Weight a term-count matrix by TF-IDF.

    ``V`` is passed to a freshly created TfidfTransformer (default settings)
    and the transformed matrix is returned.
    '''
    return TfidfTransformer().fit_transform(V)

def normalizaEreduzDimensionalidadecomPCA( X, nro_dimensao,UT=None):
    '''Project ``X`` onto ``nro_dimensao`` components with TruncatedSVD.

    Despite the name, the reduction uses TruncatedSVD rather than PCA and no
    normalization is performed here.

    Args:
        X: feature matrix to reduce.
        nro_dimensao: number of components; only used when ``UT`` is None.
        UT: optional transformer that was already fitted (typically the first
            element returned by a previous call). When given, it is reused
            as-is so new data lands in the same component space.

    Returns:
        ``[UT, X_pca]``: the fitted transformer and the reduced matrix.
    '''
    if UT is None:
        # One pass only: the original fitted the model and then refitted it
        # through fit_transform.
        UT = TruncatedSVD(nro_dimensao)
        X_pca = UT.fit_transform(X)
    else:
        # A supplied transformer must only transform; refitting it (as the
        # original did) silently replaced the components learned earlier.
        X_pca = UT.transform(X)

    return [UT,X_pca]

def padronizaValores(X):
  '''Return ``X`` after StandardScaler(with_mean=False).fit_transform.'''
  escalador = StandardScaler(with_mean=False)
  return escalador.fit_transform(X)

def retornaPalavras(listaTexto:list):
  '''Concatenate all texts into one lowercase, space-separated string.

  Each item is converted with ``str``, split on whitespace, lowercased and
  re-joined with single spaces; a trailing space follows every item.

  Args:
    listaTexto: list of texts (non-string items are stringified).

  Returns:
    The concatenated words, or an empty string for an empty list.
  '''
  partes = []
  for texto in listaTexto:
    # The append has to happen inside the loop: it used to sit after it, so
    # only the last text was returned and an empty list raised NameError.
    partes.append(" ".join(p.lower() for p in str(texto).split()) + " ")
  return "".join(partes)
def montaWordCloud(words, n_palavras:int):
  '''Generate a 400x200 word cloud on a black background from ``words``.

  At most ``n_palavras`` words are drawn, with a minimum font size of 10.
  The value returned by ``WordCloud.generate`` is handed back to the caller.
  '''
  nuvem = WordCloud(
    width=400,
    height=200,
    background_color='black',
    max_words=n_palavras,
    min_font_size=10,
  )
  return nuvem.generate(words)

def imprimiWordCloud(listaTexto:list, n_palavras:int, cluster:str=None):
  '''Plot the word cloud of a list of texts.

  The texts are merged with retornaPalavras, turned into a cloud of at most
  ``n_palavras`` words by montaWordCloud and shown on an 8x8 figure without
  axes. When ``cluster`` is given it is appended to the figure title.
  '''
  nuvem = montaWordCloud(retornaPalavras(listaTexto), n_palavras)
  plt.figure(figsize=(8, 8), facecolor=None)
  plt.imshow(nuvem)
  plt.axis("off")
  plt.tight_layout(pad=0)
  if cluster is not None:
    plt.title('PALAVRAS PARA O CLUSTER ' + cluster)
  plt.show()

def tokenizaporSentenca(text):
  '''Split a text into sentences and tokenize each one.

  Sentences are delimited by ".". Every character that is neither a word
  character nor whitespace is removed from a sentence before it is passed to
  ``word_tokenize``. Sentences that yield no tokens are skipped.

  Args:
    text: text to split.

  Returns:
    A list with one list of tokens per non-empty sentence.
  '''
  tk = []
  for s in text.split("."):
    # Clean and tokenize once per sentence; the original comprehension did
    # both twice (once for the filter, once for the value).
    palavras = word_tokenize(re.sub(r'[^\w\s]', '',s))
    if len(palavras)>0:
      tk.append(palavras)
  return tk

def tokenizaTexto(text):
  '''Tokenize the whole text at once.

  Characters that are neither word characters nor whitespace are removed
  before the text is handed to ``word_tokenize``.
  '''
  sem_pontuacao = re.sub(r'[^\w\s]', '', text)
  return word_tokenize(sem_pontuacao)


def modelaWord2Vec(tokens_docs, tam_vetor, nro_seed):
  '''Train a Word2Vec model (``sg=1``, ``min_count=1``) on tokenized documents.

  Args:
    tokens_docs: iterable of token lists, one per document/sentence.
    tam_vetor: dimensionality of the word vectors (passed as ``size``).
    nro_seed: seed forwarded to Word2Vec.
      NOTE(review): per the gensim documentation a fully reproducible run
      also needs a single worker and a fixed PYTHONHASHSEED — confirm whether
      that matters here.

  Returns:
    The trained Word2Vec model.
  '''
  # nro_seed used to be accepted but silently ignored.
  model = Word2Vec(tokens_docs, size=tam_vetor,sg=1,min_count=1, seed=nro_seed)
  return model

def top_tfidf_feats(row, terms, top_n=25):
    '''Return the ``top_n`` entries of ``terms`` with the largest values in ``row``.

    ``row`` and ``terms`` are aligned by position; the result is ordered from
    the highest value to the lowest.
    '''
    ordem_decrescente = np.argsort(row)[::-1]
    return [terms[indice] for indice in ordem_decrescente[:top_n]]

def extract_tfidf_keywords(texts, min_df=1, max_df=1.0,top_n=100):
    '''Return, for every text, its ``top_n`` highest-scoring TF-IDF n-grams.

    A TfidfVectorizer over 1- to 3-grams is fitted on ``texts``; each row of
    the resulting matrix is ranked with ``top_tfidf_feats``.

    Args:
        texts: iterable of documents (strings).
        min_df: forwarded to TfidfVectorizer ``min_df``.
        max_df: forwarded to TfidfVectorizer ``max_df``.
        top_n: number of terms kept per document.

    Returns:
        A list with one list of terms per document, best term first.
    '''
    # min_df / max_df used to be accepted but never reached the vectorizer.
    tfidf_vectorizer = TfidfVectorizer(use_idf=True,  ngram_range=(1,3),
                                       min_df=min_df, max_df=max_df)
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    # Newer scikit-learn releases only offer get_feature_names_out(); older
    # ones only get_feature_names(). tolist() keeps the terms as plain str.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        terms = tfidf_vectorizer.get_feature_names_out().tolist()
    else:
        terms = tfidf_vectorizer.get_feature_names()
    arr = []
    for i in range(tfidf_matrix.shape[0]):
        # ravel() always yields a 1-D row, even for a one-term vocabulary
        # (np.squeeze collapsed that case to a 0-d array).
        row = tfidf_matrix[i].toarray().ravel()
        arr.append(top_tfidf_feats(row, terms, top_n))
    return arr


# Prepara dataset para clusterização

In [10]:
# Load news.csv and clean the TEXT_TRATADO column step by step.
dfDados = pd.read_csv('news.csv')
# The NLTK resources must be available before stop-word removal and stemming run.
download_pt_stopWords()

# Applied in this order; each helper receives only the text, so its optional
# parameters keep their defaults.
etapas_limpeza = (
  removeCaracteresNaoDesejados,
  removeStopWords,
  transformaTextoMinuscula,
  aplicaStemming,
)
texto_tratado = dfDados['TEXT_TRATADO'].astype('unicode')
for etapa in etapas_limpeza:
  texto_tratado = texto_tratado.apply(etapa)
dfDados['TEXT_TRATADO'] = texto_tratado
dfDados[['TEXT_TRATADO']].head(3)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,TEXT_TRATADO
0,sext bbb tv glob broth receb com beb faz aquec...
1,"program viv bbb tv globo, leifert convers empa..."
2,"continu dizendo, deu tir daquel cadeira. vai s..."


In [41]:
# Tokenize every document and extract TF-IDF keywords from its tokens.
X = dfDados['TEXT_TRATADO'].values
X_words = []
for i in range(0,X.shape[0],1):
  X_tk = tokenizaTexto(X[i])
  # Documents that yield no tokens are skipped, so X_words can be shorter than X.
  if len(X_tk)>0:
    # NOTE(review): X_tk is a list of tokens, so extract_tfidf_keywords treats
    # every single token as its own document here — confirm this is intended.
    # After the loop X_tfidf holds the result for the last non-empty document
    # only (a plain Python list of lists).
    X_tfidf = extract_tfidf_keywords(X_tk, top_n=25)
    X_words.append(X_tfidf) 

# Disabled earlier experiment: Word2Vec trained directly on X_tk.
#modelW2V = Word2Vec(X_tk, size=50,sg=1,min_count=1)

In [57]:
# Rank the 1- to 3-grams of every document by TF-IDF and keep the best ones;
# `arr` ends up with one list of terms per document.
# top_n was never defined at notebook level, so this cell raised NameError on a
# fresh kernel. 25 matches the default of top_tfidf_feats and the value used in
# the previous cell — NOTE(review): confirm this is the value originally used.
top_n = 25
X = dfDados['TEXT_TRATADO'].values
tfidf_vectorizer = TfidfVectorizer(use_idf=True,  ngram_range=(1,3), tokenizer=tokenizaTexto, )
tfidf_matrix = tfidf_vectorizer.fit_transform(X)
# Newer scikit-learn releases only offer get_feature_names_out(); older ones
# only get_feature_names(). tolist() keeps the terms as plain str.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
arr = []
for i in range(tfidf_matrix.shape[0]):
    # Densify one row at a time; the full matrix would not fit in memory as dense.
    row = tfidf_matrix[i].toarray().ravel()
    feats = top_tfidf_feats(row, terms, top_n)
    arr.append(feats)


In [82]:
# Train Word2Vec (sg=1, 100-dimensional vectors) on `arr`, using each document's
# keyword list as one training sentence; min_count=1 keeps every term.
modelW2V = Word2Vec(arr, size=100,sg=1,min_count=1)

In [73]:
# Sanity check: similarity between two probe words. The corpus was stemmed, so
# the probes are stemmed the same way before the lookup.
stemmer = nltk.stem.RSLPStemmer()
radical_brasil = stemmer.stem('brasil')
radical_futebol = stemmer.stem('futebol')
print(modelW2V.wv.similarity(radical_brasil, radical_futebol))


0.21539998


7152

In [86]:

# NOTE(review): `sys` is not used by any visible cell; the import is kept (with
# its indentation fixed — as exported it was indented under nothing) in case
# something outside this view relies on it.
import sys

# Represent every document by the sum of the Word2Vec vectors of its keywords.
# The width comes from the trained model instead of a hard-coded 100.
dimensao = modelW2V.wv.vector_size
doc2vecs = []
for termos_doc in arr:
    # A real ndarray accumulator: the former list of zeros only worked because
    # `list += ndarray` falls back to NumPy's element-wise addition.
    vec = np.zeros(dimensao)
    for termo in termos_doc:
        # Go through .wv: indexing the model object itself is deprecated.
        if termo in modelW2V.wv:
            vec += modelW2V.wv[termo]
    doc2vecs.append(vec)


In [87]:
# Show only a small sample; dumping every document vector floods the output.
doc2vecs[:3]

[array([-1.24097695e+00,  2.47081409e+00,  6.28564068e-01,  1.29463708e+00,
        -3.92810436e+00,  6.05404561e-01,  5.08700751e-01,  2.31832090e+00,
        -1.39391701e+00, -1.92124397e-01,  1.22503767e+00, -1.81854648e+00,
        -5.80129127e-01, -7.58594903e-01,  3.77729387e+00,  7.91506295e-01,
         5.20315642e-01,  1.35047565e+00, -1.72963234e+00, -1.29029078e+00,
        -1.72253483e+00,  5.06690122e-02, -6.58821111e-01,  1.62994436e+00,
        -3.64693307e-02,  8.73643252e-01,  4.22333492e-01, -2.22082777e+00,
         1.24246635e-01, -9.48051230e-01, -3.20602206e-01, -6.99671840e-02,
         3.67630923e+00,  9.68956485e-01, -1.47805705e+00, -2.59472997e+00,
         1.35445899e+00,  1.19827441e+00,  1.52936725e-03,  3.48883546e+00,
         2.60249264e+00,  1.02952780e+00, -2.83841897e+00, -2.21181015e+00,
         1.27541955e+00,  2.93401682e+00, -2.64529452e-01,  2.97588115e-01,
         4.45235871e-01, -1.07089849e+00, -4.71601876e-01, -1.19138226e+00,
        -7.8

In [None]:
# Grid search: agglomerative clustering for every linkage / metric /
# distance-threshold combination. Results are keyed "<linkage>_<metric>_<threshold>".
linkage = ['ward','complete','single','average']
distancia = ['euclidean', 'manhattan'] #, 'cosine'
d_tsh = [8,10,12]
dictResultado = {}
dictModel = {}

# X_tfidf is a plain Python list at this point (it has no .toarray()), so the
# original call could not run. The document embeddings built above are the
# dense per-document features available in this notebook.
# NOTE(review): confirm doc2vecs is the intended clustering input.
X_cluster = np.asarray(doc2vecs)

for ligacao in linkage:
  for metrica in distancia:
    # scikit-learn accepts only the Euclidean metric with Ward linkage.
    if ligacao=='ward' and metrica!='euclidean':
      continue
    for limiar in d_tsh:
      chave = ligacao + '_' + metrica + '_' + str(limiar)
      # NOTE(review): recent scikit-learn releases rename `affinity` to
      # `metric`; adjust when upgrading.
      model = AgglomerativeClustering(linkage=ligacao, affinity=metrica, distance_threshold=limiar, n_clusters=None)
      model = model.fit(X_cluster)
      dictResultado[chave] = model.n_clusters_
      dictModel[chave] = model


In [None]:
# Number of clusters found for each linkage_metric_threshold combination.
dictResultado

In [None]:
# Inspect the Ward / Euclidean model fitted with distance threshold 12.
model = dictModel['ward_euclidean_12']
print('nro cluster:',model.n_clusters_)
# Separate name: `distancia` already holds the list of metric names used by the
# grid-search cell and should not be rebound to an array.
distancias_fusao = model.distances_
print('distância minima:', distancias_fusao.min(), '\n distância máxima:', distancias_fusao.max())

In [None]:
# Label every document with its cluster and show the cluster sizes.
# X_tfidf is a plain Python list (no .toarray()), so the original call could
# not run; the document embeddings have exactly one row per row of dfDados.
# NOTE(review): confirm doc2vecs is the intended clustering input.
y_pred = model.fit_predict(np.asarray(doc2vecs))
dfDados['y_pred'] = y_pred
dfDados.groupby(['y_pred'])['y_pred'].count()

In [None]:
# For every cluster, list the ten categories with the most URLs.
for y in np.sort(dfDados['y_pred'].unique()):
  print('Top 10 categorias de y_pred = ', y)
  noticias_cluster = dfDados[dfDados['y_pred']==y]
  urls_por_categoria = noticias_cluster.groupby('CATEGORIA')['URL'].count().reset_index()
  ranking = urls_por_categoria.sort_values(by='URL',ascending=False)
  print(ranking[0:10])


In [None]:
# One word cloud (at most 100 words) per cluster, titled with the cluster id.
for y in np.sort(dfDados['y_pred'].unique()):
  pertence_ao_cluster = dfDados['y_pred']==y
  dftmp = dfDados.loc[pertence_ao_cluster, ['TEXT_TRATADO']]
  imprimiWordCloud(dftmp['TEXT_TRATADO'].tolist(),100,str(y))

In [None]:
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster import hierarchy

# Plot the tree that `model` actually built. model.children_ only lists which
# two nodes were merged at each step; running hierarchy.linkage on it (as
# before) clustered those index pairs instead of the documents. SciPy needs one
# row per merge: [child_a, child_b, merge distance, number of leaves below].
n_amostras = len(model.labels_)
contagens = np.zeros(model.children_.shape[0])
for passo, filhos in enumerate(model.children_):
    total = 0
    for filho in filhos:
        # Indices below n_amostras are original documents (one leaf each);
        # larger ones point to the node created by an earlier merge.
        total += 1 if filho < n_amostras else contagens[filho - n_amostras]
    contagens[passo] = total

# distances_ is available because the model was fitted with distance_threshold set.
Z = np.column_stack([model.children_, model.distances_, contagens]).astype(float)
plt.figure(figsize=(20,10))
dn = hierarchy.dendrogram(Z)