In [1]:
import pandas as pd
import numpy as np
import bs4, requests, re

import nltk
from unicodedata import normalize
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import  cosine_similarity


In [2]:
# Location of the news CSV (hosted in the course repository on GitHub).
URL_NOTICIAS = 'https://raw.githubusercontent.com/rafjaa/curso-mineracao-de-dados-aplicada/master/data/df_noticias_if.csv'

# Raw dataset, loaded straight from the URL above.
df = pd.read_csv(URL_NOTICIAS)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,data,titulo,conteudo
0,14/09/2017,Campus Barbacena divulga Resultado Provisório ...,\n\n\tO Campus Barbacena divulgou o Resultado ...
1,14/09/2017,Divulgado o Edital de convocação de assembleia...,\n\n\tDivulgado o Edital de convocação de asse...
2,14/09/2017,Pesquisador da Bélgica realiza palestra no Cam...,"\n\n\tO pesquisador da Bélgica, Luc Vankrunkel..."
3,14/09/2017,Divulgada a homologação das inscrições à candi...,\n\n\tDivulgada a homologação das inscrições à...
4,14/09/2017,"Aprovado Regulamento de Eventos, Cerimonial e ...","\n\n\tO Regulamento, aprovado no dia 05 de set..."


In [4]:
# Concatenate title and body into a single document per row.
# Column-wise concatenation avoids one Python-level call per row. Missing
# values are treated as empty strings so that an absent field neither raises
# (str + NaN) nor turns the whole document into NaN, and the separating space
# keeps the last word of the title from fusing with the first word of the body.
df['doc'] = df['titulo'].fillna('') + ' ' + df['conteudo'].fillna('')

In [5]:
# RSLP stemmer instance used by the text-processing function in this cell.
stemmer = nltk.stem.RSLPStemmer()

# Portuguese stopword list with accents stripped (NFKD decomposition, then
# non-ASCII code points dropped), so it can be compared against text that
# went through the same accent removal.
_stopwords_brutas = ' '.join(nltk.corpus.stopwords.words('portuguese'))
_stopwords_ascii = normalize('NFKD', _stopwords_brutas).encode('ASCII', 'ignore').decode()
stopwords = _stopwords_ascii.split()

# Matches any token that contains a digit or an underscore.
_RE_DIGITO_OU_SUBLINHADO = re.compile(r'[\d_]')


def _so_pontuacao(token):
    """Return True when every character of ``token`` is ASCII punctuation."""
    return all(caractere in punctuation for caractere in token)


def processar_texto(texto):
    """Normalize a raw document into a space-separated string of stems.

    Steps, in order: lowercase the text and drop the 'leia o documento'
    boilerplate line, strip accents (NFKD + ASCII), tokenize, discard
    punctuation tokens, stopwords and tokens containing a digit or an
    underscore, then stem what is left.

    Args:
        texto: The document text. ``None``, an empty string, or any
            non-string value (for example the float NaN pandas uses for
            missing cells) yields an empty result instead of raising.

    Returns:
        str: The surviving stems joined by single spaces ('' when there is
        nothing to process).
    """
    if not isinstance(texto, str) or not texto:
        return ''

    # Lowercase and remove the boilerplate term.
    texto = texto.lower().replace('\nleia o documento\n', '')

    # Remove accents.
    texto = normalize('NFKD', texto).encode('ASCII', 'ignore').decode()

    # Set membership instead of scanning the stopword list for every token.
    palavras_ignoradas = set(stopwords)

    # A token is dropped when it consists only of punctuation characters; the
    # previous substring test against `punctuation` let multi-character tokens
    # such as '...' through.
    # NOTE(review): word_tokenize is called with its default language; consider
    # passing language='portuguese' -- confirm the effect on the corpus first.
    radicais = [
        stemmer.stem(token)
        for token in nltk.word_tokenize(texto)
        if not _so_pontuacao(token)
        and token not in palavras_ignoradas
        and not _RE_DIGITO_OU_SUBLINHADO.search(token)
    ]

    return ' '.join(radicais)


In [6]:
# Replace every raw document with its processed (cleaned and stemmed) form.
# NOTE(review): 'doc' is overwritten in place, so running this cell a second
# time feeds already-processed text back through processar_texto.
df['doc'] = df['doc'].map(processar_texto)

In [7]:
# TF-IDF settings: word-level unigrams, no cap on vocabulary size,
# non-binary term counts, IDF weighting enabled.
opcoes_tfidf = {
    'analyzer': 'word',
    'ngram_range': (1, 1),
    'max_features': None,
    'binary': False,
    'use_idf': True,
}
vectorizer = TfidfVectorizer(**opcoes_tfidf)

# Fit the vocabulary on the processed documents and vectorize them in one pass.
tfidf_matrix = vectorizer.fit_transform(df['doc'])

In [8]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
_obter_termos = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

# list(...) keeps the printed form a plain Python list whichever API is used.
print(list(_obter_termos()[:100]))

['aba', 'abaix', 'abaixo', 'abanc', 'abandon', 'abdin', 'abert', 'abior', 'abn', 'abord', 'abp', 'abr', 'abrac', 'abramal', 'abrang', 'abrangenc', 'abranj', 'abrig', 'abril', 'abrilhant', 'absorv', 'abstrat', 'aca', 'acab', 'academ', 'academc', 'academic', 'academico', 'academicos', 'acamp', 'acas', 'aceit', 'aceitaca', 'acejf', 'acentu', 'acerc', 'acert', 'acerv', 'acess', 'acessi', 'acessibil', 'acessori', 'ach', 'acid', 'acido', 'acim', 'acion', 'aco', 'acolh', 'acolhe', 'acomnpanh', 'acompanh', 'aconselh', 'acontec', 'acontecera', 'aconteceu', 'acord', 'acostum', 'acr', 'acredit', 'acrescent', 'acumul', 'acus', 'ad', 'adalgis', 'adapt', 'adaptaca', 'adaptaco', 'adaptar', 'adelaid', 'adelin', 'adem', 'adend', 'adens', 'adentr', 'adequ', 'adequaca', 'ader', 'ades', 'adesa', 'adi', 'adiant', 'adic', 'adicion', 'adjunt', 'adm', 'administr', 'administraca', 'administracao', 'adminstraca', 'admiraca', 'admisnistr', 'admistr', 'adoca', 'adolesc', 'adolescenc', 'adona', 'ador', 'adormec', 

In [9]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
_obter_termos = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

# Dense document-term table: one row per document, one column per vocabulary term.
# toarray() yields a regular ndarray, whereas todense() returns numpy.matrix.
words = pd.DataFrame(tfidf_matrix.toarray(), columns=_obter_termos())

In [10]:
# Inspect the last five rows of the document-term table.
words.tail(n=5)

Unnamed: 0,aba,abaix,abaixo,abanc,abandon,abdin,abert,abior,abn,abord,...,zik,zika,zikazer,zikazero,zohopublic,zon,zoolog,zoonos,zootecn,zoz
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
# Free-text query; the following cells process and vectorize it and compare it
# against the corpus.
texto_teste = 'Divulgado o edital de iniciação científica'

In [109]:
# Run the query through the same preprocessing that was applied to the corpus.
# NOTE(review): texto_teste is rebound to a different type in each of the next
# cells (str -> DataFrame -> sparse matrix -> DataFrame), so this sequence only
# works when each cell is run once, in order.
texto_teste = processar_texto(texto_teste)

In [110]:
# Wrap the processed query in a one-row frame with a single 'doc' column.
texto_teste = pd.DataFrame([texto_teste], columns=['doc'])

In [111]:
# Project the query onto the vocabulary already fitted on the corpus
# (transform only -- the vectorizer is not refitted).
consulta_docs = texto_teste['doc']
texto_teste = vectorizer.transform(consulta_docs)

In [112]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
_obter_termos = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

# Dense one-row frame for the query, with one column per vocabulary term.
# toarray() yields a regular ndarray, whereas todense() returns numpy.matrix.
texto_teste = pd.DataFrame(texto_teste.toarray(), columns=_obter_termos())

In [113]:
# Stack the query row underneath the corpus rows, renumbering the index so the
# query ends up as the last row.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent.
words_sim = pd.concat([words, texto_teste], ignore_index=True)

In [114]:
# Pairwise cosine similarity between all rows of words_sim (Y=None means the
# rows are compared against themselves).
sim = cosine_similarity(X=words_sim, Y=None)

In [117]:
# How many of the most similar documents to show.
N_SIMILARES = 5

# The last row of `sim` is the query's similarity to every row of words_sim;
# its final entry is the query compared with itself. Drop that entry
# explicitly rather than assuming it sorts first: on a tie, or for a query
# whose vector is all zeros, the old `[1:6]` slice could discard a real match
# and leave the query's own position (which has no row in df) in the lookup.
sim_consulta = pd.Series(sim[-1][:-1])
mais_similares = sim_consulta.nlargest(N_SIMILARES).index

# Positional lookup: row i of the document-term table was built from the i-th
# row of df.
df.iloc[mais_similares]

Unnamed: 0,data,titulo,conteudo,doc
699,05/07/2016,Divulgado o Resultado Provisório dos Projetos ...,\n\n\tDivulgado o Resultado Provisório dos Pro...,divulg result provisori projet iniciaca cienti...
946,15/02/2016,Publicada retificação do resultado final do Ca...,\n\n\tPublicada retificação do resultado final...,public retificaca result final campu barbacen ...
997,15/12/2015,Campus Barbacena divulga os resultados provisó...,\n\n\tCampus Barbacena divulga os resultados p...,campu barbacen divulg result provisori projet ...
939,17/02/2016,Oportunidade de bolsa de iniciação científica ...,\n\n\tFoi lançado o edital para seleção de um ...,oportun bols iniciaca cientif alun curs licenc...
660,08/08/2016,Estão abertas as inscrições para o VII Simpósi...,\n\n\tEstão abertas as inscrições para o VII S...,abert inscrico vii simposi pesquis inovaca vi ...
