In [1]:
# Bibliotecas utilizadas para a implementação do TextRank

import numpy as np
import pandas as pd
import nltk 
import re

#from nltk.tokenize import sent_tokenize # divisão de texto em sents

from nltk.corpus import stopwords

import networkx as nx


from sklearn.metrics.pairwise import cosine_similarity


#from rouge import Rouge

In [2]:
# definintions 

# Os arquivos GloVe são "word embeddings" já processados. Ou seja, 
# são um conjunto de vetores de palavras já criado que
# faremos uso para a vetorização das nossas sentenças e palavras


glove = open('glove.6B/glove.6B.50d.txt',"r", encoding = 'utf-8') 
glove50 = open('glove.6B/glove.6B.50d.txt',"r", encoding = 'utf-8')
glove100 = open('glove.6B/glove.6B.100d.txt',"r", encoding = 'utf-8')
glove200 = open('glove.6B/glove.6B.200d.txt',"r", encoding = 'utf-8')
glove300 = open('glove.6B/glove.6B.300d.txt',"r", encoding = 'utf-8')

d = 50 # Escolha a dimensão dos vetores com base na dimensão do glove escolhido (50, 100, 200 ou 300)


# Documentos para análise, Dataset CNN daily mail tokenized

text = 'cnn_stories_tokenized/0a1e9a3bd86791e6c8ed3943eb36daf676c87f39.story'
text2 = 'cnn_stories_tokenized/0a0a4c90d59df9e36ffec4ba306b4f20f3ba4acb.story'
text3 = 'cnn_stories_tokenized/0a0aa464d262b903f44b0f8eaa67f13dd1946cfd.story'
text4 = 'cnn_stories_tokenized/0a0adc84ccbf9414613e145a3795dccc4828ddd4.story'


# Palavras sem valor agregado para o sentido dos textos em inglês
STOPWORDS = set(stopwords.words('english'))


In [5]:
# preprocess do prof usado para remover stop words
# como o dataset já está parcialmente preprocessado, não há necessidade de muito mais que isso

def preprocess(sent):
    
    def convert(word):
        # Verifica se é um número.
        try:
            _ = float(word)
            return '<num>'
        except:
            pass
        
        # Verifica se é uma palavra.
        if word.isalpha():
            lower = word.lower()
            return '<stop>' if lower in STOPWORDS else lower
        
        # Caso contrário, é pontuação ou estranho.
        return '<weird>'
    
    processed = [convert(word) for word in sent]
    forbidden_words = set(('<num>', '<stop>', '<weird>'))
    return [word for word in processed if word not in forbidden_words]

In [27]:
# Gerador de sentenças e palavras com base no documento recebido

def read_text(doc):
    
    file = open(doc, "r") 
    lines = file.readlines()
    
    file.close()
    sentences = [preprocess(item.strip().split()) # Preprocess para remover stopwords 
             for item in lines] #lines
    
    sentences = [item for item in sentences if item] # remoção de sentenças vazias "[]"
    
    words = []
    for line in sentences:
        for word in line: 
            words.append(word)
    words = set(words) # para impedir que uma palavra se repita
    
    return sentences, words

In [7]:
# Vetorização de sentenças ou palavras dependendo do que for inserido na função
# Vetorização feita com "word embeddings"

def vectorize(glove, sentences, d):
    
    word_embeddings ={}
    
    for line in glove: # Wordvectors predetermined with chosen GloVe from Manning, Socher and Pennington
    
        values = line.split()
    
        word = values[0]
    
        coefs = np.asarray(values[1:], dtype = 'float32')
    
        word_embeddings[word] = coefs
    
    #glove.close() # Descomentar isso para fechar os gloves abertos
    
    
    sentence_vectors = []

    for sents in sentences: 
    
        if len(sents) != 0: # aqui pode-se usar != ou > 
        
            vector = sum([word_embeddings.get(w, np.zeros((d,))) # vetores de cada palavra contida na sentença
                     for w in sents])/(len(sents)+0.001) # Média dos vetores com relação ao tamanho da sentença
        
        else:
        
            vector = np.zeros((d,)) 
        
        sentence_vectors.append(vector) 
            
    
    return sentence_vectors # Vetores ou de Sentenças ou Palavras dependendo do que for fornecido na função
    
    

In [8]:
# Construção da matriz de similaridade , tanto para sentenças como para palavras 

def similarity_matrix(sentences, sentence_vectors, d):

    
    sim_matrix = np.zeros([len(sentences), len(sentences)]) # matriz zerada
    
    for i in range(len(sentences)):
    
        for j in range(len(sentences)): 
        
            if i != j: 
            
                sim_matrix[i][j] = cosine_similarity(    # Distância cosseno
                    sentence_vectors[i].reshape(1,d), # d = dimensão escolhida dos vetores
                        sentence_vectors[j].reshape(1,d))[0,0]

    return sim_matrix

In [9]:
# PageRank 
# Cálcula os pesos que ranqueiam a importância de cada sentença/palavra. 
# Recebe como input a matriz de similaridade para realizar a computação dos pesos para ranquear. 

def PageRank(sim_matrix):
    
    nx_graph = nx.from_numpy_array(sim_matrix)

    scores = nx.pagerank(nx_graph)
    
    return scores


In [10]:
# Ordena as sentenças de acordo com a ordem de apresentação no texto original

def organize_sents(ranked_sentences, sentences, top_n):
    
    disorganized_summary = []
    
    for i in range(len(ranked_sentences)): # Pega todoas as sentenças ranqueadas
        disorganized_summary.append(ranked_sentences[i][1]) # Seleciona na ordem de maior "score"
        
    disorganized_summary = disorganized_summary[:top_n]  # top_n é a quantidade de sentenças desejada
    
    organized_summary = []
    
    for i in sentences:
        for j in disorganized_summary:
            
            if i == j:
                organized_summary.append(j) # adiciona as top_n sentenças na ordem de apresentação
            else:
                continue 
                
    summary = organized_summary # meramente simbólico
    
    return summary
                
    


In [11]:
# Ordena as palavras de acordo com a ordem de apresentação no texto original


def organize_words(ranked_words, words, top_n): 
    
    disorganized_words = []
    
    for i in range(top_n): # top_n é a quantidade de palavras-chave desejada
        disorganized_words.append(ranked_words[i][1])
    
    
    organized_words = []
    
    for i in words:
        for j in disorganized_words:
            
            if i == j:
                organized_words.append(j) # adiciona as top_n palavras na ordem de apresentação
            else:
                continue 
                
    keywords = organized_words  # meramente simbólico
    
    return keywords
                
    

In [13]:
# Seleciona os Highlights = Sumário oficial.

def highlights(doc):
    
    file = open(doc, "r") 
    lines = file.readlines()
    file.close()
    
    sentences = [item.strip().split() 
             for item in lines] 
    
    sentences = [item for item in sentences if item]
    
    
    queu = '@highlight'
    
    highlights = []
    
    
    for i in range(len(sentences)):
        
        if sentences[i][0] == queu:
            
            highlights.append(sentences[i+1]) # seleciona a frase sequinte do "@highlight"
            
        else:
            continue
                
    #print("highlights:")
    #print(highlights)
    
    return highlights
    

    
    
    
    
    

In [43]:
# Gerador de sumário com palavras-chaves


def summarize(doc, glove, d, top_n = 10):
    
    # Ler doc e gerar sentenças / palavras
    
    sentences = read_text(doc)[0]
    words = read_text(doc)[1]
    
    # Vetorização de sentenças / palavras 
    
    sentence_vectors = vectorize(glove, sentences, d)
    word_vectors = vectorize(glove, words , d)
    
    # Matriz de similaridade
    
    sent_sim_matrix = similarity_matrix(sentences, sentence_vectors, d)
    word_sim_matrix = similarity_matrix(words, word_vectors, d)
    
    # PageRanking 
    
    sent_scores = PageRank(sent_sim_matrix)
    word_scores = PageRank(word_sim_matrix)
    
    # Ranqueamento

    ranked_sentences = sorted(((sent_scores[i],s) for i,s in enumerate(sentences))
                         , reverse = True)
    ranked_words = sorted(((word_scores[i],s) for i,s in enumerate(words))
                        , reverse = True)
    
    # Extração do sumário e das palavras-chaves  
    
    summary = organize_sents(ranked_sentences, sentences, top_n)
    
    keywords = organize_words(ranked_words, words, top_n)
    
    # Sumário oficial 
    
    highlighted = highlights(doc)
    
    print("keywords")
    print(keywords)
    print()
    print("highlights")
    print(highlighted)
    print()
    print("summary")

    for i in range(len(summary)):
        print(summary[i])
        print("-"*70)
     
    
    return summary, keywords


In [44]:
# Exemplo de extração de sumário 

print("glove50")
d = 50
summary = summarize(text, glove50, d, 10) #decente  
print()

glove50
keywords
['yet', 'york', 'zheng', 'would', 'zone', 'zhangs', 'yaghmaian', 'zhengzhou', 'year', 'years']

highlights
[['Zhengzhou', 'is', 'home', 'to', 'what', 'was', 'once', 'called', 'China', "'s", 'largest', 'ghost', 'city'], ['Visitors', 'are', 'now', 'slowing', 'coming', 'back', ',', 'though', 'many', 'ca', "n't", 'afford', 'to', 'live', 'there'], ['Tighter', 'government', 'controls', 'on', 'housing', 'market', 'has', 'failed', 'to', 'curb', 'price', 'climbs'], ['Government', 'is', 'building', 'millions', 'of', 'low-cost', 'housing', 'to', 'meet', 'demand']]

summary
['zhengzhou', 'capital', 'central', 'henan', 'province', 'approach', 'mixed', 'results', 'years', 'back', 'new', 'development', 'zone', 'adjunct', 'main', 'city', 'labeled', 'china', 'largest', 'ghost', 'city', 'rows', 'rows', 'luxury', 'apartments', 'office', 'buildings', 'sat', 'empty', 'vast', 'deserted', 'boulevards']
----------------------------------------------------------------------
['went', 'li', 'cai

In [45]:
# Comparando o Texto 2 com diferentes gloves
# glove50
print("glove50")
d = 50
summarize(text2, glove50, d, 10) #decente 
print()

#glove100
print('glove100')
d = 100
summarize(text2, glove100, d, 10) #decente 
print()

#glove200
print('glove200')
d = 200
summarize(text2, glove100, d, 10) #decente 
print()

#glove300
print('glove300')
d = 300
summarize(text2, glove100, d, 10) #decente 
print()


glove50
keywords
['watch', 'within', 'wrong', 'zero', 'way', 'went', 'website', 'whether', 'week', 'watching']

highlights
[['Dean', 'Obeidallah', ':', 'A', 'movie', 'or', 'TV', 'show', 'can', 'educate', 'or', '-LRB-', 'mis', '-RRB-', 'educate', 'you'], ['Obeidallah', ':', 'Two', 'new', 'films', 'about', 'hot', 'issues', 'are', 'firing', 'up', 'both', 'the', 'left', 'and', 'right'], ['Senators', 'slammed', '``', 'Zero', 'Dark', 'Thirty', ',', "''", 'and', 'energy', 'industry', 'attacked', '``', 'Promised', 'Land', "''"], ['Obeidallah', ':', 'What', 'does', 'Hollywood', 'want', '?', 'To', 'make', 'money', ',', 'of', 'course']]

summary
['scoff', 'notion', 'movies', 'anything', 'entertain', 'wrong', 'sure', 'unlikely', 'one', 'movie', 'alone', 'change', 'views', 'issues', 'magnitude', 'movie', 'tv', 'show', 'begin', 'education', 'miseducation', 'topic', 'already', 'agreeing', 'film', 'thesis', 'entrench', 'views']
----------------------------------------------------------------------
['p

keywords
['watch', 'within', 'wrong', 'zero', 'way', 'went', 'website', 'whether', 'week', 'watching']

highlights
[['Dean', 'Obeidallah', ':', 'A', 'movie', 'or', 'TV', 'show', 'can', 'educate', 'or', '-LRB-', 'mis', '-RRB-', 'educate', 'you'], ['Obeidallah', ':', 'Two', 'new', 'films', 'about', 'hot', 'issues', 'are', 'firing', 'up', 'both', 'the', 'left', 'and', 'right'], ['Senators', 'slammed', '``', 'Zero', 'Dark', 'Thirty', ',', "''", 'and', 'energy', 'industry', 'attacked', '``', 'Promised', 'Land', "''"], ['Obeidallah', ':', 'What', 'does', 'Hollywood', 'want', '?', 'To', 'make', 'money', ',', 'of', 'course']]

summary
['scoff', 'notion', 'movies', 'anything', 'entertain', 'wrong', 'sure', 'unlikely', 'one', 'movie', 'alone', 'change', 'views', 'issues', 'magnitude', 'movie', 'tv', 'show', 'begin', 'education', 'miseducation', 'topic', 'already', 'agreeing', 'film', 'thesis', 'entrench', 'views']
----------------------------------------------------------------------
['peter', '

In [26]:
# Extração do Sumário

# Comparando os resultados do Texto 1 com diferentes gloves
# glove50
print("glove50")
d = 50
summarize(text, glove50, d, 10) #decente 
print()

#glove100
print('glove100')
d = 100
summarize(text, glove100, d, 10) #decente 
print()

#glove200
print('glove200')
d = 200
summarize(text, glove100, d, 10) #decente 
print()

#glove300
print('glove300')
d = 300
summarize(text, glove100, d, 10) #decente 
print()


glove50
keywords
['worked', 'yet', 'would', 'work', 'world', 'zone', 'workers', 'you', 'year', 'years']

highlights
[['Zhengzhou', 'is', 'home', 'to', 'what', 'was', 'once', 'called', 'China', "'s", 'largest', 'ghost', 'city'], ['Visitors', 'are', 'now', 'slowing', 'coming', 'back', ',', 'though', 'many', 'ca', "n't", 'afford', 'to', 'live', 'there'], ['Tighter', 'government', 'controls', 'on', 'housing', 'market', 'has', 'failed', 'to', 'curb', 'price', 'climbs'], ['Government', 'is', 'building', 'millions', 'of', 'low-cost', 'housing', 'to', 'meet', 'demand']]

summary
['``', 'Currently', 'China', "'s", 'urban', 'population', 'is', 'a', 'little', 'over', '700', 'million', 'people', ',', "''", 'said', 'Tom', 'Miller', ',', 'author', 'of', '``', 'China', "'s", 'Urban', 'Billion', '.', "''"]
----------------------------------------------------------------------
['``', 'By', '2030', ',', 'we', 'would', 'expect', 'it', 'to', 'be', 'one', 'billion', '.', 'One', 'in', 'eight', 'people', 'on

keywords
['worked', 'yet', 'would', 'work', 'world', 'zone', 'workers', 'you', 'year', 'years']

highlights
[['Zhengzhou', 'is', 'home', 'to', 'what', 'was', 'once', 'called', 'China', "'s", 'largest', 'ghost', 'city'], ['Visitors', 'are', 'now', 'slowing', 'coming', 'back', ',', 'though', 'many', 'ca', "n't", 'afford', 'to', 'live', 'there'], ['Tighter', 'government', 'controls', 'on', 'housing', 'market', 'has', 'failed', 'to', 'curb', 'price', 'climbs'], ['Government', 'is', 'building', 'millions', 'of', 'low-cost', 'housing', 'to', 'meet', 'demand']]

summary
['``', 'Currently', 'China', "'s", 'urban', 'population', 'is', 'a', 'little', 'over', '700', 'million', 'people', ',', "''", 'said', 'Tom', 'Miller', ',', 'author', 'of', '``', 'China', "'s", 'Urban', 'Billion', '.', "''"]
----------------------------------------------------------------------
['``', 'By', '2030', ',', 'we', 'would', 'expect', 'it', 'to', 'be', 'one', 'billion', '.', 'One', 'in', 'eight', 'people', 'on', 'Eart

In [20]:
# Comparando o Texto 3 com diferentes gloves
# glove50
print("glove50")
d = 50
summarize(text3, glove50, d, 10) #decente 
print()

#glove100
print('glove100')
d = 100
summarize(text3, glove100, d, 10) #decente 
print()

#glove200
print('glove200')
d = 200
summarize(text3, glove100, d, 10) #decente 
print()

#glove300
print('glove300')
d = 300
summarize(text3, glove100, d, 10) #decente 
print()


glove50
['worked', 'winner', 'wild', 'worth', 'would', 'written', 'world', 'without', 'year', 'years']
[['Math', 'geeks', 'and', 'others', 'celebrate', 'Pi', 'Day', 'every', 'March', '14'], ['Pi', ',', 'or', 'roughly', '3.14', ',', 'is', 'the', 'ratio', 'of', 'circumference', 'to', 'diameter', 'of', 'a', 'circle'], ['The', 'Pi', 'Day', 'holiday', 'idea', 'started', 'at', 'the', 'Exploratorium', 'museum', 'in', 'San', 'Francisco'], ['Albert', 'Einstein', 'was', 'also', 'born', 'on', 'March', '14']]

['quick', 'refresher', 'pi', 'defined', 'distance', 'around', 'perfect', 'circle', 'circumference', 'divided', 'distance', 'across', 'diameter', 'also', 'involved', 'calculating', 'area', 'circle', 'volume', 'sphere', 'many', 'mathematical', 'formulas', 'might', 'need', 'sciences']
----------------------------------------------------------------------
['throughout', 'history', 'people', 'captivated', 'number', 'way', 'calculate', 'exactly', 'simple', 'division', 'calculator', 'digits', 'go',

['worked', 'winner', 'wild', 'worth', 'would', 'written', 'world', 'without', 'year', 'years']
[['Math', 'geeks', 'and', 'others', 'celebrate', 'Pi', 'Day', 'every', 'March', '14'], ['Pi', ',', 'or', 'roughly', '3.14', ',', 'is', 'the', 'ratio', 'of', 'circumference', 'to', 'diameter', 'of', 'a', 'circle'], ['The', 'Pi', 'Day', 'holiday', 'idea', 'started', 'at', 'the', 'Exploratorium', 'museum', 'in', 'San', 'Francisco'], ['Albert', 'Einstein', 'was', 'also', 'born', 'on', 'March', '14']]

['quick', 'refresher', 'pi', 'defined', 'distance', 'around', 'perfect', 'circle', 'circumference', 'divided', 'distance', 'across', 'diameter', 'also', 'involved', 'calculating', 'area', 'circle', 'volume', 'sphere', 'many', 'mathematical', 'formulas', 'might', 'need', 'sciences']
----------------------------------------------------------------------
['throughout', 'history', 'people', 'captivated', 'number', 'way', 'calculate', 'exactly', 'simple', 'division', 'calculator', 'digits', 'go', 'infini

In [21]:
# Comparando o Texto 4 com diferentes gloves
# glove50
print("glove50")
d = 50
summarize(text4, glove50, d, 10) #decente 
print()

#glove100
print('glove100')
d = 100
summarize(text4, glove100, d, 10) #decente 
print()

#glove200
print('glove200')
d = 200
summarize(text4, glove100, d, 10) #decente 
print()

#glove300
print('glove300')
d = 300
summarize(text4, glove100, d, 10) #decente 
print()


glove50
['wo', 'whose', 'within', 'want', 'weeks', 'way', 'went', 'would', 'windowless', 'wang']
[['MH370', 'families', 'hold', 'sit-in', 'outside', 'the', 'Malaysian', 'Embassy', 'in', 'Beijing'], ['Relatives', 'marched', 'from', 'their', 'hotel', 'after', 'request', 'to', 'meet', 'Malaysian', 'ambassador', 'failed'], ['More', 'than', 'once', 'in', 'recent', 'weeks', 'Malaysian', 'authorities', 'have', 'not', 'shown', 'up', 'for', 'talks', 'with', 'relatives'], ['NEW', ':', 'China', 'appeals', 'to', 'protesters', 'to', 'express', 'concerns', 'in', '``', 'legal', 'and', 'rational', 'way', "''"]]

['unsuccessfully', 'demanding', 'meeting', 'malaysian', 'ambassador', 'hotel', 'eight', 'hours', 'angry', 'relatives', 'pushed', 'past', 'police', 'officers', 'tried', 'stop', 'making', 'midnight', 'march', 'across', 'chinese', 'capital', 'destination', 'malaysian', 'embassy']
----------------------------------------------------------------------
['surreal', 'procession', 'followed', 'closely'

['wo', 'whose', 'within', 'want', 'weeks', 'way', 'went', 'would', 'windowless', 'wang']
[['MH370', 'families', 'hold', 'sit-in', 'outside', 'the', 'Malaysian', 'Embassy', 'in', 'Beijing'], ['Relatives', 'marched', 'from', 'their', 'hotel', 'after', 'request', 'to', 'meet', 'Malaysian', 'ambassador', 'failed'], ['More', 'than', 'once', 'in', 'recent', 'weeks', 'Malaysian', 'authorities', 'have', 'not', 'shown', 'up', 'for', 'talks', 'with', 'relatives'], ['NEW', ':', 'China', 'appeals', 'to', 'protesters', 'to', 'express', 'concerns', 'in', '``', 'legal', 'and', 'rational', 'way', "''"]]

['unsuccessfully', 'demanding', 'meeting', 'malaysian', 'ambassador', 'hotel', 'eight', 'hours', 'angry', 'relatives', 'pushed', 'past', 'police', 'officers', 'tried', 'stop', 'making', 'midnight', 'march', 'across', 'chinese', 'capital', 'destination', 'malaysian', 'embassy']
----------------------------------------------------------------------
['surreal', 'procession', 'followed', 'closely', 'polic

In [169]:
# Testando rouge 
d = 50
rouge = Rouge()

hypothesis = summarize(text, glove50, d, 10) #decente 

reference = highlights(text)



['zheng', 'would', 'zhangs', 'zone', 'yet', 'york', 'year', 'zhengzhou', 'years', 'yaghmaian']
['zhengzhou', 'capital', 'central', 'henan', 'province', 'approach', 'mixed', 'results', 'years', 'back', 'new', 'development', 'zone', 'adjunct', 'main', 'city', 'labeled', 'china', 'largest', 'ghost', 'city', 'rows', 'rows', 'luxury', 'apartments', 'office', 'buildings', 'sat', 'empty', 'vast', 'deserted', 'boulevards']
----------------------------------------------------------------------
['went', 'li', 'cai', 'juan', 'friends', 'several', 'years', 'ago', 'office', 'job', 'construction', 'company', 'built', 'zhengzhou', 'rows', 'towers']
----------------------------------------------------------------------
['would', 'expect', 'one', 'billion', 'one', 'eight', 'people', 'earth', 'live', 'chinese', 'city', 'still', 'need', 'lot', 'building']
----------------------------------------------------------------------
['zhengzhou', 'population', 'boom']
--------------------------------------------

In [166]:
"""
overall_scores = []

for i in range(len(hypothesis[0])):
    
    for j in range(len(reference)):
        
        scores = rouge.get_scores(hypothesis[0][i], reference[j])
    
        overall_scores.append(scores)"""

'\noverall_scores = []\n\nfor i in range(len(hypothesis[0])):\n    \n    for j in range(len(reference)):\n        \n        scores = rouge.get_scores(hypothesis[0][i], reference[j])\n    \n        overall_scores.append(scores)'

pip install easy-rouge

ModuleNotFoundError: No module named 'easyrouge'

In [None]:
overall_scores = []

for i in range(len(hypothesis[0])):
    
    for j in range(len(reference)):
        
        scores = rouge.get_scores(hypothesis[0][i], reference[j])
    
        overall_scores.append(scores)