In [None]:
import yaml
import re
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
import string
import gensim
from gensim import corpora
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import requests
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import operator
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Tópicos
## LDA

Link para baixar os dados: https://drive.google.com/drive/folders/1q7BQG_OdXkBEM_qE9BZ5eZu_EJzhOg1q?usp=drive_link

In [None]:
# Colab-only: mount Google Drive so the chat corpus can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Folder that holds the YAML chat files (see the download link above).
mypath = "/content/drive/MyDrive/aulas/Processamento de Linguagem Natural - Saude/Modelos de NLP/chat/"

# Keep only regular files whose name ends in ".yml" (a substring test would also
# match names such as "yml_notes.txt"). Sorted because listdir() returns entries
# in arbitrary order, which would make the document order differ between runs.
files = sorted(
    f for f in listdir(mypath)
    if isfile(join(mypath, f)) and f.endswith('.yml')
)

In [None]:
# Show which YAML files were found in the data folder.
files

In [None]:
# Load every YAML file and index its conversations by the file's first category.
all_chats = {}
for f in files:
    print("file ", f)
    # join() rather than string concatenation, so the path is valid whether or
    # not mypath ends with a separator.
    with open(join(mypath, f), encoding="utf-8") as file:
        chat = yaml.safe_load(file)
    # NOTE: two files with the same first category would overwrite each other here.
    all_chats[chat["categories"][0]] = chat["conversations"]

In [None]:
# Inspect the conversations loaded under the 'games' category.
all_chats['games']

A variável ``chat`` é um dicionário, que possui a chave ``conversations``, que contém várias listas. Dentro de cada lista, existe um par de frases, uma escrita por um humano e outra escrita por um robô. Neste exercício, só iremos considerar a frase do humano, ou seja, a primeira posição do par nas listas.
Exemplo:

In [None]:
# Collapse each category into a single string made of the first element of
# every conversation pair (the human turn).
for k in all_chats.keys():
    # Already collapsed on a previous run of this cell: iterating a str and
    # taking c[0] would yield single characters and mangle the text, so skip.
    if isinstance(all_chats[k], str):
        continue
    all_chats[k] = ' '.join([c[0] for c in all_chats[k]])

In [None]:
# Inspect the 'games' category again, now as a single joined string.
all_chats['games']

## Pré-processamento

In [None]:
def pre_processamento_texto(corpus):
    """Tokenize a raw text, lowercase it and drop stop words and punctuation.

    Args:
        corpus: Raw text as a single string.

    Returns:
        list[str]: Lowercased tokens in their original order, excluding NLTK's
        Portuguese stop words and any token found in ``string.punctuation``.
    """
    # Words (optionally with an inner apostrophe) or single non-space symbols.
    tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", corpus)
    if not tokens:
        # Nothing to filter; avoid loading the stop-word list for empty input.
        return []

    # Sets give constant-time membership tests; the stop-word list has a few
    # hundred entries and this function is called once per sentence.
    stop_words = set(stopwords.words('portuguese'))
    punctuation = set(string.punctuation)

    return [
        token
        for token in (raw.lower() for raw in tokens)
        if token not in stop_words and token not in punctuation
    ]

In [None]:
# Fetch the NLTK stop-word lists that pre_processamento_texto reads.
nltk.download('stopwords')

In [None]:
# One cleaned token list per chat category, in the dictionary's iteration order.
all_chats_clean = [pre_processamento_texto(texto) for texto in all_chats.values()]

In [None]:
# Inspect the cleaned token list of the first category.
all_chats_clean[0]

In [None]:
### Build the vocabulary: a gensim Dictionary over the cleaned documents

dictionary = corpora.Dictionary(all_chats_clean)

In [None]:
# Look up the token stored under id 11 in the vocabulary.
dictionary[11]

In [None]:
## Convert every document into its bag-of-words form using the vocabulary ids
doc_term_matrix = list(map(dictionary.doc2bow, all_chats_clean))

In [None]:
# Alias for gensim's LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the document-term matrix.
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=6,
    passes=100,
    random_state=42,  # fixed seed so the learned topics are reproducible
)

In [None]:
# Display the topics learned by the model.
ldamodel.show_topics()

# Sumarização

## Coleta dos dados

In [None]:
# URL of the G1 news article whose text is summarized in this section.
noticia_url = "https://g1.globo.com/tecnologia/noticia/2022/03/12/instagram-restrito-na-russia-entenda-a-importancia-da-rede-social-para-o-pais-de-putin.ghtml"

In [None]:
def match_class(target):
    """Build a predicate that accepts tags carrying every class in ``target``.

    Args:
        target: Collection of class names that must all be present on the tag.

    Returns:
        A one-argument function that takes a tag (any object exposing
        ``.get``) and returns True when each name in ``target`` appears in the
        tag's ``class`` value, False otherwise.
    """
    def do_match(tag):
        # Tags without a class attribute are treated as having no classes.
        tag_classes = tag.get('class', [])
        for wanted in target:
            if wanted not in tag_classes:
                return False
        return True
    return do_match

def get_text_url(url, timeout=30):
    """Download a page and return the text of its article-body containers.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: Concatenated text of every element whose classes include
        ``content-text__container``; an empty string when none is found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html5lib')
    blocos = soup.find_all(match_class(["content-text__container"]))
    return "".join(bloco.get_text() for bloco in blocos)

In [None]:
# Download the article and keep only its body text.
texto_noticia = get_text_url(noticia_url)

## Pré-processamento

In [None]:
# Sentence-tokenizer models: older NLTK releases read 'punkt', newer ones 'punkt_tab'.
nltk.download('punkt')
nltk.download('punkt_tab')
# The article is in Portuguese; without `language`, sent_tokenize uses the English model.
corpus_sent = sent_tokenize(texto_noticia, language='portuguese')

In [None]:
# Apply the same cleaning to each sentence, producing one token list per sentence.
corpus_processado = [pre_processamento_texto(sent) for sent in corpus_sent]

Para realizar a sumarização, iremos criar os seguintes métodos:

1. Um que calcula a similaridade entre frases, usando BoW (bag of words).
2. Um que constrói a matriz de similaridade entre todas as frases.
3. Um que faz o rank das frases utilizando o método PageRank. Este método utiliza a representação de grafos; portanto, transformaremos nossos dados em um grafo.
4. Um que ordena os scores e retorna as 5 primeiras frases.

In [None]:
def similarity_sentences(sent1, sent2):
    """Cosine similarity between two tokenized sentences using bag-of-words counts.

    Args:
        sent1: List of tokens of the first sentence.
        sent2: List of tokens of the second sentence.

    Returns:
        Cosine similarity of the two count vectors, or 0.0 when either
        sentence has no tokens the vectorizer can use.
    """
    # A sentence can be empty after stop-word removal. An all-zero vector has
    # similarity 0 with anything, and two empty sentences would make the
    # vectorizer raise on an empty vocabulary.
    if not sent1 or not sent2:
        return 0.0

    all_words = list(set(sent1 + sent2))
    vect_bag = CountVectorizer(binary=False, analyzer="word")
    try:
        vect_bag.fit(all_words)
    except ValueError:
        # Empty vocabulary: the default token pattern discards one-character
        # tokens, so nothing was left to build the vocabulary from.
        return 0.0

    # Apply BoW to both sentences at once: row 0 is sent1, row 1 is sent2.
    vectors = vect_bag.transform([' '.join(sent1), ' '.join(sent2)]).toarray()

    return cosine_similarity(vectors[:1], vectors[1:]).reshape(-1)[0]

In [None]:
def matrix_similarity(sentences):
    """Build the pairwise similarity matrix of a list of tokenized sentences.

    Args:
        sentences: List of token lists, one per sentence.

    Returns:
        numpy.ndarray of shape (n, n) where entry [i, j] is the value of
        ``similarity_sentences`` for sentences i and j; the diagonal is 0.
    """
    total = len(sentences)
    similarity_matrix = np.zeros((total, total))
    for ix in range(total):
        # Cosine similarity is symmetric, so score each unordered pair once
        # and mirror it instead of computing both (i, j) and (j, i).
        for ix2 in range(ix + 1, total):
            score = similarity_sentences(sentences[ix], sentences[ix2])
            similarity_matrix[ix, ix2] = score
            similarity_matrix[ix2, ix] = score
    return similarity_matrix

In [None]:
def get_pagerank(texto):
    """Score sentences by running PageRank on their similarity graph.

    Args:
        texto: List of tokenized sentences (one token list per sentence).

    Returns:
        The mapping returned by ``nx.pagerank`` for the graph built from the
        pairwise similarity matrix (node index -> score).
    """
    graph = nx.from_numpy_array(matrix_similarity(texto))
    return nx.pagerank(graph)

In [None]:
def summarize(scores, sentences=None, top_n=5):
    """Build a summary from the highest-scoring sentences.

    Args:
        scores: Mapping of sentence index -> score.
        sentences: Sequence of original sentences indexed by the keys of
            ``scores``. Defaults to the notebook-level ``corpus_sent`` so
            existing one-argument calls keep working.
        top_n: Number of sentences to include in the summary.

    Returns:
        str: The selected sentences in descending score order, each followed
        by a single space.
    """
    if sentences is None:
        sentences = corpus_sent

    # sorted() is stable, so sentences with equal scores keep their original order.
    ranked = sorted(scores, key=scores.get, reverse=True)[:top_n]
    return "".join(sentences[ix] + " " for ix in ranked)

In [None]:
# Rank the article's sentences with PageRank (bag-of-words similarity) and show the summary.
summarize(get_pagerank(corpus_processado))

In [None]:
## Switching to TF-IDF (this redefinition replaces the bag-of-words version above)
def similarity_sentences(sent1, sent2):
    """Cosine similarity between two tokenized sentences using TF-IDF vectors.

    Args:
        sent1: List of tokens of the first sentence.
        sent2: List of tokens of the second sentence.

    Returns:
        Cosine similarity of the two TF-IDF vectors, or 0.0 when either
        sentence has no tokens the vectorizer can use.
    """
    # A sentence can be empty after stop-word removal. An all-zero vector has
    # similarity 0 with anything, and two empty sentences would make the
    # vectorizer raise on an empty vocabulary.
    if not sent1 or not sent2:
        return 0.0

    all_words = list(set(sent1 + sent2))
    vect = TfidfVectorizer()

    # NOTE(review): the vectorizer is fitted on one-word "documents", so almost
    # every term has document frequency 1 and the IDF weights are nearly
    # uniform; scores should therefore stay close to the bag-of-words ones.
    # Fitting on the whole sentence corpus would give a meaningful IDF -
    # confirm the intent before changing it.
    try:
        vect.fit(all_words)
    except ValueError:
        # Empty vocabulary: the default token pattern discards one-character
        # tokens, so nothing was left to build the vocabulary from.
        return 0.0

    # Row 0 is sent1, row 1 is sent2.
    vectors = vect.transform([' '.join(sent1), ' '.join(sent2)]).toarray()

    return cosine_similarity(vectors[:1], vectors[1:]).reshape(-1)[0]

In [None]:
# Same pipeline again; similarity_sentences now resolves to the TF-IDF definition above.
summarize(get_pagerank(corpus_processado))