In [1]:
# Importing text processing utilities
import re

# Type Utilities

from typing import *

# Importing NLP libs
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import spacy

# Import keyword extraction library
import yake

In [None]:
# Load the spaCy pipeline named 'pt_core_news_lg' (presumably the large
# Portuguese news model -- it must already be installed for spacy.load to find it).
# NOTE(review): `nlp` is not referenced by any cell visible in this section.
nlp = spacy.load('pt_core_news_lg')

In [2]:
def read_stopwords(path: str) -> List[str]:
    """Reads a stopword list from a text file holding one stopword per line.

    Params:
    ---------------
    path: str
     Path to the stopword file

    Returns:
    ---------------
    A list with one entry per non-blank line, in file order, with
    surrounding whitespace (including the newline) stripped.
    """
    # Decode explicitly as UTF-8 so accented stopwords are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines would otherwise become '' entries, which are not stopwords.
        return [word for word in stripped_lines if word]

In [3]:
# Load the stopword list once; the relative path is resolved against the
# working directory the notebook is run from.
stopwords = read_stopwords("../stopwords/stopwords.txt")

In [None]:
def remove_stopwords(sent: str, stopwords: List[str]) -> str:
    """Removes stopwords from a given sentence.

    Params:
    ---------------
    sent: str
     The sentence to clean; it is tokenized on runs of whitespace

    stopwords: List[str]
     Tokens to drop. Matching is exact (case-sensitive, no normalization)

    Returns:
    ---------------
    The remaining tokens joined by single spaces ('' if nothing remains).
    """
    # A set gives constant-time membership checks for every token.
    stopword_set = set(stopwords)
    # str.split() without arguments splits on whitespace runs and never yields
    # empty tokens, so leading/trailing whitespace does not leak into the result.
    kept_tokens = [token for token in sent.split() if token not in stopword_set]
    return ' '.join(kept_tokens)

In [33]:
# Settings that generate_keywords unpacks as keyword arguments into
# yake.KeywordExtractor. Per-key notes follow YAKE's documented parameters.
keywords_params = dict(
    lan="pt",          # language of the input text
    n=3,               # presumably the maximum n-gram size of a keyword
    dedupLim=0.7,      # deduplication threshold
    dedupFunc="seqm",  # deduplication function identifier
    windowsSize=4,     # context window size
    top=10,            # number of keywords requested
)

In [35]:
def make_gensim_compatible_keyword(keyword: str) -> str:
    """Replaces every run of whitespace in a keyword with a single underscore.

    Going by the function name, the intent is to turn a multi-word keyword
    into one token usable with gensim, e.g. 'plano europeu' -> 'plano_europeu'.
    Leading and trailing whitespace is also converted, not stripped.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\s+", "_", keyword)

In [21]:
def generate_keywords(sent: str, params: dict) -> List[str]:
    """
    Extracts keywords from a sentence with YAKE and returns them as
    underscore-joined tokens.

    Params:
    ---------------
    sent: str
     A string containing a stopword-free sentence

    params: dict
     A dictionary containing parameters to customize the YAKE-algorithm;
     it is unpacked as keyword arguments into yake.KeywordExtractor

    Returns:
    ---------------
    A list of keyword strings in the order YAKE returned them, each passed
    through make_gensim_compatible_keyword.
    """
    text_lowercase = sent.lower()

    # Honor the caller-supplied parameters instead of the notebook-level
    # `keywords_params` global, so the `params` argument actually takes effect.
    kw_extractor = yake.KeywordExtractor(**params)
    keywords_w_weights = kw_extractor.extract_keywords(text_lowercase)

    # Each result is a 2-tuple of keyword text and score. The sample output in
    # this notebook shows (keyword, score); selecting the string member keeps
    # this correct even if a YAKE release uses the opposite order.
    keywords = [
        first if isinstance(first, str) else second
        for first, second in keywords_w_weights
    ]

    compat_keywords = [make_gensim_compatible_keyword(keyword) for keyword in keywords]

    return compat_keywords


('participação audição pública', 0.01655915082773619)
('europeu luta cancro', 0.016559150827736194)
('audição pública plano', 0.03339840940482845)
('pública plano europeu', 0.03339840940482845)
('plano europeu luta', 0.03339840940482845)
('participação audição', 0.04940384002065631)
('luta cancro', 0.04940384002065631)
('audição pública', 0.09700399286574239)
('pública plano', 0.09700399286574239)
('plano europeu', 0.09700399286574239)
('europeu luta', 0.09700399286574239)
('participação', 0.15831692877998726)
('cancro', 0.15831692877998726)
('audição', 0.29736558256021506)
('pública', 0.29736558256021506)
('plano', 0.29736558256021506)
('europeu', 0.29736558256021506)
('luta', 0.29736558256021506)
