In [32]:
from collections import Counter
from itertools import islice
from typing import List, Dict

import nltk
import pandas
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from scipy import sparse
from unidecode import unidecode

In [25]:
# Load the news dataset and replace every missing value with an empty string.
data = pandas.read_csv("estadao_noticias_eleicao.csv").fillna('')

In [26]:
# Join the title, subtitle and content columns of each article into one text column.
data['articles'] = data['titulo'] + ' ' + data['subTitulo'] + ' ' + data['conteudo']


def normalize(text):
    """Lower-case ``text`` and pass the result through ``unidecode``."""
    return unidecode(text.lower())


def tokenize(row):
    """Split ``row`` on whitespace; punctuation stays attached to the words."""
    return row.split()


# Normalize the article text first, then tokenize the normalized text.
data['articles'] = data['articles'].apply(normalize)
data['tokens'] = data['articles'].apply(tokenize)

In [4]:
def create_index(matrix_of_tokens: List[List[str]], doc_ids: List[int]) -> Dict[str, List[int]]:
    """Create an inverted index mapping each token to the documents containing it.

    The two arguments are paired by position, so they may be plain lists or
    pandas Series with any index labels.

    :param matrix_of_tokens: one list of tokens per article.
    :param doc_ids: the document id of each article, in the same order as
        ``matrix_of_tokens``.
    :returns: a dict whose keys are tokens and whose values are the ids of the
        documents containing that token, in input order. Each document id
        appears at most once per token, however often the token repeats
        inside that document.
    :raises IndexError: if ``doc_ids`` has fewer entries than
        ``matrix_of_tokens``.
    """
    # zip() would silently truncate on a short id list; keep failing loudly.
    if len(doc_ids) < len(matrix_of_tokens):
        raise IndexError("doc_ids has fewer entries than matrix_of_tokens")

    index: Dict[str, List[int]] = {}
    for tokens, doc_id in zip(matrix_of_tokens, doc_ids):
        # set() collapses repeated tokens so a document is listed once per token.
        for token in set(tokens):
            index.setdefault(token, []).append(doc_id)
    return index

In [5]:
# Build the inverted index from the article tokens and the article ids.
inverted_index = create_index(
    matrix_of_tokens=data['tokens'],
    doc_ids=data['idNoticia'],
)

In [12]:
def co_occurrence_matrix(corpus):
    """Count adjacent-token pairs (bigrams) of ``corpus`` in a sparse matrix.

    :param corpus: a re-iterable sequence of hashable tokens. It is traversed
        more than once, so a one-shot generator is not suitable.
    :returns: a tuple ``(matrix, vocab_to_index)``. ``vocab_to_index`` maps
        each distinct token to its row/column number, assigned in order of
        first appearance. ``matrix`` is an ``n x n`` ``scipy.sparse.coo_matrix``
        whose entry ``[i, j]`` is the number of times token ``j`` immediately
        follows token ``i``. Its ``row``/``col``/``data`` triplets are stored
        from the most frequent pair to the least frequent one.
    """
    # Number tokens by first appearance. Iterating a set of strings instead
    # can give a different numbering on every interpreter run.
    vocab_to_index = {word: i for i, word in enumerate(dict.fromkeys(corpus))}
    n = len(vocab_to_index)

    # Pair each token with its successor and count the pairs without first
    # building the full list of bigrams in memory.
    pair_counts = Counter(zip(corpus, islice(corpus, 1, None)))

    rows = []
    cols = []
    counts = []
    # most_common() yields pairs by descending count (ties keep first-seen
    # order), which fixes the order of the triplets stored in the matrix.
    for (previous, current), count in pair_counts.most_common():
        rows.append(vocab_to_index[previous])
        cols.append(vocab_to_index[current])
        counts.append(count)

    matrix = sparse.coo_matrix((counts, (rows, cols)), shape=(n, n))

    return matrix, vocab_to_index

In [33]:
# NLTK's Portuguese stop-word list, kept unchanged under its original name.
stopword_ = stopwords.words('portuguese')

# The article tokens were lower-cased and stripped of accents with unidecode,
# so the stop words need the same treatment; otherwise accented entries
# (e.g. "não") never match their token form ("nao"). A set also makes each
# membership test a hash lookup instead of a scan over the whole list.
normalized_stopwords = {unidecode(word.lower()) for word in stopword_}

filtered_tokens = data['tokens'].apply(
    lambda tokens: [token for token in tokens if token not in normalized_stopwords]
)

# Flatten the per-article token lists into a single corpus-wide token list.
tokens = [token for tokens_list in filtered_tokens for token in tokens_list]

In [34]:
# Build the bigram co-occurrence matrix of the filtered corpus; `vocab` is the
# token -> matrix index mapping returned alongside it.
matrix, vocab = co_occurrence_matrix(tokens)

In [35]:
# Convert to CSR so that single entries can be read with [row, col] indexing.
consultable_matrix = matrix.tocsr()


def consult_frequency(w1, w2):
    """Return the matrix entry stored for the ordered word pair (``w1``, ``w2``).

    Both words are looked up in ``vocab``; a word missing from it raises
    ``KeyError``.
    """
    row = vocab[w1]
    col = vocab[w2]
    return consultable_matrix[row, col]

In [47]:
# Flat list of (row, col, count) triples, in the matrix's own storage order.
tree = [
    (row, col, count)
    for row, col, count in zip(matrix.row, matrix.col, matrix.data)
]

In [102]:
# Tokens listed in the iteration order of `vocab`, so a token can be found by position.
index_to_vocab = list(vocab)

In [113]:
def more_words(word, top_n=3):
    """Return the words paired with ``word`` in the first matching ``tree`` entries.

    ``tree`` is scanned in its stored order and scanning stops as soon as
    ``top_n`` entries involving ``word`` (as either member of the pair) have
    been found. The function does no ranking of its own, so the result is only
    "the most frequent neighbours" if ``tree`` is ordered by descending count
    (NOTE(review): confirm against how ``tree`` is built).

    :param word: the word to look up; it must be present in ``index_to_vocab``.
    :param top_n: maximum number of neighbouring words to return (default 3).
    :returns: a list of at most ``top_n`` words. The same word can appear more
        than once when it occurs both before and after ``word``.
    :raises ValueError: if ``word`` is not in ``index_to_vocab``.
    """
    index = index_to_vocab.index(word)
    # Lazy filter: avoids materialising every matching triple just to keep a few.
    touching = (pair for pair in tree if pair[0] == index or pair[1] == index)
    return [
        # Pick whichever member of the pair is not `word` itself.
        index_to_vocab[pair[1]] if pair[0] == index else index_to_vocab[pair[0]]
        for pair in islice(touching, top_n)
    ]

# Example query: the words paired with 'lula' in its first matching entries.
more_words('lula')

['inacio', 'silva', 'silva,']