In [1]:
import ast
import json
import sys
from collections import Counter
from xml.dom import minidom

import numpy as np
import pandas as pd
from unidecode import unidecode

sys.path.insert(1, '../utils')
from read_config import read_config_file

In [2]:
# Directories used throughout the notebook, relative to the notebook's location.
# Both keep their trailing slash because file names are appended by plain concatenation.
results_path = '../results/'
inputs_path = '../inputs/'

## Read TXT config file

In [3]:
# read_config_file fills the dict passed to it; the keys used later in this
# notebook are 'LEIA' (file to read) and 'ESCREVA' (file to write).
config_dict = dict()
read_config_file(f'{inputs_path}INDEX.CFG', config_dict)
config_dict

{'LEIA': ['lista.csv'], 'ESCREVA': ['modelo.csv.gz']}

## Read GLI output (inverted list: word → record IDs)

In [4]:
# Load the word/record table named by the 'LEIA' config entry.
# keep_default_na=False keeps vocabulary entries such as "null" or "nan" as the
# literal strings they are; with the default settings read_csv converts them to
# NaN and those words silently drop out of the vocabulary in the next step.
lista_df = pd.read_csv(
    results_path + config_dict['LEIA'][0],
    sep=';',
    keep_default_na=False,
)
lista_df.head()

Unnamed: 0,WORDS,RECORDS
0,the,"['00001', '00001', '00001', '00001', '00001', ..."
1,significance,"['00001', '00074', '00078', '00121', '00147', ..."
2,of,"['00001', '00001', '00001', '00001', '00001', ..."
3,pseudomonas,"['00001', '00001', '00001', '00007', '00008', ..."
4,aeruginosa,"['00001', '00001', '00001', '00006', '00006', ..."


In [5]:
# Keep only vocabulary entries that are at least two characters long and
# consist solely of alphabetic characters.
filtered_lista_df = lista_df.loc[
    lambda frame: (frame['WORDS'].str.len() >= 2) & (frame['WORDS'].str.isalpha())
]


#### TF-IDF parameters

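- $N$ = total number of documents
- $n_j$ = number of documents that contain word $j$
- $tf_{ij}$ = frequency of word $j$ in document $i$

The weight of word $j$ in document $i$ is $w_{ij} = \dfrac{tf_{ij}}{\max_k tf_{ik}} \cdot \log\dfrac{N}{n_j}$.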


In [6]:
# Build the three lookup tables consumed by tf_idf_default:
#   word_doc_dict[word]  -> number of distinct documents containing the word (n_j)
#   freq_dict[word][doc] -> number of occurrences of the word in the document (tf_ij)
#   docs_max_freq[doc]   -> highest count reached by any single kept word in the document
freq_dict = {}
word_doc_dict = {}
docs_max_freq = {}
for word, raw_records in zip(filtered_lista_df['WORDS'], filtered_lista_df['RECORDS']):
    # RECORDS holds the text of a Python list literal. literal_eval parses it
    # without executing arbitrary code, which eval() on file contents would allow.
    # Counter tallies every document in a single pass instead of calling
    # list.count() once per element (quadratic in the record-list length).
    doc_counts = Counter(ast.literal_eval(raw_records))

    word_doc_dict[word] = len(doc_counts)
    # setdefault ensures the word has an entry even when its record list is empty.
    freq_dict.setdefault(word, {}).update(doc_counts)
    for doc, count in doc_counts.items():
        docs_max_freq[doc] = max(docs_max_freq.get(doc, 0), count)

## Generate matrix with TF-IDF as weights

In [7]:
def tf_idf_default(word_doc_dict, freq_dict, docs_max_freq):
    """Compute max-normalised TF-IDF weights for every (word, document) pair.

    The weight of a word in a document is
    ``(freq_dict[word][doc] / docs_max_freq[doc]) * log(N / word_doc_dict[word])``
    where ``N`` is the number of keys in ``docs_max_freq``. Pairs for which the
    word does not occur in the document get the weight ``0``.

    Parameters
    ----------
    word_doc_dict : dict
        Maps each word to the number of documents that contain it.
    freq_dict : dict
        Maps each word to a dict ``{doc: occurrences of the word in doc}``.
        A word missing from this mapping is treated as occurring nowhere.
    docs_max_freq : dict
        Maps each document to the highest single-word count found in it.

    Returns
    -------
    dict
        ``{word: {doc: weight}}`` with words in ``word_doc_dict`` order and
        documents in ``docs_max_freq`` order. Empty when there are no documents.
    """
    docs_list = list(docs_max_freq)
    if not docs_list:
        # No documents means no (word, doc) pairs to weight.
        return {}

    N = len(docs_list)
    tf_matrix = {}
    for word, docs_with_word in word_doc_dict.items():
        word_freqs = freq_dict.get(word, {})
        # The IDF factor depends only on the word, so compute it once per word
        # rather than once per (doc, word) pair. It is skipped for words that
        # occur nowhere, which also avoids dividing by a zero document count.
        idf = np.log(N / docs_with_word) if word_freqs else 0.0
        tf_matrix[word] = {
            doc: (word_freqs[doc] / docs_max_freq[doc]) * idf if doc in word_freqs else 0
            for doc in docs_list
        }
    return tf_matrix

In [20]:
# Compute the weights, then tabulate them: the outer dict keys (words) become
# columns and the inner dict keys (documents) become the row index.
tf_matrix = tf_idf_default(
    word_doc_dict=word_doc_dict,
    freq_dict=freq_dict,
    docs_max_freq=docs_max_freq,
)
tf_matrix_df = pd.DataFrame.from_dict(tf_matrix, orient='columns')

In [21]:
# Move the document identifiers out of the index into a leading 'doc' column
# so they are written to the output file as ordinary data.
tf_matrix_df = tf_matrix_df.rename_axis(index='doc').reset_index()
tf_matrix_df.head()

Unnamed: 0,doc,the,significance,of,pseudomonas,aeruginosa,infection,in,respiratory,tract,...,answers,counterpart,informed,consent,patchy,theorize,planned,appreciably,theoretically,dl
0,1,0.022473,0.19222,0.011033,0.423875,0.421806,0.395489,0.01754,0.367281,0.475601,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.013484,0.0,0.015761,0.0,0.0,0.0,0.046773,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.022473,0.0,0.015761,0.0,0.0,0.0,0.025057,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.022473,0.0,0.011558,0.0,0.0,0.0,0.03508,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.016855,0.0,0.005911,0.0,0.0,0.0,0.058466,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Write the matrix to the file named by the 'ESCREVA' config entry as a
# gzip-compressed, semicolon-separated CSV; the row index is not written.
tf_matrix_df.to_csv(
    f"{results_path}{config_dict['ESCREVA'][0]}",
    index=False,
    compression="gzip",
    sep=';',
)