Database operations to build the structure for visualization

I will need 4 groups of data:
* topics (aggregator)
* docs
* words
* people

In [1]:
import os
import pandas as pd
import sqlite3
import json
import pickle
import re
import math
import gensim
from collections import Counter

In [2]:
inputs = os.path.join("..", "inputs")
outputs = os.path.join('..','outputs')

Rename topics from ~5000 to ~0 (also reorder according to heatmap)
* topics
* topic_doc

# replace pickle with sqlite

In [4]:
sql_db = os.path.join(outputs, 'cpdoc_as.sqlite')
conn = sqlite3.connect(sql_db)
cur = conn.cursor()

topic_doc = pd.read_sql_query("SELECT * FROM topic_doc_new", conn) # loading from pkl below
docs = pd.read_sql_query("SELECT * FROM docs", conn)
persons = pd.read_sql_query("SELECT * FROM persons", conn)
person_doc = pd.read_sql_query("SELECT * FROM person_doc", conn)

In [5]:
topic_doc = topic_doc.sort_values(['topic_id', 'topic_score'], ascending=[True, False])
print(len(topic_doc))
topic_doc.head()

114503


Unnamed: 0,doc_id,topic_id,topic_score
38431,bp_1974.06.04_doc_I-8,0,0.80207
39478,bp_1974.06.04_doc_IV-52,0,0.7525
9804,ag_1974.01.22_doc_III-55,0,0.710143
100930,rb_1974.04.17_doc_I-25,0,0.695817
105726,rb_1974.05.23_doc_III-11,0,0.689169


In [6]:
print(len(docs))
docs.head()

10268


Unnamed: 0,id,main_language,readability,url,body
0,ag_1973.11.20_doc_I-1,none,0.4,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\njr, /h ' ""& m$ hb'h'qo\n& la'?\n\n, é %%3..."
1,ag_1973.11.20_doc_I-4,none,-1.0,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\n""**; %wwffwç (. sz\nao?\n\nray\n\nmax meo..."
2,ag_1973.11.20_doc_I-5,none,-1.0,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"am 1933 m 90\n\nw ij""?\n(\n\nt\n\ne\nzi\n\nurd..."
3,ag_1973.11.20_doc_I-7,pt,-1.0,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"ôôô/ma 44 w\nwil""\n/ oe;\n\n \n\ncentro de es..."
4,ag_1973.11.20_doc_I-8,pt,0.615385,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\nxx>4áiãfkk ;g*\n\nmv, ; (905\n\n \n\ntele..."


# docs and topics

In [7]:
topic_doc = topic_doc.sort_values(['topic_id', 'topic_score'], ascending=[True, False])
topic_doc.head()

Unnamed: 0,doc_id,topic_id,topic_score
38431,bp_1974.06.04_doc_I-8,0,0.80207
39478,bp_1974.06.04_doc_IV-52,0,0.7525
9804,ag_1974.01.22_doc_III-55,0,0.710143
100930,rb_1974.04.17_doc_I-25,0,0.695817
105726,rb_1974.05.23_doc_III-11,0,0.689169


# docs and persons

In [9]:
# persons.rename(columns={"person_name": "name"}, inplace=True)
persons.rename(index=str, columns={"id": "person_id", "person_name": "name"}, inplace=True)

In [10]:
# person_doc = cpdoc_tables['person_doc ']
person_doc = pd.merge(person_doc, persons, on='person_id', how='inner')
person_doc = person_doc[['person_id', 'doc_id', 'person_count', 'name']]
person_doc = person_doc.replace('(.*),(.*),.*', r'\1,\2', regex=True)
person_doc = person_doc.replace('(.*), (.*)', r'\2 \1', regex=True)

In [11]:
#remove special characters
person_doc = person_doc.name.apply(pd.Series) \
    .replace('[“”]', '', regex=True) \
    .replace('[áàãâ]', 'a', regex=True) \
    .replace('[óòõô]', 'o', regex=True) \
    .replace('[éèê]', 'e', regex=True) \
    .replace('[íì]', 'i', regex=True) \
    .replace('[úù]', 'u', regex=True) \
    .replace('ç', 'c', regex=True) \
    .merge(person_doc, left_index = True, right_index = True) \
    .drop("name", axis=1) \
    .rename(columns={0: "name"})

In [12]:
person_doc.head()

Unnamed: 0,name,person_id,doc_id,person_count
0,Antonio Azeredo Da Silveira,500084,ag_1973.11.20_doc_I-8,1
1,Antonio Azeredo Da Silveira,500084,ag_1973.11.20_doc_I-9,1
2,Antonio Azeredo Da Silveira,500084,ag_1973.11.20_doc_I-13,1
3,Antonio Azeredo Da Silveira,500084,ag_1973.11.20_doc_I-22,1
4,Antonio Azeredo Da Silveira,500084,ag_1973.11.20_doc_I-23,1


In [13]:
person_doc_count = pd.DataFrame(person_doc.groupby(['name'])['person_count'].agg('count')) #.apply(sum)
person_doc_count = person_doc_count.reset_index()

# docs and tokens

In [14]:
docs.head()

Unnamed: 0,id,main_language,readability,url,body
0,ag_1973.11.20_doc_I-1,none,0.4,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\njr, /h ' ""& m$ hb'h'qo\n& la'?\n\n, é %%3..."
1,ag_1973.11.20_doc_I-4,none,-1.0,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\n""**; %wwffwç (. sz\nao?\n\nray\n\nmax meo..."
2,ag_1973.11.20_doc_I-5,none,-1.0,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"am 1933 m 90\n\nw ij""?\n(\n\nt\n\ne\nzi\n\nurd..."
3,ag_1973.11.20_doc_I-7,pt,-1.0,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"ôôô/ma 44 w\nwil""\n/ oe;\n\n \n\ncentro de es..."
4,ag_1973.11.20_doc_I-8,pt,0.615385,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\nxx>4áiãfkk ;g*\n\nmv, ; (905\n\n \n\ntele..."


In [15]:
docs = docs[['id', 'url', 'body']]
docs = docs.rename(index=str, columns={"id": "doc_id"})
docs['length'] = docs['body'].apply(lambda x: len(x.split()))
docs.head()

Unnamed: 0,doc_id,url,body,length
0,ag_1973.11.20_doc_I-1,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\njr, /h ' ""& m$ hb'h'qo\n& la'?\n\n, é %%3...",261
1,ag_1973.11.20_doc_I-4,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\n""**; %wwffwç (. sz\nao?\n\nray\n\nmax meo...",54
2,ag_1973.11.20_doc_I-5,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"am 1933 m 90\n\nw ij""?\n(\n\nt\n\ne\nzi\n\nurd...",64
3,ag_1973.11.20_doc_I-7,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"ôôô/ma 44 w\nwil""\n/ oe;\n\n \n\ncentro de es...",75
4,ag_1973.11.20_doc_I-8,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\nxx>4áiãfkk ;g*\n\nmv, ; (905\n\n \n\ntele...",202


# words and topics

In [16]:
file_lda_model = os.path.join(inputs,'model_lda_100_rs_00.pkl')
lda_model = pickle.load(open(file_lda_model, 'rb'))

In [17]:
def remove_special_char(text):
    text = re.sub('[áàãâ]', 'a', text)
    text = re.sub('[óòõô]', 'o', text)
    text = re.sub('[éèê]', 'e', text)
    text = re.sub('[íì]', 'i', text)
    text = re.sub('[úù]', 'u', text)
    text = re.sub('ç', 'c', text)
    return text

In [18]:
def retrieve_tokens(text, df):
    main_tokens = []
    for token in df['tokens']:
        if token in text:
            token = remove_special_char(token)
            main_tokens.append(token)
    return main_tokens

# build vis table

In [22]:
def build_vis_table(topic_id, df):
    topic_tokens = lda_model.print_topics(-1, num_words=20)[topic_id]
    
    tokens = []
    scores = []
    for i in topic_tokens[1].split('+'):
        token = re.sub('.*\*"(.*)".*', r'\1', i)
        score = re.sub(' *(.*)\*.*', r'\1', i)
        score = float(score)
        tokens.append(token)
        scores.append(score)
    token_score_dict = {'tokens': tokens, 'scores': scores}
    token_score = pd.DataFrame(token_score_dict)
    
    docs['tokens'] = docs['body'].apply(lambda text: retrieve_tokens(text, token_score))
    
    #filter by topic
    df = df.loc[df['topic_id'] == topic_id].sort_values(by=['topic_score'], ascending=False)
    df = df.head(20)
    
    ### merge topics and tokens
    df = pd.merge(df, docs, on='doc_id', how='inner')
    
    ### relate docs to list of persons
    #filters person_doc
    array = list(df['doc_id'])
    person_doc_filtered = person_doc.loc[person_doc['doc_id'].isin(array)]
    
    #apply list of persons
    person_doc_filtered = person_doc_filtered.groupby(['doc_id'])['name'].apply(list)
    person_doc_filtered = pd.DataFrame({'doc_id':person_doc_filtered.index, 'names':person_doc_filtered.values})
    
    #merge topics and persons
    df = pd.merge(df, person_doc_filtered, on='doc_id', how='outer')
#     df['year'] = df['date'].apply(lambda x: x.year)
    df = df.astype({'names': 'object'})
    for row in df.loc[df.names.isnull(), 'names'].index:
        df.at[row, 'names'] = []
    
    return df

In [23]:
topic_vis_dict = {}

In [24]:
for i in range(100):
    topic_vis_dict[i] = build_vis_table(i, topic_doc)

In [25]:
topic_vis_dict.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [26]:
topic_vis_dict[7].tail()

Unnamed: 0,doc_id,topic_id,topic_score,url,body,length,tokens,names
15,pn_1975.00.00_doc_6,7,0.310567,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\n \n \n\ná4ó (%?ç'quoicg; vê!/avó m & ""l...",251,"[nuclear, nucleares, brasil, armas, tratado]",[]
16,pn_1974.08.15_doc_III-31,7,0.303898,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,secreto-excâusãvo\n\nsubsídios para as consult...,4549,"[nuclear, acordo, energia, nucleares, brasil, ...","[Jimmy Carter, Cyrus Vance, Helmut Schmidt]"
17,pn_1976.12.28_doc_29,7,0.300879,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"telegrama recebido '\n\naaaa [97034 . é?\no"" '...",845,"[nuclear, acordo, energia, nucleares, brasil, ...",[Jimmy Carter]
18,pn_1974.08.15_doc_II-1,7,0.290118,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\nmmm\n\n""a. . .. , . ..\n\n' o acordo nucl...",1672,"[nuclear, acordo, energia, nucleares, brasil, ...",[Jimmy Carter]
19,pn_1976.12.28_doc_16,7,0.286759,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,antonio la'. aiii-1315230 da sxmrâxm\njáwówa i...,330,"[acordo, energia, brasil, uranio, rfa, salvagu...",[Antonio Azeredo Da Silveira]


In [138]:
topic_vis_dict[49].tail()

Unnamed: 0,doc_id,topic_id,topic_score,date,pdf,body,length,tokens,names,year
15,bp_1977.03.10_doc_V-3,5049,0.471688,1978-01-15,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,". ""* aas_ manoa-to\n/ . _ - . . awm)\n\nsala-1...",6339,"[itaipu, rio, cota, operacao, energia, constru...","[Antonio Azeredo da Silveira, Costa Cavalcanti...",1978
16,bp_1977.09.13_doc_II-2,5049,0.469907,1977-04-15,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,- informação no? 332/76 - estudo sobre a fixaç...,2444,"[itaipu, rio, corpus, cota, operacao, energia,...",[],1977
17,d_1974.03.26_doc_XXXI-44,5049,0.4668,1978-01-15,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\n \n\n \n\nwm\n""mg . .\na"" rio âecreto\n...",6152,"[itaipu, rio, cota, operacao, energia, constru...","[Antonio Azeredo da Silveira, Costa Cavalcanti...",1978
18,bp_1977.09.13_doc_I-14,5049,0.45508,1977-04-15,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"/\n\n \n\n \n\n \n\na\n\n \n\n!\n\n \n\n o:""i...",1690,"[itaipu, rio, corpus, cota, construcao, parana...",[],1977
19,bp_1977.03.10_doc_III-18,5049,0.452564,1977-11-15,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...,"\n\nwma-03.40\njg?\n(é. /, () h\n\n confidenc...",409,"[itaipu, rio, corpus, cota, operacao, construc...",[],1977


# pickle file

In [27]:
topics_dict_file = os.path.join(outputs, 'topics_dict.pkl')

pickle for general data already saved. To overwrite:
```python
pickle.dump(topic_vis_dict, open(topics_dict_file, 'wb'))
```

In [48]:
with open(topics_dict_file, 'rb') as f:
    topic_vis_dict = pickle.load(f)

# Visualization of metadata
obs: produced file might be useless

In [29]:
topics_metadata = pd.DataFrame(columns=['mean_score', 'mean_length'])
count=0
for key, df in topic_vis_dict.items():
    score_mean = df['topic_score'].mean()
    length_mean = df['length'].mean()
    row = pd.DataFrame([[key, score_mean, length_mean]], columns=['key', 'mean_score', 'mean_length'], index=[count])
    topics_metadata = topics_metadata.append(row)
    count+=1

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [30]:
topics_metadata.head(2)

Unnamed: 0,key,mean_length,mean_score
0,0.0,520.25,0.651635
1,1.0,158.0,0.139641


In [31]:
topics_metadata_json = []
for index, row in topics_metadata.iterrows():
    temp_dict = {'key': row['key'], 'mean_length': row['mean_length'], 'mean_score': row['mean_score']}
    topics_metadata_json.append(temp_dict)

In [32]:
topics_metadata_json[:2]

[{'key': 0.0, 'mean_length': 520.25, 'mean_score': 0.6516352272112267},
 {'key': 1.0, 'mean_length': 158.0, 'mean_score': 0.13964124338376632}]

In [33]:
json.dump(topics_metadata_json, open('../outputs/topics_metadata.json', 'w'))

# Visualization of a specific topic

list split  in pandas: https://mikulskibartosz.name/how-to-split-a-list-inside-a-dataframe-cell-into-rows-in-pandas-9849d8ff2401

In [35]:
nuclear_brazil_df = topic_vis_dict[7]
nuclear_brazil_df['doc'] = nuclear_brazil_df['doc_id'].apply(lambda text: re.sub('.*(doc.*)', r'\1', text))
cols = nuclear_brazil_df.columns.tolist()
cols = cols[:3] + cols[4:] + cols[3:4]
nuclear_brazil_df = nuclear_brazil_df[cols]
nuclear_brazil_df.tail()

Unnamed: 0,doc_id,topic_id,topic_score,body,length,tokens,names,doc,url
15,pn_1975.00.00_doc_6,7,0.310567,"\n\n \n \n\ná4ó (%?ç'quoicg; vê!/avó m & ""l...",251,"[nuclear, nucleares, brasil, armas, tratado]",[],doc_6,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...
16,pn_1974.08.15_doc_III-31,7,0.303898,secreto-excâusãvo\n\nsubsídios para as consult...,4549,"[nuclear, acordo, energia, nucleares, brasil, ...","[Jimmy Carter, Cyrus Vance, Helmut Schmidt]",doc_III-31,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...
17,pn_1976.12.28_doc_29,7,0.300879,"telegrama recebido '\n\naaaa [97034 . é?\no"" '...",845,"[nuclear, acordo, energia, nucleares, brasil, ...",[Jimmy Carter],doc_29,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...
18,pn_1974.08.15_doc_II-1,7,0.290118,"\n\nmmm\n\n""a. . .. , . ..\n\n' o acordo nucl...",1672,"[nuclear, acordo, energia, nucleares, brasil, ...",[Jimmy Carter],doc_II-1,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...
19,pn_1976.12.28_doc_16,7,0.286759,antonio la'. aiii-1315230 da sxmrâxm\njáwówa i...,330,"[acordo, energia, brasil, uranio, rfa, salvagu...",[Antonio Azeredo Da Silveira],doc_16,http://www.fgv.br/cpdoc/acervo/arquivo-pessoal...


# build main data for each topic

In [36]:
topics = range(100)

In [37]:
dict_topics_json = {}
for topic in topics:
    #rename docs
    dict_topics_json[topic] = topic_vis_dict[topic]
    dict_topics_json[topic]['doc'] = dict_topics_json[topic]['doc_id'].apply(lambda text: re.sub('.*(doc.*)', r'\1', text))
    
    #places date column at the end of df
    cols = dict_topics_json[topic].columns.tolist()
    cols = cols[:3] + cols[4:] + cols[3:4] 
    dict_topics_json[topic] = dict_topics_json[topic][cols]
    
    topic_transposed = dict_topics_json[topic] \
        .drop(["body", "topic_id"], axis = 1) \
        .transpose()
#         .rename(columns={"pdf": "url"}) \

    
    # build json of main data
    topic_json = topic_transposed.to_json(orient='columns')
    topic_json = re.sub('"\d+":', '', topic_json)
    topic_json = re.sub("^{", "[", topic_json)
    topic_json = re.sub("}$", "]", topic_json)
    topic_json = json.loads(topic_json)
    
    # save json
    json_file = 'topic_{}.json'.format(str(topic))
    file_path = os.path.join(outputs, json_file)
    json.dump(topic_json, open(file_path, 'w'))

# define function to melt dataframes

In [38]:
def get_melted_df(df,variable):
    cols_to_drop = set(df.columns) - set(['doc_id'])
    topic = df[variable].apply(pd.Series) \
        .merge(df, left_index = True, right_index = True) \
        .drop(cols_to_drop, axis = 1) \
        .melt(id_vars = ['doc_id'], value_name = variable) \
        .drop("variable", axis = 1) \
        .dropna()
    return topic

# generate json files related to persons
Get connections between docs and persons. Those represent the edges of a graph.

In [39]:
b, a = 2, 1
# a=1
count_min, count_max = 0, 20 #len(docs)
for topic in topics:
    #build list of docs for each person
    melted_df = get_melted_df(topic_vis_dict[topic], 'names')
    melted_df = melted_df.groupby(['names'])['doc_id'].apply(list)
    melted_df = pd.DataFrame({'name':melted_df.index, 'docs':melted_df.values})
    
    # build json of list of names
    names_list = []
    for index, row in melted_df.iterrows():
        name = row['name']
#         name_count = math.log(int(person_doc_count.loc[person_doc_count['name'] == name]['person_count']),1.5)
#         name_count = person_doc_count.loc[person_doc_count['name'] == name]['person_count'].values[0]
        row_docs = row['docs']
        name_count = len(row_docs)
        
        temp_dict = {'name': name, 'count': name_count, 'docs': row_docs}
        names_list.append(temp_dict)
        
    #feature scaling to an arbitrary set of values
    count_list = [i['count'] for i in names_list]
#     if count_list:
#         count_min, count_max = 0, max(count_list)
#         count_min, count_max = 0, len(docs)
#     else: count_min = count_max = 0
    for i in range(len(names_list)):
        #outra opção é argumentar que o mínimo de count é 0
        names_list[i]['count'] = (names_list[i]['count']-count_min)/(count_max-count_min)
#         pass
#         names_list[i]['count'] = (names_list[i]['count']-count_min)/(count_max-count_min)*(b-a)+a
#         names_list[i]['count'] = (names_list[i]['count']-list_avg)/list_std
        
    # save json
    file_path = os.path.join(outputs,'names_list_{}.json'.format(str(topic)))
    json.dump(names_list, open(file_path, 'w'))

# generate json files related to tokens
Get connections between docs and tokens. Those represent the edges of a graph.

In [41]:
for topic in topics:
    #build list of docs for each token
    melted_df = get_melted_df(topic_vis_dict[topic], 'tokens')
    melted_df = melted_df.groupby(['tokens'])['doc_id'].apply(list)
    melted_df = pd.DataFrame({'token':melted_df.index, 'docs':melted_df.values})
    
    #get data from lda_model
    topic_tokens = lda_model.print_topics(-1, num_words=20)[topic]
    topic_tokens = topic_tokens[1].split('+')
    
    #topic_tokens will be a list with elements such as: '0.085*"nuclear" ',
    tokens_list = []
    for pair in topic_tokens:
        #get tokens and scores
        token = re.sub('.*\*"(.*)".*', r'\1', pair)
        token = remove_special_char(token)
        score = re.sub(' *(.*)\*.*', r'\1', pair)
        score = float(score)

        #remove tokens irrelevantes/problematicos
#         if token == 'fins': continue

        #get list of docs
        docs = melted_df[melted_df['token'] == token]['docs'].tolist()
        if docs: 
            docs = docs[0]
            temp_dict = {'token': token, 'score': score, 'docs': docs}
            tokens_list.append(temp_dict)
    file_path = os.path.join(outputs,'tokens_list_{}.json'.format(str(topic)))
    json.dump(tokens_list, open(file_path, 'w'))

# build object for d3 observable vis
temporary section

In [259]:
topics_object = []
for i in range(100):
    temp_topic = {}
    temp_topic['topic'] = 'topic{}'.format(i)
    temp_topic['documents'] = 'await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/topic_{}.json")'.format(i)
    temp_topic['names'] = 'await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/names_list_{}.json")'.format(i)
    temp_topic['tokens'] = 'await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/tokens_list_{}.json")'.format(i)
    topics_object.append(temp_topic)

topics = [{topic: "topic1", documents: await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/topic_1.json"), names: await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/names_list_1.json"),tokens: await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/tokens_list_1.json")},{topic: "topic2", documents: await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/topic_2.json"), names: await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/names_list_2.json"), tokens: await d3.json("https://raw.githubusercontent.com/Marcelobbr/dissertation/master/outputs/tokens_list_2.json")}]