### doc_topics
ESTABELECE RELAÇÕES ENTRE DOCUMENTOS E TÓPICOS NA BASE DE DADOS NO MYSQL. TRABALHAMOS COM O ACERVO **ANTONIO AZEREDO DA SILVEIRA, MINISTÉRIO DAS RELAÇÕES EXTERIORES**.

In [1]:
import nltk
import os
import codecs
import string
from gensim import corpora, models, similarities #Latent Dirichlet Allocation implementation with Gensim
import sqlite3
from IPython.display import clear_output
import pickle

### set user-specific variables
Checks which operating system is in use and sets user-specific variables accordingly. Renato = Linux; Marcelo = nt (Windows).

Also sets working folders

In [2]:
# Project folders, expressed relative to the notebook's working directory.
inputs = os.path.join("..", "inputs")
outputs = os.path.join("..", "outputs")

# Text encoding depends on the operating system reported by os.name.
encoding_type = 'utf-8' if os.name == 'nt' else 'ISO-8859-1'

if os.name == 'nt':
    # Extra output location; only defined when running on Windows.
    outer_outputs = r'D:\pseudo-dropbox\backups-fgv\azeredo_lda'

#### Load pickle files

In [3]:
# Paths of the pickled LDA artefacts inside the inputs folder.
file_corpus = os.path.join(inputs, 'LDAcorpus.pkl')
file_dictionary = os.path.join(inputs, 'LDAdictionary.pkl')
file_lda = os.path.join(inputs, 'model_lda_100_rs_00.pkl')

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
# Each file is opened in a `with` block so the handle is closed right after
# loading instead of being left open for the rest of the session.
with open(file_corpus, 'rb') as fh:
    corpus = pickle.load(fh)
with open(file_dictionary, 'rb') as fh:
    dictionary = pickle.load(fh)
with open(file_lda, 'rb') as fh:
    lda = pickle.load(fh)

In [4]:
# Attributes the original author flagged as missing from the loaded model
# (presumably the pickle was saved without them -- TODO confirm).
for _attr_name, _attr_value in (
    ('minimum_phi_value', 0.01),
    ('per_word_topics', False),
    ('dtype', 'float'),
):
    setattr(lda, _attr_name, _attr_value)

In [5]:
# Number of entries in the loaded corpus.
len(corpus)

9054

In [6]:
# List the topics of the loaded model (-1 requests all of them), showing
# 5 words per topic.
lda.print_topics(-1, num_words=5)

[(0,
  '0.069*"transcrito" + 0.051*"observancia" + 0.026*"aé" + 0.018*"vast" + 0.016*"nata"'),
 (1,
  '0.022*"brasil" + 0.020*"governo" + 0.018*"portugal" + 0.015*"áfrica" + 0.015*"angola"'),
 (2,
  '0.039*"chanceler" + 0.032*"telegrama" + 0.027*"serie" + 0.022*"recebido" + 0.015*"exteriores"'),
 (3,
  '0.049*"jeddah" + 0.023*"rené" + 0.017*"gay" + 0.014*"grito" + 0.012*"arara"'),
 (4,
  '0.163*"federal" + 0.068*"câmara" + 0.065*"justiça" + 0.053*"tribunal" + 0.046*"deputados"'),
 (5,
  '0.028*"bittencourt" + 0.026*"aluysio" + 0.024*"borja" + 0.022*"minutada" + 0.018*"én"'),
 (6,
  '0.098*"leurs" + 0.070*"ie" + 0.068*"sra" + 0.041*"cette" + 0.022*"droit"'),
 (7,
  '0.091*"brazil" + 0.066*"government" + 0.056*"international" + 0.046*"brazilian" + 0.043*"foreign"'),
 (8,
  '0.036*"ministro" + 0.028*"relações" + 0.028*"senhor" + 0.025*"exteriores" + 0.020*"brasil"'),
 (9,
  '0.111*"s/a" + 0.096*"ltda" + 0.068*"gestoes" + 0.034*"emprestimo" + 0.026*"goncalves"'),
 (10,
  '0.146*"japão" + 0

In [76]:
# List the topics of the model (-1 requests all of them), showing 5 words
# per topic.
# NOTE(review): this repeats an earlier cell but its recorded output differs,
# so it was presumably run against a different `lda` object -- confirm.
lda.print_topics(-1, num_words=5)

[(0,
  '0.051*"excelência" + 0.034*"senhor" + 0.028*"exteriores" + 0.025*"relações" + 0.023*"ministério"'),
 (1,
  '0.023*"torres" + 0.022*"broissa" + 0.017*"anglo" + 0.017*"egp" + 0.017*"arlette"'),
 (2,
  '0.065*"chanceler" + 0.032*"exteriores" + 0.031*"silveira" + 0.031*"brasil" + 0.018*"ministro"'),
 (3,
  '0.037*"presidente" + 0.036*"brasil" + 0.015*"geisel" + 0.014*"disse" + 0.008*"ter"'),
 (4,
  '0.035*"cuba" + 0.027*"venezuela" + 0.027*"oea" + 0.026*"reunião" + 0.024*"chile"'),
 (5,
  '0.056*"senhor" + 0.034*"excelência" + 0.027*"leurs" + 0.019*"senhora" + 0.019*"place"'),
 (6, '0.009*"ar" + 0.007*"ro" + 0.007*"ão" + 0.006*"cc" + 0.006*"ea"'),
 (7,
  '0.085*"nuclear" + 0.036*"acordo" + 0.035*"energia" + 0.030*"nucleares" + 0.023*"brasil"'),
 (8, '0.038*"cl" + 0.031*"ps" + 0.019*"uh" + 0.015*"hr" + 0.012*"ic"'),
 (9,
  '0.075*"silveira" + 0.062*"azeredo" + 0.054*"antonio" + 0.045*"ministro" + 0.026*"exteriores"'),
 (10,
  '0.145*"instituto" + 0.127*"universidade" + 0.032*"ciênci

#### generates the distribution of topics for each new document
Creates a database with data about correlation score between topics and documents.

Notes:
* The documents are selected from the MySQL database with some filters: main_language = portuguese, phrase estimated readability > 40%.
* 'pt' = Portuguese language.
* For documents whose readability could not be detected because they are too short (fewer than 10 sentences in total), we use -1 as their readability value.

ref for foreign key: https://github.com/davibarreira/FDS-Homeworks/blob/master/hw1/database_app.py

In [78]:
# Display one record of the `docs` query for inspection. `row` is the loop
# variable of the topic_doc cells further down, so one of them must have been
# executed before this cell.
row

('ag_1973.11.20_doc_I-125',
 'pt',
 -1,
 'http://www.fgv.br/cpdoc/acervo/arquivo-pessoal/AAS/textual/documentos-felicitando-azeredo-da-silveira-pela-nomeacao-para-a-pasta-das-relacoes-exteriores-incluindo-agradecimentos-do-titular-as-congratulacoe',
 'mbãoibj/w\n\nmi"?\n\n121680ar cyfâ\n121680ar cvna\n122749ar ebrsl\n22/2/74 1813 oper283\n\nzczc tba352 tpw276102659 201034873\naeba bu brsp 076\nsaopaulo 76 22 1609 p1/57\n\nurgente ,\nexcelentissimo senhor embaixador antonio francisco\nazeredo da silveira brasemb\n\nbuenosaires\n\nnoticia indicação chanceler novo governo nos permite mais\n\numa vez cumprimentar efusivamente e em caráter pessoal\n\nilustre amigo vg convictos pelas privilegiadas oportunidades\n\nque tivemos contatos anteriores vossa excelência vg gestão\n\nsera marco importante vg construtivo e decisivo no desenvolvimento\ndas relações internacionais do pais pt\n\ncol urgent\n\ntpw276 102659 201034873 aeba exc p2/19\n\nrespeitosamente vg flavio mindlin guimaraes vg marklen

### builds topics table

In [93]:
# (Re)creates the `topics` table in the SQLite database and fills it with one
# row per topic id (0..99), each with an empty name.
sql_db = os.path.join(outputs, 'cpdoc_as.sqlite')
conn = sqlite3.connect(sql_db)
try:
    cur = conn.cursor()

    # Start from a clean table on every run.
    cur.execute("DROP TABLE IF EXISTS topics")
    cur.execute('''CREATE TABLE IF NOT EXISTS topics
           (id SMALLINT(6) PRIMARY KEY, name VARCHAR(250)
           DEFAULT NULL)
           ;''')

    # Names are inserted as empty strings.
    # NOTE(review): 100 presumably matches the topic count of the model
    # file (model_lda_100_rs_00.pkl) -- confirm if the model changes.
    topic_name = ''
    query = "INSERT INTO topics VALUES (?, ?)"
    cur.executemany(query, [(i, topic_name) for i in range(0, 100)])

    conn.commit()
finally:
    # Close the connection even when one of the statements above fails.
    conn.close()

### builds topic_doc table

In [94]:
# (Re)creates the `topic_doc` table: for every selected document, stores one
# row per topic returned by the LDA model, with that topic's score.
# Asks for confirmation first because the existing table is dropped.
sql_db = os.path.join(outputs, 'cpdoc_as.sqlite')
conn = sqlite3.connect(sql_db)
try:
    cur = conn.cursor()

    user_input = input("Data will be erased and replaced. Continue? Type 'yes' or 'no' on your keyboard: ")
    if user_input.strip().lower() == 'yes':
        cur.execute("DROP TABLE IF EXISTS topic_doc")
        # The two FOREIGN KEY constraints are separated by a comma, as
        # standard SQL requires.
        cur.execute('''CREATE TABLE IF NOT EXISTS topic_doc
               (doc_id VARCHAR(31), topic_id smallint(6), topic_score FLOAT,
               FOREIGN KEY(doc_id) REFERENCES docs(id),
               FOREIGN KEY(topic_id) REFERENCES topics(id)
               );''')

        # Portuguese documents that are readable enough, plus those whose
        # readability is recorded as -1.
        cur.execute("SELECT * FROM docs WHERE main_language = 'pt' AND (readability > 0.4 OR readability = -1) ")
        data = cur.fetchall()
        numrows = len(data)
        percentil = numrows/100

        query = "INSERT INTO topic_doc VALUES (?, ?, ?)"
        for count, row in enumerate(data):
            # Report progress every 100 documents.
            if count % 100 == 0:
                clear_output()
                print(int(count/percentil),'% done')

            # row[0] is the document id and row[4] the document text
            # (see the sample row displayed earlier in the notebook).
            doc_id = row[0]
            text_bow = dictionary.doc2bow(row[4].split())
            score_list = lda[text_bow]

            # Bind native int/float values instead of strings so the stored
            # data does not depend on SQLite's type-affinity conversion.
            cur.executemany(
                query,
                [(doc_id, int(topic_id), float(topic_score))
                 for topic_id, topic_score in score_list],
            )

        conn.commit()
    else:
        print('Table was not created/replaced')
finally:
    # Close the connection even when the run is interrupted or fails.
    conn.close()

99 % done


### builds topic_doc table from new lda (2020)

In [94]:
# (Re)creates the `topic_doc_new` table: for every selected document, stores
# one row per topic returned by the currently loaded LDA model, with that
# topic's score. The existing table is dropped without confirmation.
sql_db = os.path.join(outputs, 'cpdoc_as.sqlite')
conn = sqlite3.connect(sql_db)
try:
    cur = conn.cursor()

    cur.execute("DROP TABLE IF EXISTS topic_doc_new")
    # The two FOREIGN KEY constraints are separated by a comma, as standard
    # SQL requires.
    cur.execute('''CREATE TABLE IF NOT EXISTS topic_doc_new
           (doc_id VARCHAR(31), topic_id smallint(6), topic_score FLOAT,
           FOREIGN KEY(doc_id) REFERENCES docs(id),
           FOREIGN KEY(topic_id) REFERENCES topics(id)
           );''')

    # Portuguese documents that are readable enough, plus those whose
    # readability is recorded as -1.
    cur.execute("SELECT * FROM docs WHERE main_language = 'pt' AND (readability > 0.4 OR readability = -1) ")
    data = cur.fetchall()
    numrows = len(data)
    percentil = numrows/100

    query = "INSERT INTO topic_doc_new VALUES (?, ?, ?)"
    for count, row in enumerate(data):
        # Report progress every 100 documents.
        if count % 100 == 0:
            clear_output()
            print(int(count/percentil),'% done')

        # row[0] is the document id and row[4] the document text
        # (see the sample row displayed earlier in the notebook).
        doc_id = row[0]
        text_bow = dictionary.doc2bow(row[4].split())
        score_list = lda[text_bow]

        # Bind native int/float values instead of strings so the stored data
        # does not depend on SQLite's type-affinity conversion.
        cur.executemany(
            query,
            [(doc_id, int(topic_id), float(topic_score))
             for topic_id, topic_score in score_list],
        )

    conn.commit()
finally:
    # Close the connection even when the run is interrupted or fails.
    conn.close()

99 % done
