In [31]:
from utility.postgres_manager import *
postgresManager = PostgresManager()

In [32]:
from flask_app.utility.documents_gensim import *
import numpy as np
corpus_path = "./gensim_data/updated_subjects/politic-ai-corpus.mm"
dictionary_path = corpus_path + ".dict"
dictionary = gensim.corpora.Dictionary.load(dictionary_path)
corpus = MmCorpusMeta(corpus_path, id2word=dictionary, metadata=True)

In [33]:
dossiers_subjects = postgresManager.select(
    "SELECT dossier_id,subject_id FROM politicalai_ict.subject_to_dossier")


In [34]:
dossier_map = {value[0]: [] for value in dossiers_subjects}
for value in dossiers_subjects:
    dossier_map[value[0]].append(value[1])

In [35]:
def expand_doc2bow(vec1, vec2):
    index1 = 0
    index2 = 0
    if len(vec1) == 0:
        vec1 = vec2
        return vec1
    while index1 < len(vec1) and index2 < len(vec2):
        if vec1[index1][0] == vec2[index2][0]:
            vec1[index1] = (vec1[index1][0], vec1[index1][1] + vec2[index2][1])
            index1 += 1
            index2 += 1
        else:
            vec1.append(vec2[index2])
            index2 += 1
    return vec1

dossier_corpus = []
dossier_meta = {}
for key, value in dossier_map.items():
    composed_doc = []
    dossier_meta[len(dossier_corpus)] = key
    for doc in value:
        if doc in corpus.postgres_id_to_doc_id.keys():
            composed_doc = expand_doc2bow(composed_doc, corpus[corpus.postgres_id_to_doc_id[doc]])
    dossier_corpus.append(composed_doc)


In [36]:
gensim.corpora.MmCorpus.serialize("./gensim_data/dossier_corpus.mm", dossier_corpus)

In [37]:
corpus = gensim.corpora.MmCorpus("./gensim_data/dossier_corpus.mm")

In [38]:
import gensim
from sklearn.metrics.pairwise import cosine_similarity
tfidf = TfidfModel(corpus, id2word=dictionary)
corpus_tfidf = tfidf[corpus]
index = gensim.similarities.MatrixSimilarity(tfidf[corpus])
sims = index[corpus_tfidf]
sims = {'politicai': {}}
n_topics = 100
lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
lsi_index = gensim.similarities.MatrixSimilarity(lsi[corpus_tfidf])
sims['politicai']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]]
                                     for i in range(len(corpus))])






In [39]:
def most_similar(i, X_sims, topn=None):
    """return the indices of the topn most similar documents with document i
    given the similarity matrix X_sims"""

    r = np.argsort(X_sims[i])[::-1]
    if r is None:
        return r
    else:
        return r[:topn]

#LSI
print(most_similar(0, sims['politicai']['LSI'], 20))



[  0 114  31 231 140 118 170   1 181 191 128 235  45  23 111 105 204 173
 136 206]


In [40]:
postgresManager.commit_changes()
for i in range(0, len(corpus)):
    similar = most_similar(i, sims['politicai']['LSI'], 20)
    left_dossier_id = dossier_meta[i]
    for similar_id in similar:
        right_dossier_id = dossier_meta[similar_id]
        if right_dossier_id != left_dossier_id:
            postgresManager.insert_with_args(
                "insert into politicalai_ict.related_dossiers(id,left_dossier_id,right_dossier_id)"
                " values(DEFAULT ,%s,%s)",
                (left_dossier_id, right_dossier_id))
postgresManager.commit_changes()

In [11]:
import csv

from utility.document_streaming import *
documentStreaming = PostgresStreaming()

categories_meta = []
categories_keywords = {}
with open('./utility/files/categories_keywords.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV, None)
    for row in readCSV:
        categories_keywords[row[0]] = []
with open('./utility/files/categories_keywords.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV, None)
    for row in readCSV:
        categories_keywords[row[0]].append(row[2])


In [12]:
for key, value in categories_keywords.items():
    meta = {'id':key, 'text':documentStreaming.tokenize(documentStreaming.process_document(value))}
    categories_meta.append(meta)

In [13]:
categories_corpus = []
for key, value in categories_keywords.items():
    categories_meta[len(categories_corpus)] = key
    categories_corpus.append(dictionary.doc2bow(value))


In [14]:
gensim.corpora.MmCorpus.serialize("./gensim_data/categories_corpus.mm", categories_corpus)
categories_corpus = gensim.corpora.MmCorpus("./gensim_data/categories_corpus.mm")

In [15]:
tfidf_cat = TfidfModel(categories_corpus, id2word=dictionary)
corpus_tfidf_cat = tfidf[categories_corpus]
index_cat = gensim.similarities.MatrixSimilarity(tfidf_cat[categories_corpus])

In [20]:
def getCategoriesForDoc(tfidf,index,doc):
    td = tfidf[doc]
    similar = index[td]
    r = np.argsort(similar)[::-1]
    catIds = [value for value in r if value>0.15][:3]
    if(len(catIds)==0):
        catIds = r[:1]
    return catIds

In [31]:
postgresManager.commit_changes()
for i in range(0,len(corpus)):
    ids = getCategoriesForDoc(tfidf_cat,index_cat,corpus[i])
    for id in ids:
        postgresManager.insert_with_args(
            "insert into politicalai_ict.dossier_category(id,dossier_id,category_id)"
            " values(DEFAULT ,%s,%s)",
            (dossier_meta[i], categories_meta[int(id)]))
postgresManager.commit_changes()

In [17]:
td = tfidf_cat[corpus[0]]
similar = index_cat[td]
r = np.argsort(similar)[::-1]

In [26]:
postgresManager.commit_changes()
votes_postgres = postgresManager.select(
    "SELECT name,subject_id FROM politicalai_ict.vote")
postgresManager.commit_changes()

In [27]:
import csv
from utility.document_streaming import *
documentStreaming = PostgresStreaming()
votes_corpus = []
votes_meta = {}
for row in votes_postgres:
    votes_meta[len(votes_corpus)] = row[1]
    votes_corpus.append(dictionary.doc2bow(documentStreaming.tokenize(row[0])))
gensim.corpora.MmCorpus.serialize("./gensim_data/votes_corpus.mm", votes_corpus)
categories_corpus = gensim.corpora.MmCorpus("./gensim_data/votes_corpus.mm")



In [28]:
votes_meta

{0: 5400,
 1: 5406,
 2: 5415,
 3: 5419,
 4: 5420,
 5: 5464,
 6: 5432,
 7: 5450,
 8: 5453,
 9: 5455,
 10: 5462,
 11: 5463,
 12: 5485,
 13: 5465,
 14: 5466,
 15: 5467,
 16: 5474,
 17: 5476,
 18: 5486,
 19: 5487,
 20: 5488,
 21: 5489,
 22: 5494,
 23: 5507,
 24: 5499,
 25: 5506,
 26: 5511,
 27: 5515,
 28: 5517,
 29: 5522,
 30: 5539,
 31: 5565,
 32: 5565,
 33: 5565,
 34: 5571,
 35: 5571,
 36: 5580}

In [29]:
for i in range(0,len(votes_corpus)):
    td = tfidf[votes_corpus[i]]
    similar = lsi_index[lsi[td]]
    r = np.argsort(similar)[::-1]
    dossIds = [value for value in r if value>0.15][:10]
    if(len( dossIds)==0):
        dossIds = r[:1]
    postgresManager.insert_with_args(
        "insert into politicalai_ict.subject_to_dossier(rel_id,subject_id,dossier_id,ml_algorithm,clustering_algorithm)"
        " values(DEFAULT ,%s,%s,%s,NULL) RETURNING rel_id",
        (votes_meta[i], dossier_meta[i], 'lsi', ))
postgresManager.commit_changes()