In [1]:
import numpy as np
from gensim.models import LsiModel
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity
from sklearn.metrics.pairwise import cosine_similarity
from utility.postgres_manager import *
import gensim
from ml_classes.mm_with_meta import *
corpus_path = "./gensim_data/updated_subjects/politic-ai-corpus.mm"
trained_path = "./gensim_data/updated_subjects/trained/politic-ai-trained.tr"
n_topics = 500
n_clusters = 100

postgresManager = PostgresManager()



connecting to PostgreSQL database...


connection established
PostgreSQL 10.3 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.3 20140911 (Red Hat 4.8.3-9), 64-bit


In [2]:
dictionary = gensim.corpora.Dictionary.load(corpus_path+".dict")
corpus = MmCorpusMeta(corpus_path, id2word=dictionary, metadata=True)
print(corpus)

MmCorpus(1054 documents, 23876 features, 526706 non-zero entries)


In [3]:
from gensim.models import KeyedVectors
word2vec_model = KeyedVectors.load('./word2vec-pretrained/gensim.dutch')


In [4]:
word2vec_model.init_sims(replace=True)

In [5]:
def document_vector(word2vec_model, doc, dict):
    # remove out-of-vocabulary words
    doc = [dict[word[0]] for word in doc if dict[word[0]] in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)


In [6]:
tfidf = TfidfModel(corpus, id2word=dictionary)
corpus_tfidf = tfidf[corpus]
index = gensim.similarities.MatrixSimilarity(tfidf[corpus])
sims = index[corpus_tfidf]


In [7]:

tfidf.save(trained_path + ".tfidf")
gensim.utils.pickle(corpus_tfidf,trained_path+".tfidf.corpus")
index.save(trained_path+"tfidf.corpus.index")
gensim.utils.pickle(sims,trained_path+"tfidf.corpus.index.sims")

In [8]:
sims = {'politicai': {}}
lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
sims['politicai']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]]
                                     for i in range(len(corpus))])

In [9]:
lsi.save(trained_path+".lsi")
lsi_index.save(trained_path+".lsi.index")

In [10]:
sims['politicai']['centroid'] = cosine_similarity(np.array([document_vector(word2vec_model, doc, dictionary)
                                                            for doc in corpus]))

In [11]:
def most_similar(i, X_sims, topn=None):
    """return the indices of the topn most similar documents with document i
    given the similarity matrix X_sims"""

    r = np.argsort(X_sims[i])[::-1]
    if r is None:
        return r
    else:
        return r[:topn]

#LSI
print(most_similar(0, sims['politicai']['LSI'], 20))

#Centroid
print(most_similar(0, sims['politicai']['centroid'], 20))


[  0 804 424 778 777 776 765 754 477 479 480 743 742 486 487 734 492 495
 498 501]
[  0 477 877 682 424 854 631 649 849 847 845 842 479 664 480 628 625 486
 487 492]


In [12]:
from sklearn.cluster import SpectralClustering
def spectral_clustering(n_clusters, method):
    n_clusters = n_clusters
    sc = SpectralClustering(n_clusters=n_clusters,
                            affinity='precomputed')
    matrix = sims['politicai'][method]

    sc.fit(matrix)
    print("Method: {}".format(method))
    print(sc.labels_[1::10])
    print("-"*10)
    return sc

In [13]:
sc_lsi = spectral_clustering(n_clusters=n_clusters, method="LSI")
count_dic = dict.fromkeys(sc_lsi.labels_, 0)
for label in sc_lsi.labels_:
    count_dic[label] += 1
print(count_dic)

Method: LSI
[ 1 29 27 22 46 26 44 18 86 64 98 95 39 94 70 50 50 96 77 38 97 48 44 97 67
 18 76  9  2  7 16 27  2 40  6 24 35  2  6 23 11 36 45  6 14 31 28  0  1  3
  2  1  1  3  3  3  2  1  1  3  2  1  3  2  1  3  2  1  3  5 53 61 15 57 56
 13  6 13  1  5 29  6  9 33 22  1  6  5 17 24  3  5  2 13  4  1  1 13  2 11
  6 14 31 13  4  2]
----------
{0: 3, 1: 119, 2: 119, 3: 119, 4: 12, 5: 25, 6: 28, 7: 28, 8: 3, 9: 30, 10: 18, 11: 24, 12: 9, 13: 28, 14: 11, 15: 3, 16: 18, 17: 28, 18: 17, 19: 3, 20: 9, 21: 8, 22: 6, 23: 12, 24: 12, 25: 12, 26: 4, 27: 10, 28: 3, 29: 28, 30: 3, 31: 6, 32: 3, 33: 12, 34: 6, 35: 8, 36: 8, 37: 2, 38: 2, 39: 3, 40: 10, 41: 3, 42: 3, 43: 5, 44: 11, 45: 5, 46: 3, 47: 8, 48: 3, 49: 3, 50: 5, 51: 4, 52: 3, 53: 8, 54: 3, 55: 2, 56: 2, 57: 3, 58: 3, 59: 3, 60: 2, 61: 5, 62: 3, 63: 3, 64: 6, 65: 3, 66: 3, 67: 2, 68: 3, 69: 8, 70: 6, 71: 5, 72: 2, 73: 4, 74: 3, 75: 2, 76: 5, 77: 14, 78: 4, 79: 4, 80: 2, 81: 2, 82: 2, 83: 2, 84: 3, 85: 2, 86: 2, 87: 5, 88: 3, 89: 3, 90: 3

In [14]:
sc_word2vec = spectral_clustering(n_clusters=n_clusters, method="centroid")
count_dic = dict.fromkeys(sc_lsi.labels_, 0)
for label in sc_word2vec.labels_:
    count_dic[label] += 1
print(count_dic)

Method: centroid
[ 3 12 68 75 30 15  2  2  2  2  2  3  1 62  2 46  1  2  2 57  2  3  2  3 61
  3  2 35  1 16 43 68  1 19 32  9 40  1 32 10 22  4  6 32 18 80 34 58  3  2
  1  3  3  2  2  2  1  3  3  2  1  3  2  1  3  2  1  3  2  0 33 45 61 41  2
 21 32 21  3  0 12 32 35 11 75  3 32  0  8  9  2  0  1 21 28  3  3 21  1 22
 32 18 80 21 28  1]
----------
{0: 25, 1: 134, 2: 205, 3: 162, 4: 8, 5: 3, 6: 5, 7: 3, 8: 28, 9: 12, 10: 12, 11: 12, 12: 28, 13: 18, 14: 1, 15: 3, 16: 28, 17: 4, 18: 11, 19: 9, 20: 8, 21: 28, 22: 24, 23: 1, 24: 9, 25: 1, 26: 1, 27: 3, 28: 12, 29: 2, 30: 3, 31: 4, 32: 28, 33: 8, 34: 3, 35: 30, 36: 3, 37: 1, 38: 1, 39: 9, 40: 8, 41: 3, 42: 3, 43: 18, 44: 2, 45: 5, 46: 1, 47: 8, 48: 3, 49: 1, 50: 1, 51: 3, 52: 3, 53: 1, 54: 1, 55: 3, 56: 3, 57: 2, 58: 3, 59: 3, 60: 1, 61: 8, 62: 7, 63: 1, 64: 1, 65: 1, 66: 1, 67: 3, 68: 10, 69: 1, 70: 1, 71: 1, 72: 1, 73: 1, 74: 1, 75: 6, 76: 3, 77: 3, 78: 1, 79: 3, 80: 12, 81: 1, 82: 1, 83: 1, 84: 1, 85: 1, 86: 1, 87: 1, 88: 1, 89: 1, 90: 

In [15]:
subject_dict = []
doc_metadata = corpus.iter_meta()
for _, meta in corpus.doc_metadata.items():
    s_dict = {}
    s_dict['sub_id'] = meta['metadata']['id']
    s_dict['doc_id'] = meta['id']
    s_dict['seq_id'] = meta['corpus_seq_id']
    cluster = {}
    cluster['lsi'] = sc_lsi.labels_[meta['corpus_seq_id']]
    cluster['word2vec'] = sc_word2vec.labels_[meta['corpus_seq_id']]
    s_dict['spec_clust'] = cluster
    subject_dict.append(s_dict)



In [16]:
def create_clusters(algorithm_name,subject_dict):
    cluster_docs = {
        label: {'documents':[]} for label in sc_lsi.labels_
    }
    for meta in subject_dict:
        label = meta['spec_clust'][algorithm_name]
        doc_dic = {'doc_id': meta['doc_id'], 'sub_id': meta['sub_id'], 'doc_uniq_words': len(corpus[meta['seq_id']]),
                   'seq_id': meta['seq_id']}
        cluster_docs[label]['documents'].append(doc_dic)
    return cluster_docs
cluster_docs_word2vec = create_clusters('word2vec', subject_dict)
cluster_docs_lsi = create_clusters('lsi',subject_dict)

In [17]:

def attach_title_to(cluster_docs, postgresManager):
    for label, cluster in cluster_docs.items():
        max_uniq_words_sub_id = max(cluster['documents'], key=lambda x: x['doc_uniq_words'])
        subject = postgresManager.select(
            "Select * FROM politicalai_ict.subject where id = {0};".format(max_uniq_words_sub_id['sub_id']))
        postgresManager.commit_changes()
        cluster_docs[label]['title'] = subject[0][1]
    return cluster_docs
#cluster_docs_word2vec = attach_title_to(cluster_docs_word2vec, postgresManager)
cluster_docs_lsi = attach_title_to(cluster_docs_lsi, postgresManager)


In [18]:
def insert_cluster(cluster_docs, postgresManager, algorithm_name):
    for label, cluster in cluster_docs.items():
        postgresManager.insert_with_args(
            "insert into politicalai_ict.dossier(id,name,description,onlineid,ml_algorithm,clustering_algorithm)"
            " values(DEFAULT ,%s,NULL,NULL,%s,%s) RETURNING id",
            (cluster['title'], algorithm_name, 'spectral-clustering'))
        postgresManager.commit_changes()
        cluster_id = postgresManager.cursor.fetchone()[0]
        cluster['database_cluster_id'] = cluster_id
        for doc in cluster['documents']:
            postgresManager.insert_with_args(
                "insert into politicalai_ict.subject_to_dossier(rel_id,subject_id,dossier_id,ml_algorithm,clustering_algorithm)"
                " values(DEFAULT ,%s,%s,%s,%s) RETURNING rel_id",
                (doc['sub_id'], cluster_id, algorithm_name, 'spectral-clustering'))
            postgresManager.commit_changes()
            try:
                rel_id = postgresManager.cursor.fetchone()[0]
                doc['database_rel_id'] = rel_id
            except psycopg2.ProgrammingError:
                # take some other action
                print("Error inserting doc {0} from topic {1}".format(doc['sub_id'], label))


In [25]:
insert_cluster(cluster_docs_word2vec, postgresManager, 'word2vec')
postgresManager.commit_changes()


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None


None
None
None


None
None
None


None
None
None
None


None
None
None


None
None
None


None
None
None
None
None


None
None
None
None


None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None
None
None
None


None
None
None


In [21]:
insert_cluster(cluster_docs_lsi, postgresManager, 'lsi')
postgresManager.commit_changes()


In [53]:
query = "Mooi zegd mien jong’, zeggen we dan in Groningen, maar praktijk is dat minister het met de NAM doet, de verbinding… https://t.co/7dgkkNH3P2"


def tokenize(document):
    tokens = list(gensim.utils.tokenize(document, lower=True))
    return tokens

copy_dictionary = gensim.corpora.Dictionary.load(corpus_path + ".dict")
copy_dictionary.filter_extremes(no_below=1, no_above=0.5)
bow = copy_dictionary.doc2bow(tokenize(query))
sims_td = tfidf[bow]
sims_td_ind = index[bow]
sims_lsi = lsi_index[lsi[sims_td]]
sims_lsi_sorted = sorted(enumerate(sims_lsi), key=lambda item: -item[1])
sims_td_sorted = sorted(enumerate(sims_td_ind), key=lambda item: -item[1])


In [54]:
print(sims_td_sorted[0:5])


[(218, 0.12537736), (2, 0.12504685), (47, 0.11184342), (506, 0.10252175), (197, 0.10093142)]


In [59]:
corpus.doc_metadata[218]['metadata']['id']

362

In [60]:
print(sims_lsi_sorted[0][1])

0.494469
