Combine LDA and locally created word2vec library.

In [19]:
import gensim
import gensim.corpora as corpora
from gensim.models import word2vec
import numpy as np
from collections import Counter
import aira_processor_dict as apd
import pickle

Load in an AiraProcessor instance and extract the corpus from it.

In [7]:
ap = apd.AiraProcessor(part_of_speech='all', process_type='asis', min_occurrences=25)
with open('aira_processor_instances/' + ap.output_file_name() + '.p', 'rb') as file:
    ap = pickle.load(file)
    
texts = ap.output_corpus()

Train a local word2vec model.

In [8]:
size_word=100
model_wv=word2vec.Word2Vec(texts, size=size_word,workers=2,min_count=1,iter=10)

Create Mallet LDA model (creating Dictionary and id corpus first). Then convert to a gensim lda model.

In [9]:
# Create Dictionary
id2word = corpora.Dictionary(texts)
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

np.random.seed(100)
size_lda = 14
mallet_path = 'mallet-2.0.8/mallet-2.0.8/bin/mallet' 
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=size_lda, id2word=id2word)

# transform mallet to lda
lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet, gamma_threshold=0.001, iterations=50)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Get the weight of each word in the topic.

In [10]:
the_id=[]
the_vl=[]
the_w =[]

for x in range(size_lda):
    the_id.append([xx[0] for xx in lda.get_topic_terms(x,topn=5)])
    the_sum=sum([xx[1] for xx in lda.get_topic_terms(x,topn=5)])
    the_w.append([xx[1]/the_sum for xx in lda.get_topic_terms(x,topn=5)])  

Map the topic to the word2vec space.

In [13]:
# Mapping the topic to the word2vec space
m=0
the_wv=np.zeros([size_lda,size_word])

for it in the_id:
    n=0
    for it_id in it:
        word_t=id2word[it_id]
        #print (word_t+"**",np.shape(model_wv[word_t]),the_w[m][n])
        the_wv[m]+=[x_word*the_w[m][n] for x_word in model_wv[word_t]]
        n+=1
    m+=1
doc_word=np.zeros([len(texts),size_word])


  # Remove the CWD from sys.path while we load stuff.


Map the documents to the word2vec space.

In [15]:
#Mapping the document to the word2vec space
m=0
for each_doc in texts:
    for each_word in each_doc:
        #print each_word
        doc_word[m]+=model_wv[each_word]
    n=0
    for doc_word_each in doc_word[m]:
        doc_word[m][n]=doc_word[m][n]/len(doc_word[m])
        n+=1
    m+=1


  


Calculate the distance between each document each topic.

In [17]:
def destince(a,b):
    dt=0
    for each_dt in range(len(a)):
        dt+=(a[each_dt]-b[each_dt])*(a[each_dt]-b[each_dt])
    return np.sqrt(dt)
doc_t=np.zeros([len(doc_word),size_lda])
m=0
for each_d in doc_word:
    n=0
    for each_t in the_wv:
        doc_t[m][n]=destince(each_d,each_t)
        n+=1
    m+=1

Make the nearest topic to a document its dominant topic and then count the results. The challenge we were having with combining word2vec and LDA was that it concentrates the documents in one topic.

In [20]:
topic = [np.argmin(i) for i in doc_t]

print(Counter(topic))

Counter({10: 5536, 2: 950, 1: 320, 6: 125, 4: 99, 11: 59, 8: 49, 3: 49, 0: 32, 5: 16, 7: 8, 12: 5, 9: 5, 13: 4})
