LDA Model Fitting and Analysis
* Fit an LDA model to a corpus of patent abstracts (`clean_abstract`).
* Identify the topics discovered by the model in terms of their most important words, then use the model to predict the topic probability distribution for a given document.


In [None]:
import os
import pandas as pd 
# Plotting libraries
%matplotlib inline
import matplotlib.pyplot as plt

import json

import gensim
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary
# Every relative path used by the following cells ('../...' and './...') is resolved against this directory.
os.chdir('/Users/sheeroh/Box Sync/2_projects/insightDSNYC/data/model')
#print ("We load our dictionary : %s"% type(dictionary))

In [None]:
# Load the patent documents DataFrame from its saved pickle file
# (one directory above the current working directory).
patdocs = pd.read_pickle( '../patdocs_clean.pkl')

# Uncomment to eyeball the first two rows and confirm the load succeeded.
#patdocs.head(2)

In [None]:
# Number of rows in the "abstract" column; used below as the number of
# documents to run through the bigram model.
num_patents = len(patdocs["abstract"])
print(num_patents)

# The abstracts are stored as JSON in a .txt file; parse them back into Python objects.
with open('../clean_abstract.txt', 'r') as infile:
    clean_abstract = json.loads(infile.read())

In [None]:
from gensim.models.phrases import Phrases

# Fit gensim's Phrases model on the abstracts; detected two-word phrases are
# joined with '_' (the delimiter passed below).
bigram = models.Phrases(clean_abstract, delimiter=b'_')
print(bigram)

# Run the first num_patents abstracts through the fitted model, keeping one
# transformed token list per document, in the original order.
final_abstract = [bigram[clean_abstract[doc_idx]] for doc_idx in range(0, num_patents)]
final_column = pd.Series(final_abstract)

In [None]:
# Attach the bigram-transformed token lists to the DataFrame as a new column.
final_column = pd.Series(final_abstract)
#print(final_column)
# .values passes a plain array, so rows are matched by position rather than by
# index label; the array length must equal the number of rows in patdocs.
patdocs['final_column']= final_column.values
#patdocs.drop(['Unnamed: 0'], axis=1)

In [None]:
import random
import pickle

# Hold out 1000 randomly chosen documents as the test set and train on the rest.
# NOTE: random.sample raises ValueError if there are fewer than 1000 documents.
random.seed(7)  # fixed seed so the split is reproducible
all_indices = list(range(0, len(final_abstract)))
train_set = random.sample(all_indices, len(final_abstract) - 1000)

# Membership is tested against a set: checking each index against the
# train_set list would be quadratic in the corpus size.
train_lookup = set(train_set)
test_set = [x for x in all_indices if x not in train_lookup]

train_texts = [final_abstract[i] for i in train_set]
test_texts = [final_abstract[i] for i in test_set]

# Persist indices and texts together so the exact split can be reloaded later.
with open('../pat_abstract_train_test_sets_new.pkl', 'wb') as split_file:
    pickle.dump([train_set, test_set, train_texts, test_texts], split_file)

#### Jaccard Coefficient to Determine Number of Topics

In [None]:
# Copy this for JS
def jaccard_similarity(query, document):
    """Return the Jaccard similarity of two token collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is |intersection| / |union| as a float in [0, 1].

    Args:
        query: iterable of hashable items (e.g. the top words of a topic).
        document: iterable of hashable items to compare against.

    Returns:
        float: the similarity; 0.0 when both inputs are empty, because the
        ratio is undefined there and would otherwise raise ZeroDivisionError.
    """
    query_terms = set(query)
    document_terms = set(document)
    union = query_terms | document_terms
    if not union:
        return 0.0
    return float(len(query_terms & document_terms)) / float(len(union))

# Candidate numbers of topics; one LDA model is trained for each value.
topicnums = [1,5,10,15,20,30,40,50]
# Token <-> id mapping built from the training documents only.
dictionary = corpora.Dictionary(train_texts)
# Save the mapping so later cells can reload it; the context manager makes
# sure the file is flushed and closed.
with open('./abstract_ldamodels_bow_dictionary_new.pkl', 'wb') as dictionary_file:
    pickle.dump(dictionary, dictionary_file)

### Train the model

In [None]:
#train the model
corpus = [dictionary.doc2bow(text) for text in train_texts]

ldamodels_bow = {}
for i in topicnums:
    random.seed(42)
    %time ldamodels_bow[i] = models.ldamodel.LdaModel(corpus,num_topics=i,id2word=dictionary)
    ldamodels_bow[i].save('./ldamodels_bow_'+str(i)+'_new.lda')

In [None]:
# Find the words in topics to determine a good number of topics to use
topicnums = [1,5,10,15,20,30,40,50]

# lda_topics[k] is a list (one entry per topic of the k-topic model) of that
# topic's top words.
lda_topics = {}
for i in topicnums:
    lda_model = models.ldamodel.LdaModel.load('./ldamodels_bow_'+str(i)+'_new.lda')
    # formatted=False returns (topic_id, [(word, weight), ...]) pairs, so the
    # words are read directly. Parsing the formatted string with isalpha()
    # split '_'-joined bigrams (and any token with digits) into fragments.
    lda_topics[i] = [[word for word, weight in topic[1]]
                     for topic in lda_model.show_topics(i, formatted=False)]

with open('./abstract_lda_bow_topics_new.pkl', 'wb') as topics_file:
    pickle.dump(lda_topics, topics_file)

In [None]:
# Use Jaccard similarity to measure topic overlap between consecutive models:
# lda_stability[k] is a matrix whose rows are the topics of the k-topic model
# and whose columns are the topics of the next-larger model in topicnums.
lda_stability = {}
for i in range(0, len(topicnums) - 1):
    current_topics = lda_topics[topicnums[i]]
    next_topics = lda_topics[topicnums[i + 1]]
    lda_stability[topicnums[i]] = [
        [jaccard_similarity(topic1, topic2) for topic2 in next_topics]
        for topic1 in current_topics
    ]

# Context manager so the pickle file is flushed and closed deterministically.
with open('./abstract_lda_bow_stability.pkl', 'wb') as stability_file:
    pickle.dump(lda_stability, stability_file)

In [None]:
import numpy as np
# visualization
import seaborn as sns 
plt.style.use('fivethirtyeight')
#sns.set_style("white")

plt.rcParams['figure.figsize'] = (8,4) 
#plt.rcParams['axes.titlesize'] = 'large'

# Topic counts to plot. Kept under its own name: reassigning the global
# `topicnums` here would make a re-run of the stability cell pair up the
# wrong models (e.g. 10 with 20 instead of 10 with 15).
# NOTE(review): 15 has a stability entry but is not plotted, and xlim below
# hides the point at 40 - confirm both are intentional.
plot_topicnums = [1,5,10,20,30,40]

# NOTE(review): the stability cell writes './abstract_lda_bow_stability.pkl'
# (no '_new' suffix); adjust the path before re-enabling this load.
#lda_stability = pickle.load(open('./abstract_lda_bow_stability_new.pkl','rb'))

# Mean of all pairwise Jaccard similarities for each plotted topic count.
mean_stability = [np.array(lda_stability[i]).mean() for i in plot_topicnums]

with sns.axes_style("white"):
    plt.plot(plot_topicnums, mean_stability, label='Average Overlap Between Topics')
    plt.xlim([0, 30])
    plt.ylim([0, 0.3])
    plt.xlabel('Number of topics')
    plt.ylabel('Average Jaccard similarity')   
    plt.title('Average Jaccard Similarity Between Topics')
    plt.legend()    
    plt.show()
    # from the plot, select the optimal number of topics

### Optimal Model - Load optimal # of topics into model

In [None]:
#reload the lda model 
import pickle
# Topic count chosen from the stability plot; selects which saved model to load.
num_topics = 10
lda_model = models.ldamodel.LdaModel.load('./ldamodels_bow_'+str(num_topics)+'_new.lda')
# Reload the dictionary saved at training time; the context manager closes the
# file as soon as it has been read.
# NOTE: only unpickle files you produced yourself - pickle can execute code.
with open('./abstract_ldamodels_bow_dictionary_new.pkl', 'rb') as dictionary_file:
    doc_dict = pickle.load(dictionary_file)

In [None]:
#print out the top words in each topic
# formatted=False returns (topic_id, [(word, weight), ...]) pairs, so words are
# read directly. Filtering the formatted string with isalpha() broke every
# '_'-joined bigram into two separate words.
lda_topics = lda_model.show_topics(num_topics, formatted=False)
lda_topics_words = [[word for word, weight in topic[1]] for topic in lda_topics]
# Human-readable label per topic: "topic <position>: word word ...".
lda_topics_disp = [("topic "+str(i)+": ")+" ".join(topic) for i,topic in enumerate(lda_topics_words)]
print(lda_topics_disp)
#already saved
#pickle.dump(lda_topics_disp,open('./abstract_lda_bow_topics_new.pkl','wb'))

### group topic extraction

In [None]:
#patdocs.drop(['Unnamed: 0'], axis =1)

In [None]:
# Reload the training-time dictionary; the context manager closes the file
# once it has been read.
with open('./abstract_ldamodels_bow_dictionary_new.pkl', 'rb') as dictionary_file:
    dictionary = pickle.load(dictionary_file)

# group_doc[year] is the list of token lists ('final_column') of every row
# whose 'year' column equals that year, for 2010 through 2015.
group_doc = {
    year: patdocs.loc[patdocs['year'] == year, 'final_column'].tolist()
    for year in range(2010, 2016)
}


In [None]:
#topic vectors
# group_topics[year] is a list of length num_topics holding the mean topic
# probability over ALL documents of that year. Scoring only
# group_doc[year][0] described a single patent rather than the year, and
# raised IndexError for a year with no documents.
group_topics = {}
for year, docs in group_doc.items():
    topic_totals = [0.0] * num_topics
    for tokens in docs:
        doc_corp = dictionary.doc2bow(tokens)
        # minimum_probability=0.0 asks for every topic instead of dropping
        # low-probability ones; float() keeps the values plain Python floats
        # so they can be dumped to JSON later.
        for topic_id, prob in lda_model.get_document_topics(doc_corp, minimum_probability=0.0):
            topic_totals[topic_id] += float(prob)
    doc_count = len(docs)
    # A year without documents gets an all-zero vector rather than a crash.
    group_topics[year] = [total / doc_count if doc_count else 0.0 for total in topic_totals]

#pickle.dump(group_topics, open('./abstract_lda_bow_topics_new2.pkl','wb')) 
print(group_topics)

In [None]:
# Pivot group_topics (group -> per-topic values) into y_topics
# (topic -> per-group values), following the key order of group_doc.
y_topics = {
    topic_id: [group_topics[group_key][topic_id] for group_key in group_doc.keys()]
    for topic_id in range(num_topics)
}
print(y_topics)
# Plot positions for the groups, listed in the same order as group_doc's keys.
x = [ 2010, 2011, 2012, 2013, 2014, 2015]

In [None]:
# Draw one line per topic: its value in each year.
for topic_id in range(num_topics):
    topic_label = "topic "+str(topic_id)
    plt.plot(x, y_topics[topic_id], label=topic_label)
plt.title('Topic Variation from years 2010-2015')
plt.xlabel('Year')
plt.ylabel('Probability distribution')   
# Show a tick at every year, labelled with the year itself.
plt.xticks(x, x)
plt.legend()
plt.show()

In [None]:
# List the full label of every topic, one per line, as a key for the plot.
for topic_idx in range(num_topics):
    topic_label = lda_topics_disp[topic_idx]
    print (topic_label)

In [None]:
# Spot check: show the raw show_topics entry at index 1 next to the word list
# derived from it.
print(lda_topics[1])
print (lda_topics_words[1])

### Visualization - Save the topic distribution in a format that D3plus visualizations can read

 Next, we will save this topic distribution in a format that D3plus visualizations can read. 
 * This is a JSON file with an array structure. 
 * Each element of the array is a dictionary. 
 * Each of these dictionaries holds exactly the same set of keys, but with
 different values.
 
First, we will need to create a shorter set of topic descriptors, since the default 10 words per topic is too much. We keep the full 10-word descriptors in the lda_topics_disp variable and create a new set with only the first 4 words per topic, stored in lda_topics_disp2.

In [None]:
# formatted=False returns (topic_id, [(word, weight), ...]) pairs, so words are
# read directly. Filtering the formatted string with isalpha() split '_'-joined
# bigrams, which made the [:n] cut below keep word fragments instead of the
# first n terms.
lda_topics = lda_model.show_topics(num_topics, formatted=False)
lda_topics_words = [[word for word, weight in topic[1]] for topic in lda_topics]
# Long label: "topic <position>: " followed by all of the topic's top words.
lda_topics_disp = [("topic "+str(i)+": ")+" ".join(topic) for i,topic in enumerate(lda_topics_words)]
# Short label: only the first n words of each topic, without the prefix.
n=4
lda_topics_words2 = [topic_words[:n] for topic_words in lda_topics_words]
lda_topics_disp2 = [" ".join(topic) for topic in lda_topics_words2]
lda_topics_disp2

In [None]:
# Spot check: shortened (first n words) word list of the entry at index 1.
print (lda_topics_words2[1])

In [None]:
# Spot check: full word list of the entry at index 1, for comparison with the shortened one.
print (lda_topics_words[1])

Next, we will create a Python list, `data`, and append the values to it. 

* Remember the D3plus format requirements? Each array item has to be a dictionary with the same set of keys. 
* Here we use the y_topics variable we created previously and cycle through all topics and all years. 
* Each topic-year combination will yield a unique value for the proportion of that certain topic in that particular year - and therefore an entry in our data vector.

We store the year values under the `year` key and the topic proportions under the `value` key. Furthermore, we store the numerical value of the topic under `topic_id`, as well as two versions of the descriptive topic labels: the 10-word version under the key `topic_name`, and the 4-word version under `topic_name2`.

In [None]:
# The output file below is written relative to this directory.
os.chdir('/Users/sheeroh/Box Sync/2_projects/insightDSNYC/data/d3plus')

# Build one record per (topic, year) pair; every record carries the same keys.
data=[]
for topic in range(len(y_topics)):
    for year_idx, year_value in enumerate(x):
        data.append({"year":year_value,
                     # float(): values coming from the model may be NumPy
                     # scalars, which json.dump cannot serialise (e.g. float32).
                     "value":float(y_topics[topic][year_idx]),
                     "topic_id":topic,"topic_name":lda_topics_disp[topic],
                     "topic_name2":lda_topics_disp2[topic]})

# File name embeds the number of topics, e.g. 'd3plus10.json'.
with open('d3plus'+str(num_topics)+'.json', 'w') as outfile:
    json.dump(data, outfile)

## Some questions??

1. *NJ*: I see the files are generated, but when I try to visualize them through the server, it doesn't work, and I am not sure what I am doing wrong.

2. Why gensim as opposed to scikit-learn? I think of gensim as being used for word2vec, but apparently it has an LDA module, too -- I didn't know that!
Why LDA (as opposed to NMF, for example)?

3. Did you try others and this gave you the best result? 
4. How did your model perform on a hold-out (test) set of patents? Was it able to assign topics?