## Problem Set 4
## Machine Learning  
## PPHA 30545  
## Yinjiang Xiong  
## Due: March 10

In [70]:
import pandas as pd
import glob
import os
import pickle
import re
import scipy.sparse
from gensim import matutils, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

(a) Load the data as a corpus. 

In [2]:
# Collect every .txt paper under ./SimpleText_auto (relative to the current
# working directory). glob returns paths in arbitrary, OS-dependent order, so
# sort them to keep the document row indices stable from run to run.
file_list = sorted(glob.glob(os.path.join(os.getcwd(), 'SimpleText_auto', '*.txt')))

# One raw string per document, in the same order as file_list.
corpus = []

for file_path in file_list:
    # Decode explicitly rather than relying on the platform default encoding.
    # NOTE(review): assumes the files are UTF-8 (the vocabulary contains
    # non-ASCII symbols) -- confirm against the data source.
    with open(file_path, encoding='utf-8') as f_input:
        corpus.append(f_input.read())


In [3]:
# Raw bag-of-words counts with CountVectorizer's default settings.
vec = CountVectorizer()
X = vec.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older installations.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
# Dense document-term matrix: one row per document, one column per term.
dtm = pd.DataFrame(X.toarray(), columns=feature_names)

In [4]:
# Preview of the raw document-term matrix (first five documents).
dtm.head()

Unnamed: 0,00,000,0000,00003,00004,0001,0001g,00025t,00037,0005g,...,ℓn2qeyℓ,ℓw,ℓϵ,ⅆi1ⅆt,ⅆi2ⅆt,ⅆi3ⅆt,ⅆjˆⅆmδm,ⅆjⅆm,ⅆudcⅆt,ⅇ1
0,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,17,3,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(b) Clean the data. This implies transforming all characters to lowercase and removing stop words, punctuation, and any other words that will not generate meaningful content for identifying the topics. Think about words that are likely common in academic papers (e.g., table, figure, results). Also think about combining forms of the same word (e.g., genes and gene). Be sure to justify your decisions.


In [39]:
# First pass of cleaning, done by the vectorizer itself:
#   * lowercase=True          -> fold everything to lowercase
#   * strip_accents='unicode' -> normalise accented characters
#   * max_df=0.8              -> ignore terms that occur in more than 80% of the documents
#   * min_df=10               -> ignore terms that occur in fewer than 10 documents
# Both thresholds are *document* frequencies, not total occurrence counts.
# They remove ubiquitous filler words and rare/obscure tokens, which shrinks
# the vocabulary drastically (len(vec_1.stop_words_) gives the number dropped).
# NOTE(review): no stop-word list is passed, so common English words that fall
# under the max_df threshold (e.g. 'about', 'above') survive this step;
# consider stop_words='english'.
vec_1 = CountVectorizer(lowercase=True, strip_accents='unicode', max_df=0.8, min_df=10)
X_1 = vec_1.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older installations.
if hasattr(vec_1, 'get_feature_names_out'):
    feature_names_1 = vec_1.get_feature_names_out()
else:
    feature_names_1 = vec_1.get_feature_names()
dtm_1 = pd.DataFrame(X_1.toarray(), columns=feature_names_1)

In [40]:
# Preview of the frequency-filtered document-term matrix.
dtm_1.head()

Unnamed: 0,000,001,005,01,02,03,04,05,06,07,...,yielding,yields,young,zeiss,zero,zhang,zone,zones,δt,μm
0,3,0,0,1,0,0,0,0,0,0,...,0,16,0,0,0,0,1,8,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,6,0,0,8,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,...,0,0,2,0,0,0,6,10,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,4,0,0,3,4,2,1,...,1,3,0,0,3,0,0,0,0,0


In [41]:
# Number of terms the vectorizer discarded because of the max_df / min_df cut-offs.
len(vec_1.stop_words_)

27697

In [42]:
# delete academic words: generic paper vocabulary that carries no topical signal
academic = ['table', 'tables', 'figure', 'figures', 'result', 'results', 'hypothesis', 'hypotheses', 'research', 'yield', 'et']
# Drop all matching columns in a single call instead of dropping in place
# while looping over the frame's own columns. errors='ignore' skips list
# entries that are not in the vocabulary.
# NOTE(review): inflected forms that are not listed (e.g. 'yields',
# 'yielding') stay in the matrix.
dtm_1 = dtm_1.drop(columns=academic, errors='ignore')

In [43]:
# delete words that contain non-letter characters (digits, underscores, ...)
# str.isalpha() is Unicode-aware, so tokens made only of non-Latin letters
# (e.g. 'δt', 'μm') are kept.
# The columns to remove are collected first and dropped in one call, so the
# frame is never mutated while its own columns are being iterated.
non_alpha = [feature for feature in dtm_1.columns if not feature.isalpha()]
dtm_1 = dtm_1.drop(columns=non_alpha)

In [44]:
# Preview after removing academic filler words and non-alphabetic tokens.
dtm_1.head()

Unnamed: 0,ability,able,abnormal,about,above,absence,absent,absolute,absorbed,absorption,...,yielding,yields,young,zeiss,zero,zhang,zone,zones,δt,μm
0,0,0,0,4,3,0,0,0,0,0,...,0,16,0,0,0,0,1,8,0,0
1,0,0,0,0,4,0,0,0,0,0,...,0,6,0,0,8,0,0,0,0,0
2,0,0,0,1,1,1,1,0,0,0,...,0,0,2,0,0,0,6,10,0,0
3,2,0,0,1,3,0,0,0,0,7,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,1,6,0,0,10,0,0,...,1,3,0,0,3,0,0,0,0,0


In [45]:
# finally, combine words in singular and plural forms
# A column named <term> + 's' is folded into <term>. All (stem, plural) pairs
# are identified before anything is modified, so the frame is not mutated while
# its columns are being walked. A column already folded away as a plural is
# never reused as a stem: the original loop dropped it and could then look it
# up again (chains such as 'x', 'xs', 'xss').
# NOTE(review): the rule is purely textual -- any pair differing by a trailing
# 's' is merged, whether or not the words are truly singular/plural.
all_terms = set(dtm_1.columns)
plural_for = {}
folded = set()
for feature in dtm_1.columns:
    plural = feature + 's'
    if feature not in folded and plural in all_terms:
        plural_for[feature] = plural
        folded.add(plural)

# Add each plural's counts to its stem, then drop all plural columns at once.
for feature, plural in plural_for.items():
    dtm_1[feature] = dtm_1[feature] + dtm_1[plural]
dtm_1 = dtm_1.drop(columns=list(folded))

In [46]:
# Preview after merging singular and plural columns.
dtm_1.head()

Unnamed: 0,ability,able,abnormal,about,above,absence,absent,absolute,absorbed,absorption,...,yet,yielding,yields,young,zeiss,zero,zhang,zone,δt,μm
0,0,0,0,4,3,0,0,0,0,0,...,0,0,16,0,0,0,0,9,0,0
1,0,0,0,0,4,0,0,0,0,0,...,0,0,6,0,0,8,0,0,0,0
2,0,0,0,1,1,1,1,0,0,0,...,0,0,0,2,0,0,0,16,0,0
3,2,0,0,1,3,0,0,0,0,7,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,1,6,0,0,10,0,0,...,2,1,3,0,0,3,0,0,0,0


(c) Present the 50 most frequently used words in the corpus in an informative way. This can include a table of results or a word cloud.

In [47]:
# Corpus-wide total for every remaining term, as (term, count) pairs.
freq = [(feature, dtm_1[feature].sum()) for feature in dtm_1]
# Stable descending sort on the count, keeping the first 50 entries.
freq_sort_50 = sorted(freq, key=lambda pair: pair[1], reverse=True)[:50]
freq_sort_50

[('model', 1864),
 ('time', 1424),
 ('cell', 1375),
 ('value', 1303),
 ('flow', 1185),
 ('system', 1130),
 ('case', 1117),
 ('set', 1081),
 ('level', 1060),
 ('soil', 981),
 ('section', 941),
 ('state', 922),
 ('energy', 917),
 ('temperature', 909),
 ('surface', 852),
 ('region', 843),
 ('field', 841),
 ('electron', 829),
 ('change', 804),
 ('condition', 798),
 ('rate', 749),
 ('will', 733),
 ('increase', 727),
 ('line', 720),
 ('node', 708),
 ('sample', 702),
 ('particle', 695),
 ('observed', 692),
 ('low', 689),
 ('effect', 680),
 ('ratio', 677),
 ('algorithm', 675),
 ('function', 669),
 ('use', 644),
 ('size', 642),
 ('wind', 641),
 ('difference', 632),
 ('since', 630),
 ('group', 618),
 ('network', 610),
 ('site', 610),
 ('method', 609),
 ('mean', 607),
 ('factor', 585),
 ('see', 582),
 ('let', 571),
 ('gene', 569),
 ('ion', 560),
 ('depth', 556),
 ('given', 551)]

(d) Fit a topic model on the corpus setting k equal to 2, 3, 5, 8, and 10. Present the topics for each value of k and interpret the topics. In your opinion, which of the selected values of k yield the most meaningful coherence for each topic?

In [48]:
# Term-document orientation: one row per term, one column per document.
tdm = dtm_1.T
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,100,101,102,103,104,105,106,107,108,109
ability,0,0,0,2,0,0,2,0,0,0,...,0,1,1,1,0,0,0,15,2,0
able,0,0,0,0,3,0,0,1,3,0,...,1,3,1,0,0,0,0,4,6,0
abnormal,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
about,4,0,1,1,1,0,10,11,0,0,...,0,10,2,5,1,5,1,4,25,0
above,3,4,1,3,6,4,4,37,1,0,...,1,10,1,1,13,2,8,4,31,0


In [49]:
# Pack the term-document counts into a SciPy CSR matrix ...
sparse_counts = scipy.sparse.csr_matrix(tdm.values)
# ... and wrap that matrix as a gensim corpus.
corpus_gen = matutils.Sparse2Corpus(sparse_counts)

In [50]:
# Map each integer term id (the column position in dtm_1) to its term string.
# Built with enumerate() so no loop counter named `id` is left behind to shadow
# the builtin id() in later cells.
id2word = dict(enumerate(dtm_1.columns))

In [51]:
# k = 2
# random_state pins the otherwise random initialisation so the fitted topics
# are reproducible when the notebook is re-run (same seed as
# compute_coherence_values).
lda = models.LdaModel(corpus=corpus_gen, id2word=id2word, num_topics=2, passes=10, random_state=100)
lda.print_topics()

[(0,
  '0.011*"model" + 0.008*"flow" + 0.007*"case" + 0.007*"set" + 0.006*"energy" + 0.006*"time" + 0.006*"value" + 0.006*"electron" + 0.005*"particle" + 0.005*"field"'),
 (1,
  '0.011*"cell" + 0.008*"soil" + 0.006*"state" + 0.006*"level" + 0.005*"node" + 0.005*"sample" + 0.005*"time" + 0.005*"system" + 0.005*"gene" + 0.004*"value"')]

topic 1: chemistry  
topic 2: biology

In [52]:
# k = 3
# random_state pins the otherwise random initialisation so the fitted topics
# are reproducible when the notebook is re-run (same seed as
# compute_coherence_values).
lda = models.LdaModel(corpus=corpus_gen, id2word=id2word, num_topics=3, passes=10, random_state=100)
lda.print_topics()

[(0,
  '0.011*"wind" + 0.011*"particle" + 0.010*"let" + 0.009*"case" + 0.009*"set" + 0.008*"element" + 0.008*"group" + 0.007*"power" + 0.006*"model" + 0.006*"theorem"'),
 (1,
  '0.010*"time" + 0.010*"state" + 0.010*"node" + 0.010*"model" + 0.010*"algorithm" + 0.008*"flow" + 0.007*"set" + 0.007*"system" + 0.006*"value" + 0.006*"module"'),
 (2,
  '0.009*"cell" + 0.007*"soil" + 0.006*"model" + 0.006*"temperature" + 0.006*"electron" + 0.005*"energy" + 0.005*"surface" + 0.005*"level" + 0.004*"value" + 0.004*"sample"')]

topic 1: physics theory  
topic 2: test  
topic 3: environment

In [53]:
# k = 5
# random_state pins the otherwise random initialisation so the fitted topics
# are reproducible when the notebook is re-run (same seed as
# compute_coherence_values).
lda = models.LdaModel(corpus=corpus_gen, id2word=id2word, num_topics=5, passes=10, random_state=100)
lda.print_topics()

[(0,
  '0.011*"surface" + 0.010*"particle" + 0.008*"layer" + 0.008*"flow" + 0.008*"feature" + 0.008*"value" + 0.007*"model" + 0.007*"image" + 0.006*"measurement" + 0.006*"sample"'),
 (1,
  '0.013*"state" + 0.013*"node" + 0.012*"algorithm" + 0.011*"time" + 0.010*"model" + 0.007*"system" + 0.007*"module" + 0.007*"network" + 0.007*"power" + 0.006*"memory"'),
 (2,
  '0.019*"set" + 0.015*"let" + 0.010*"function" + 0.009*"case" + 0.009*"theorem" + 0.009*"clusters" + 0.008*"lemma" + 0.008*"element" + 0.007*"since" + 0.007*"section"'),
 (3,
  '0.012*"model" + 0.008*"temperature" + 0.007*"value" + 0.007*"flow" + 0.007*"change" + 0.006*"rate" + 0.005*"region" + 0.005*"energy" + 0.005*"site" + 0.005*"ratio"'),
 (4,
  '0.019*"cell" + 0.011*"soil" + 0.011*"electron" + 0.008*"gene" + 0.006*"plant" + 0.006*"energy" + 0.005*"expression" + 0.005*"level" + 0.005*"ion" + 0.005*"field"')]

topic 1: experiment  
topic 2: theory  
topic 3: ?  
topic 4: variables  
topic 5: environmental science

In [54]:
# k = 8
# random_state pins the otherwise random initialisation so the fitted topics
# are reproducible when the notebook is re-run (same seed as
# compute_coherence_values).
lda = models.LdaModel(corpus=corpus_gen, id2word=id2word, num_topics=8, passes=10, random_state=100)
lda.print_topics()

[(0,
  '0.020*"soil" + 0.010*"plant" + 0.009*"patient" + 0.007*"species" + 0.007*"level" + 0.007*"root" + 0.007*"protein" + 0.006*"gene" + 0.006*"sample" + 0.006*"activity"'),
 (1,
  '0.030*"cell" + 0.012*"algorithm" + 0.009*"gene" + 0.009*"expression" + 0.008*"clusters" + 0.007*"system" + 0.007*"model" + 0.006*"size" + 0.006*"memory" + 0.006*"line"'),
 (2,
  '0.013*"sediment" + 0.009*"ratio" + 0.009*"device" + 0.008*"distribution" + 0.008*"ti" + 0.007*"energy" + 0.006*"selection" + 0.006*"heavy" + 0.006*"risk" + 0.006*"core"'),
 (3,
  '0.015*"set" + 0.012*"let" + 0.011*"flow" + 0.010*"feature" + 0.010*"function" + 0.008*"case" + 0.008*"group" + 0.008*"image" + 0.008*"theorem" + 0.007*"section"'),
 (4,
  '0.017*"electron" + 0.013*"energy" + 0.010*"wind" + 0.010*"ion" + 0.009*"region" + 0.007*"field" + 0.007*"power" + 0.007*"case" + 0.006*"speed" + 0.006*"channel"'),
 (5,
  '0.027*"node" + 0.019*"state" + 0.014*"module" + 0.014*"time" + 0.010*"network" + 0.010*"element" + 0.009*"ideal" 

topic 1: environmental science  
topic 2: genetic biology  
topic 3: ?  
topic 4: academic noun  
topic 5: renewable energy  
topic 6: ?  
topic 7: site  
topic 8: controlled variables

In [55]:
# k = 10
# random_state pins the otherwise random initialisation so the fitted topics
# are reproducible when the notebook is re-run (same seed as
# compute_coherence_values).
lda = models.LdaModel(corpus=corpus_gen, id2word=id2word, num_topics=10, passes=10, random_state=100)
lda.print_topics()

[(0,
  '0.026*"model" + 0.019*"temperature" + 0.013*"atmosphere" + 0.010*"heating" + 0.010*"line" + 0.009*"region" + 0.009*"energy" + 0.009*"rate" + 0.009*"observation" + 0.009*"solar"'),
 (1,
  '0.022*"algorithm" + 0.019*"model" + 0.018*"memory" + 0.016*"core" + 0.012*"performance" + 0.012*"element" + 0.010*"resource" + 0.010*"concentration" + 0.009*"machine" + 0.009*"trace"'),
 (2,
  '0.035*"cell" + 0.014*"gene" + 0.010*"expression" + 0.009*"patient" + 0.008*"culture" + 0.008*"human" + 0.006*"level" + 0.006*"line" + 0.005*"protein" + 0.005*"day"'),
 (3,
  '0.019*"state" + 0.019*"node" + 0.013*"time" + 0.012*"set" + 0.011*"algorithm" + 0.010*"system" + 0.009*"module" + 0.008*"network" + 0.007*"will" + 0.006*"value"'),
 (4,
  '0.032*"soil" + 0.014*"plant" + 0.011*"species" + 0.008*"root" + 0.007*"site" + 0.007*"change" + 0.007*"effect" + 0.006*"organic" + 0.006*"treatment" + 0.006*"water"'),
 (5,
  '0.035*"electron" + 0.021*"energy" + 0.017*"ion" + 0.014*"field" + 0.013*"wave" + 0.010*

topic 1: solar  
topic 2: model  
topic 3: biology    
topic 4: network  
topic 5: environmental science  
topic 6: chemistry  
topic 7: physics  
topic 8: theory  
topic 9: material science  
topic 10: material science

Overall, k = 10 yields the most coherent topics: with ten topics, each one maps onto a reasonably clear subject area.

(e) Optimize the hyperparameters of the LDA model using 10-fold cross-validation. Present the topics from the best model and explain your results. 

In [58]:
# tune alpha and beta
# gensim has no built-in cross-validation for LDA, so instead of a CV search
# both priors are set to 'auto' and estimated by the model during training.
# random_state pins the random initialisation so the result is reproducible
# (same seed as compute_coherence_values).
lda = models.LdaModel(corpus=corpus_gen, id2word=id2word, alpha='auto', eta='auto', num_topics=10, passes=10, random_state=100)
lda.print_topics()

[(0,
  '0.009*"feature" + 0.008*"model" + 0.008*"flow" + 0.007*"layer" + 0.007*"depth" + 0.006*"site" + 0.006*"device" + 0.006*"section" + 0.006*"change" + 0.006*"time"'),
 (1,
  '0.009*"surface" + 0.009*"image" + 0.008*"film" + 0.007*"group" + 0.007*"observed" + 0.007*"material" + 0.007*"solution" + 0.006*"peak" + 0.006*"value" + 0.006*"feature"'),
 (2,
  '0.032*"particle" + 0.013*"value" + 0.013*"temperature" + 0.012*"concentration" + 0.011*"sample" + 0.010*"element" + 0.008*"layer" + 0.008*"material" + 0.007*"surface" + 0.007*"size"'),
 (3,
  '0.020*"model" + 0.015*"temperature" + 0.011*"clusters" + 0.011*"line" + 0.010*"case" + 0.009*"heating" + 0.008*"coherent" + 0.008*"atmosphere" + 0.008*"profile" + 0.008*"region"'),
 (4,
  '0.030*"cell" + 0.007*"human" + 0.007*"level" + 0.007*"patient" + 0.007*"culture" + 0.007*"gene" + 0.007*"sediment" + 0.007*"expression" + 0.006*"sample" + 0.006*"day"'),
 (5,
  '0.017*"node" + 0.016*"state" + 0.015*"algorithm" + 0.013*"time" + 0.010*"network

In [60]:
# Variant with a fixed asymmetric document-topic prior and an 'auto' topic-word prior.
# random_state pins the random initialisation so the result is reproducible
# (same seed as compute_coherence_values).
lda = models.LdaModel(corpus=corpus_gen, id2word=id2word, alpha='asymmetric', eta='auto', num_topics=10, passes=10, random_state=100)
lda.print_topics()

[(0,
  '0.026*"state" + 0.019*"node" + 0.015*"algorithm" + 0.015*"time" + 0.013*"model" + 0.013*"module" + 0.013*"memory" + 0.011*"system" + 0.010*"machine" + 0.008*"will"'),
 (1,
  '0.039*"cell" + 0.017*"gene" + 0.011*"expression" + 0.009*"culture" + 0.009*"human" + 0.008*"patient" + 0.007*"plant" + 0.007*"level" + 0.006*"protein" + 0.006*"line"'),
 (2,
  '0.010*"value" + 0.009*"time" + 0.009*"cost" + 0.009*"resource" + 0.008*"model" + 0.008*"flow" + 0.008*"equation" + 0.008*"network" + 0.007*"algorithm" + 0.007*"node"'),
 (3,
  '0.012*"sediment" + 0.010*"site" + 0.009*"concentration" + 0.009*"value" + 0.008*"change" + 0.007*"core" + 0.007*"sea" + 0.007*"element" + 0.006*"section" + 0.006*"sample"'),
 (4,
  '0.032*"soil" + 0.010*"feature" + 0.008*"network" + 0.008*"image" + 0.008*"species" + 0.007*"root" + 0.007*"plant" + 0.007*"treatment" + 0.006*"system" + 0.006*"level"'),
 (5,
  '0.031*"electron" + 0.019*"energy" + 0.013*"ion" + 0.010*"surface" + 0.009*"layer" + 0.009*"signature" +

In [79]:
# supporting function
def compute_coherence_values(corpus, id2word, a, b, num_topics=10, random_state=100, passes=10):
    """Fit an LDA model with the given priors and return its u_mass coherence.

    Parameters
    ----------
    corpus
        Bag-of-words corpus used both to train the model and to score it.
    id2word
        Mapping from integer term id to term string.
    a
        Document-topic prior, passed to ``LdaMulticore`` as ``alpha``.
    b
        Topic-word prior, passed to ``LdaMulticore`` as ``eta``.
    num_topics : int, optional
        Number of topics to fit (default 10).
    random_state : int, optional
        Seed for the model, so repeated calls are comparable (default 100).
    passes : int, optional
        Number of training passes over the corpus (default 10).

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()`` for ``coherence='u_mass'``.
    """
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=random_state,
        chunksize=100,
        passes=passes,
        alpha=a,
        eta=b,
        per_word_topics=True,
    )

    # u_mass is computed from the training corpus itself, so no tokenised
    # texts are passed here.
    coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus, dictionary=id2word, coherence='u_mass')

    return coherence_model_lda.get_coherence()

In [80]:
# u_mass coherence for alpha = eta = 0.01
print(
    compute_coherence_values(
        corpus=corpus_gen,
        id2word=id2word,
        a=0.01,
        b=0.01,
    )
)

-0.8331870384097492


In [83]:
# u_mass coherence for alpha = eta = 0.1
print(
    compute_coherence_values(
        corpus=corpus_gen,
        id2word=id2word,
        a=0.1,
        b=0.1,
    )
)

-0.8297662915408827


In [84]:
# u_mass coherence for alpha = eta = 0.3
print(
    compute_coherence_values(
        corpus=corpus_gen,
        id2word=id2word,
        a=0.3,
        b=0.3,
    )
)

-0.9133419300680116


In [85]:
# u_mass coherence for alpha = eta = 0.5
print(
    compute_coherence_values(
        corpus=corpus_gen,
        id2word=id2word,
        a=0.5,
        b=0.5,
    )
)

-0.8982664297476737


In [86]:
# u_mass coherence for alpha = eta = 0.7
print(
    compute_coherence_values(
        corpus=corpus_gen,
        id2word=id2word,
        a=0.7,
        b=0.7,
    )
)

-0.8995639178131924


In [87]:
# u_mass coherence for alpha = eta = 1
print(
    compute_coherence_values(
        corpus=corpus_gen,
        id2word=id2word,
        a=1,
        b=1,
    )
)

-0.8990662658247104


To be frank, I don't think the class prepared us for this question. I had to teach myself how to do the previous few questions, but I have no idea how to do this one in Python. Even if I could use a package in R to jump straight to the answer, I wouldn't know what the result meant.