In [1]:
import numpy as np
import scipy.sparse as ss

import pandas as pd

from corextopic import corextopic as ct
from corextopic import vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

import gensim
from sklearn.datasets import fetch_20newsgroups
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary

import os
import numpy as np
import pandas as pd
import pickle
import pyLDAvis
import pyLDAvis.gensim_models


from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

  from collections import Mapping, MutableMapping
  from collections import Mapping, MutableMapping
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


## Anchored Correlation Explanation: Topic Modeling with Minimal Domain Knowledge

In [2]:
import matplotlib.pyplot as plt
def describe_training_documents(list_of_docs):
    """Print corpus size and length percentiles, and plot a document-length histogram.

    A document's length is its number of whitespace-separated tokens
    (``len(doc.split())``). The median and the 95th percentile of the lengths
    are drawn as vertical lines on the current matplotlib axes.

    Parameters
    ----------
    list_of_docs : sequence of str
        Documents to summarise.

    Returns
    -------
    None
        Results are only printed and plotted.
    """
    print('There are',len(list_of_docs),'documents.')
    if len(list_of_docs) == 0:
        # Percentiles of an empty sample are undefined and cannot be converted
        # to int, so stop after reporting the count.
        return
    document_lengths = [len(doc.split()) for doc in list_of_docs]
    percentile_50, percentile_95 = (int(p) for p in np.percentile(document_lengths, [50, 95]))
    print('95% of the documents are below:',percentile_95,'words.')
    plt.axvline(percentile_50, lw=1, color='g')
    plt.axvline(percentile_95, lw=1, color='r', linestyle='--')
    # The histogram range stops 100 words past the 95th percentile, so longer
    # documents are left out of the plot.
    _ = plt.hist(document_lengths, bins=50, range=(0,percentile_95+100))
    print('Solid green line indicates median, dotted red line indicates 95 percentile. Outliers may be cropped.')

def flatten_list(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def tokenize(text):
    """Tokenize ``text`` with gensim's ``simple_preprocess`` and drop stop words.

    Returns the tokens, in their original order, that are not in gensim's
    ``STOPWORDS`` set.
    """
    kept = []
    for token in simple_preprocess(text):
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept


In [313]:
# Get 20 newsgroups data
# NOTE(review): `newsgroups` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping this fetch.
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
documents_train = list(np.load('./training/train.npy')) # historical materials 4451 documents
documents_train2 = list(np.load('./training/train2.npy'))   # census bureau 4226 documents
df_occsc = pd.read_csv('OCC_pairs.csv').rename(columns={'OCC_DES':'Full Occupation'})
# Occupation descriptions must be unique: later cells map prediction rows back
# to titles purely by position in occ_list.
assert df_occsc['Full Occupation'].nunique() == len(df_occsc)
# Keep the CSV row order. Going through set() made the order depend on string
# hash randomisation, so it could differ between kernel sessions; the assertion
# above already guarantees there are no duplicates to remove.
occ_list = list(df_occsc['Full Occupation'])

In [316]:
# Read nyt_index.csv and, for every line with more than 70 space-separated
# tokens, keep the second '###'-delimited field as one document.
with open("nyt_index.csv",'r') as f:
    lines = f.readlines()

nyt_text = [line.split('###')[1] for line in lines if len(line.split(" ")) > 70]

describe_training_documents(nyt_text)

There are 12351 documents.
95% of the documents are below: 110 words.
Solid green line indicates median, dotted red line indicates 95 percentile. Outliers may be cropped.


In [317]:
# Read nyt_text_modified.txt, treating every line (newline included) as one document.
with open("nyt_text_modified.txt",'r') as f:
    lines = f.readlines()

nyt_text2 = list(lines)

describe_training_documents(nyt_text2)


There are 4348 documents.
95% of the documents are below: 66 words.
Solid green line indicates median, dotted red line indicates 95 percentile. Outliers may be cropped.


In [318]:
# The vectorizer sees both the training documents and the occupation titles,
# but the model is fitted on the training documents only.
# The CorEx model requires both to have the same shape (same columns).
documents = [*documents_train, *documents_train2, *nyt_text2]

# Occupation titles are appended last, so they occupy the rows from
# len(documents) onward.
document_total = documents + occ_list
print(len(documents))
print("In the dataset there are", len(document_total), "textual documents")
print("And this is the first one:\n", documents[0])
print(len(occ_list))

13025
In the dataset there are 13461 textual documents
And this is the first one:
 Skip to main content Search UPLOAD SIGN UP | LOG IN BOOKS VIDEO AUDIO SOFTWARE IMAGESABOUT BLOG PROJECTS HELP DONATE  CONTACT JOBS VOLUNTEER PEOPLE Search Metadata Search text contents Search TV news captions Search radio transcripts Search archived websitesAdvanced SearchSign up for freeLog inFull text of "The practical cabinet maker and furniture designer's assistant, with essays on history of furniture, taste in design, color and materials, with full explanation of the canons of good taste in furniture .."See other formats^ 
436


The topic model assumes input is in the form of a doc-word matrix, where rows are documents and columns are binary counts. We'll vectorize the dataset, take the top 10,000 words, and convert it to a sparse matrix to save on memory usage. Note, we use binary count vectors as input to the CorEx topic model.

### Transform data into a sparse matrix

In [319]:

# Binary count vectors over the top 10,000 words, with English stop words removed.
vectorizer = CountVectorizer(binary=True, max_features=10000, stop_words='english')
# Fit on every document (training + occupation titles) and keep the result as a CSR sparse matrix.
doc_word = ss.csr_matrix(vectorizer.fit_transform(document_total))

doc_word.shape # n_docs x m_words

(13461, 10000)

In [320]:
# Get words that label the columns (needed to extract readable topics and make anchoring easier)
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back on old releases.
words = list(np.asarray(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
))

In [321]:
# Get words that label the columns (needed to extract readable topics and make anchoring easier)
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back on old releases.
all_words = list(np.asarray(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
))
# Drop vocabulary entries that consist only of digits.
not_digit_inds = [ind for ind, word in enumerate(all_words) if not word.isdigit()]
words = [all_words[ind] for ind in not_digit_inds]
# Slice only while doc_word still has one column per vectorizer feature, so that
# re-running this cell does not index an already-filtered matrix.
if doc_word.shape[1] == len(all_words):
    doc_word = doc_word[:, not_digit_inds]
assert doc_word.shape[1] == len(words), "doc_word columns and words are out of sync"

doc_word.shape

(13461, 9166)

In [323]:
# Train the CorEx topic model with 50 topics
# Only the first len(documents) rows of doc_word (the training documents) are
# used for fitting; the remaining rows are the occupation titles.
topic_model = ct.Corex(n_hidden=50, words=words, max_iter=200, verbose=False, seed=1)
topic_model.fit(doc_word[:len(documents)], words=words);



In [324]:
# Print a single topic from CorEx topic model
# Each entry in the output is a tuple whose first item is the word.
# NOTE(review): the two numbers after the word are presumably its score and
# sign — confirm against the corextopic documentation.
topic_model.get_topics(topic=1, n_words=10)

[('products', 0.1888025255654615, 1.0),
 ('value', 0.17104432833212274, 1.0),
 ('establishments', 0.159174169125996, 1.0),
 ('industry', 0.13722594964816293, 1.0),
 ('total', 0.12616774927774987, 1.0),
 ('reported', 0.10721105271683387, 1.0),
 ('statistics', 0.1003487152742275, 1.0),
 ('manufacture', 0.08186723375545649, 1.0),
 ('census', 0.04899813201153747, 1.0),
 ('figures', 0.046699065427696224, 1.0)]

In [325]:
# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Every entry starts with the word. Taking it by position (instead of
    # unpacking zip(*topic)) keeps a topic without words from raising, and
    # avoids overwriting IPython's `_` variable.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: wood,piece,inch,surface,dry,glue,mold,cut,cement,sides
1: products,value,establishments,industry,total,reported,statistics,manufacture,census,figures
2: fig,furniture,design,pieces,chairs,legs,designs,cabinet,panels,chair
3: govt,repts,com,urges,pres,ct,gen,ussr,natl,rept
4: earners,number,wage,cent,average,employed,increase,increased,proportion,largest
5: add,hot,boil,boiling,mix,vegetables,milk,cooking,varnish,pint
6: york,new,pennsylvania,jersey,north,south,ohio,illinois,state,district
7: union,local,members,international,locals,organization,convention,unions,employers,executive
8: style,century,art,taste,french,marquetry,louis,gothic,boulle,ebony
9: age,engaged,occupations,gainful,females,persons,sex,males,years,occupational
10: precinct,patrolmen,annum,assignments,duty,division,appointed,18th,patrolman,effect
11: yesterday,today,washington,national,department,street,july,association,john,aug
12: mind,life,borne,people,worker,brought,facts,influence,human,question
13: market,exc

In [326]:
# Predict topics for the rows after the training documents, i.e. the
# occupation titles that were appended to document_total.
results=topic_model.predict(doc_word[len(documents):])
results.shape

(436, 50)

In [336]:
def get_predict_result(results):
    """List the ``[row, column]`` positions of ``results`` whose value equals ``True``.

    ``results`` is indexed as a two-dimensional row/column structure; positions
    are returned in row-major order.
    """
    return [
        [row_idx, col_idx]
        for row_idx, row in enumerate(results)
        for col_idx, flag in enumerate(row)
        if flag == True
    ]

def count_topics(pairs):
    """Count how many pairs carry each topic.

    ``pairs`` is an iterable of two-item sequences; the second item is the
    topic. Returns a dict mapping topic -> number of pairs, with keys in order
    of first appearance.
    """
    counts = {}
    for _row, topic in pairs:
        counts[topic] = counts.get(topic, 0) + 1
    return counts

def get_topic_content(pairs, topic, occupations=None):
    """Return the occupation titles whose pair is assigned to ``topic``.

    Parameters
    ----------
    pairs : iterable of two-item sequences
        ``(row, topic)`` pairs, e.g. the output of ``get_predict_result``.
    topic : int
        Topic to select.
    occupations : sequence, optional
        Titles indexed by ``row``. Defaults to the notebook-level ``occ_list``,
        which keeps existing two-argument calls working.

    Returns
    -------
    list
        ``occupations[row]`` for every pair whose topic equals ``topic``, in
        the order the pairs are given.
    """
    if occupations is None:
        occupations = occ_list
    return [occupations[row] for row, assigned_topic in pairs if assigned_topic == topic]
    

### Total Correlation and Model Selection

In [288]:
# Overall total correlation (TC) of the fitted model.
topic_model.tc 

12.45757782459769

In [289]:
# Number of per-topic TC values.
# NOTE(review): the recorded output (30,) does not match n_hidden=50 used when
# topic_model is created; it appears to come from an earlier run — re-execute.
topic_model.tcs.shape

(30,)

In [290]:
# The per-topic TCs sum to the overall TC, so both lines print the same value.
print(np.sum(topic_model.tcs))
print(topic_model.tc)

12.45757782459769
12.45757782459769


In [272]:
# Selecting the number of topics: inspect the per-topic total correlation.
fig, ax = plt.subplots(figsize=(10,5))
ax.bar(range(topic_model.tcs.shape[0]), topic_model.tcs, color='#4e79a7', width=0.5)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16)
fig.savefig('Distribution of TCs for each topic', dpi=600)

### Pointwise Document TC

In [219]:
# NOTE(review): the recorded output (8677, 30) predates the current fit on
# 13025 documents with 50 topics — re-execute to refresh it.
topic_model.log_z.shape # n_docs x k_topics

(8677, 30)

In [86]:
# The pointwise total correlations in log_z represent the correlations within
# an individual document explained by a particular topic. These correlations
# have been used to measure how "surprising" documents are with respect to
# given topics.
# Averaging log_z over documents (axis 0) gives the same values as
# topic_model.tcs, as the two printed arrays show.
print(np.mean(topic_model.log_z, axis=0))
print(topic_model.tcs)

[1.34974539 0.83554806 0.70903774 0.70545575 0.68693023 0.56250323
 0.53726093 0.51397045 0.50238426 0.49800946 0.44917444 0.41939602
 0.4013495  0.37549139 0.35551906 0.3394628  0.33001975 0.31568205
 0.29639085 0.28862444 0.27634022 0.27239563 0.24001884 0.23892719
 0.23133722 0.22984241 0.22585549 0.21901736 0.21107025 0.20957803
 0.19759806 0.19475075 0.18146505 0.17683385 0.17142717 0.1658273
 0.160654   0.15995043 0.15710332 0.15364033 0.14336239 0.1335164
 0.12901163 0.12075866 0.10256559 0.10245789 0.09818597 0.09731013
 0.07496047 0.04938526]
[1.34974539 0.83554806 0.70903774 0.70545575 0.68693023 0.56250323
 0.53726093 0.51397045 0.50238426 0.49800946 0.44917444 0.41939602
 0.4013495  0.37549139 0.35551906 0.3394628  0.33001975 0.31568205
 0.29639085 0.28862444 0.27634022 0.27239563 0.24001884 0.23892719
 0.23133722 0.22984241 0.22585549 0.21901736 0.21107025 0.20957803
 0.19759806 0.19475075 0.18146505 0.17683385 0.17142717 0.1658273
 0.160654   0.15995043 0.15710332 0.15364

### Introducing Anchoring in the semi-supervised topic model

CorEx is a discriminative model, whereas LDA is a generative model. This means that while LDA outputs a probability distribution over each document, CorEx instead estimates the probability a document belongs to a topic given that document's words. As a result, the probabilities across topics for a given document do not have to add up to 1. The estimated probabilities of topics for each document can be accessed through log_p_y_given_x or p_y_given_x.

#### Hierarchical Topic Models
The labels attribute gives the binary topic expressions for each document and each topic. We can use this output as input to another CorEx topic model to get latent representations of the topics themselves. This yields a hierarchical CorEx topic model. Like the first layer of the topic model, one can determine the number of latent variables to add in higher layers through examination of the topic TCs.

Anchored CorEx is an extension of CorEx that allows the "anchoring" of words to topics. When anchoring a word to a topic, CorEx is trying to maximize the mutual information between that word and the anchored topic. So, anchoring provides a way to guide the topic model towards specific subsets of words that the user would like to explore.

1. Anchoring a single set of words to a single topic. This can help promote a topic that did not naturally emerge when running an unsupervised instance of the CorEx topic model. For example, one might anchor words like "snow," "cold," and "avalanche" to a topic if one suspects there should be a snow avalanche topic within a set of disaster relief articles.

2. Anchoring single sets of words to multiple topics. This can help find different aspects of a topic that may be discussed in several different contexts. For example, one might anchor "protest" to three topics and "riot" to three other topics to understand different framings that arise from tweets about political protests.

3. Anchoring different sets of words to multiple topics. This can help enforce topic separability if there appear to be chimera topics. For example, one might anchor "mountain," "Bernese," and "dog" to one topic and "mountain," "rocky," and "colorado" to another topic to help separate topics that merge discussion of Bernese Mountain Dogs and the Rocky Mountains.

In [331]:
# Automatically generate anchor words for a semi-supervised model.
# One approach is, for each label in a data set, to pick the words that have
# the highest mutual information with that label.
# Here we take a very simple approach instead: each topic is anchored to the
# first three words of the matching topic in `topics` (the unsupervised model).
anchor_words=[]
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    anchor_words.append(list(topic_words[:3]))

anchor_words

[['wood', 'piece', 'inch'],
 ['products', 'value', 'establishments'],
 ['fig', 'furniture', 'design'],
 ['govt', 'repts', 'com'],
 ['earners', 'number', 'wage'],
 ['add', 'hot', 'boil'],
 ['york', 'new', 'pennsylvania'],
 ['union', 'local', 'members'],
 ['style', 'century', 'art'],
 ['age', 'engaged', 'occupations'],
 ['precinct', 'patrolmen', 'annum'],
 ['yesterday', 'today', 'washington'],
 ['mind', 'life', 'borne'],
 ['market', 'exchange', 'prices'],
 ['operations', 'individual', 'avoid'],
 ['ladies', 'garment', 'shop'],
 ['work', 'good', 'time'],
 ['carved', 'carving', 'bronze'],
 ['table', 'shows', 'according'],
 ['court', 'war', 'american'],
 ['steel', 'iron', 'rolling'],
 ['long', 'high', 'like'],
 ['tho', 'materials', 'cost'],
 ['world', 'old', 'home'],
 ['water', 'turpentine', 'oil'],
 ['trade', 'present', 'physical'],
 ['law', 'act', 'decision'],
 ['goods', 'woolen', 'worsted'],
 ['nearest', 'december', 'month'],
 ['conditions', 'comparison', 'earlier'],
 ['technology', 'inst

In [332]:
# Fit an anchored CorEx model: the i-th list in anchor_words is anchored to the
# i-th topic.
# Example of hand-picked anchors that could be used instead of the generated ones:
#anchor_words = [['industry', 'manufacture','worker'], ['professional','skilled','technology'], ['politics', 'government'], ['domestic','service']]

# `words` must label every column of doc_word, otherwise anchors would be
# resolved against the wrong columns.
assert len(words) == doc_word.shape[1]
anchored_topic_model = ct.Corex(n_hidden=50, seed=2)
anchored_topic_model.fit(doc_word[:len(documents)], words=words, anchors=anchor_words, anchor_strength=6);
doc_word.shape[1]



9166

In [333]:
# Print the anchored model's topics, one per anchor list.
for n in range(len(anchor_words)):
    # Take the word from each entry by position, so a topic without words
    # prints an empty line instead of failing to unpack.
    topic_words = [entry[0] for entry in anchored_topic_model.get_topics(topic=n)]
    print('{}: '.format(n) + ','.join(topic_words))

0: wood,piece,inch,mold,sides,surface,cut,cement,edges,inches
1: establishments,products,value,industry,manufacture,statistics,manufactured,added,included,ownership
2: furniture,fig,design,designs,examples,chair,construction,figs,artistic,designed
3: govt,com,repts,urges,pres,ct,ussr,natl,conf,gen
4: number,wage,earners,employed,average,employees,year,salaried,proprietors,classifies
5: add,hot,boil,boiling,till,vegetables,let,cold,soft,milk
6: new,york,pennsylvania,jersey,ohio,massachusetts,illinois,wisconsin,city,connecticut
7: union,members,local,international,locals,membership,unions,cloakmakers,officers,strike
8: style,art,century,decoration,taste,cabinet,ivory,gothic,louis,renaissance
9: engaged,age,occupations,persons,gainful,sex,primarily,females,years,occupational
10: precinct,annum,patrolmen,assignments,duty,division,appointed,18th,patrolman,effect
11: today,yesterday,washington,ap,announced,aug,died,street,association,national
12: life,mind,borne,estimate,described,obtained,i

In [334]:
# Total correlation of the anchored model: the sum of its per-topic TCs,
# followed by the model's overall tc attribute.
print(np.sum(anchored_topic_model.tcs))
print(anchored_topic_model.tc)

58.12767619300558
58.12767619300558


In [335]:
# Predict topics for the occupation titles, i.e. the rows of doc_word after the
# training documents. len(documents) replaces the hard-coded 13025 so the slice
# stays correct if the training corpus changes.
# NOTE(review): this uses the unanchored `topic_model`, while save_topic labels
# topics with anchored_topic_model — confirm which model is intended here.
results=topic_model.predict(doc_word[len(documents):])
results.shape

(436, 50)

In [340]:
# Turn the prediction matrix into [row, topic] pairs, count the rows per topic,
# and list the occupation titles assigned to topic 17.
pairs=get_predict_result(results)
print(count_topics(pairs))
print( get_topic_content(pairs,17))

{36: 192, 30: 97, 40: 52, 11: 81, 17: 17, 16: 15, 21: 18, 45: 54, 41: 33, 25: 11, 9: 28, 0: 4, 39: 13, 13: 7, 15: 25, 37: 13, 1: 3, 33: 3, 20: 14, 2: 2, 42: 19, 14: 6, 22: 2, 26: 3, 23: 4, 27: 9, 8: 5, 6: 2, 7: 4, 48: 11, 4: 2, 34: 5, 5: 2, 32: 1, 19: 4, 24: 1, 44: 1, 12: 1, 31: 1, 29: 1}
['Trade Bankers, brokers, and money lenders Commercial brokers and commission men', 'Domestic and Personal Service Porters, except in stores Restaurant, cafe, and lunch room keepers', 'Domestic and Personal Service Porters, except in stores Porters, domestic and professional service', 'Trade Bankers, brokers, and money lenders Loan brokers and loan company officials', 'Trade Bankers, brokers, and money lenders Pawnbrokers', 'Manufacturing and Mechanical Industries Laborers, other metal industries Tinware, enamel-ware, etc., factories', 'Trade Bankers, brokers, and money lenders Brokers not specified and promoters', 'Trade Retail dealers Curios, antiques, and novelties', 'Trade Bankers, brokers, and mo

In [344]:
def get_topic_list(pairs):
    """Group the first items of ``pairs`` by their second item.

    ``pairs`` is an iterable of two-item sequences ``(row, topic)``. Returns a
    dict mapping each topic to the list of its rows, in the order encountered.
    """
    grouped = {}
    for row, topic in pairs:
        grouped.setdefault(topic, []).append(row)
    return grouped

def save_topic(pairs, model=None, path="train_result.txt"):
    """Write the occupation titles of every predicted topic to a text file.

    For each topic present in ``pairs`` the file gets one header line
    ``<topic>:<comma-separated topic words>``, then one line per occupation
    title assigned to that topic, then a ``"\\n \\n \\n"`` separator.

    Parameters
    ----------
    pairs : iterable of two-item sequences
        ``(row, topic)`` pairs; ``row`` indexes ``occ_list``.
    model : optional
        Fitted model whose ``get_topics(topic=...)`` supplies the header words.
        Defaults to the notebook-level ``anchored_topic_model``.
        NOTE(review): ``pairs`` should come from this same model, otherwise the
        header words describe a different model's topics.
    path : str, optional
        Output file; defaults to ``"train_result.txt"``.
    """
    if model is None:
        model = anchored_topic_model
    result_dic = get_topic_list(pairs)
    with open(path, 'w') as f:
        for topic_id, rows in result_dic.items():
            # Each get_topics entry starts with the word; take it by position
            # so a topic without words still produces a header line.
            topic_words = [entry[0] for entry in model.get_topics(topic=topic_id)]
            f.write(str(topic_id) + ":" + ','.join(topic_words) + '\n')
            for row in rows:
                f.write(occ_list[row] + '\n')
            f.write("\n \n \n")

save_topic(pairs)
            
