### Topic Significance Ranking
- Based on: [Topic Significance Ranking of LDA Generative Models (Alsumait et al.)](https://mimno.infosci.cornell.edu/info6150/readings/ECML09_AlSumaitetal.pdf).
- Data aquisition from [this website](https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#3importnewsgroupstextdata) verbatum, with prints removed.


In [41]:
import numpy as np
from scipy.stats import entropy
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize


def TopicSignificanceRanking(count_vector, components, documents):
    '''
    This takes the topics from an LDA model (sklearn) and assigns TSR scores to it.
    
        count_vector: the count vector of the words used in the sklearn model.
    
        components:   the topics generated by the LDA model.
        
        RETURNS TSR for each topic
    
    It is highly probable that this could be done easier than below, and indeed, highly likely.  I was doing this as I read through the paper, and therefore was thinking in terms of matching the text, not in terms of efficiency, except where blindingly obvious.
    Darin LaSota, 5/30/2018
    '''
    # Derived Quantities
    topics = components.shape[0]  # number of topics
    measures = 3
    measure = ['KL','COR','COS']
    
    # Distributions...
    # W-Uniform is a junk topic in which all the terms of the dictionary are equally probable
    W_Uniform = np.ones(components.shape)/count_vector.shape[1]
    # As an aside, would probability that word appears in the entire corpus be useful?
    
    # the vacuous semantic distribution (WVacuous), is deﬁned to be the empirical distribution of the sample set
    W_Vacuous = components / components.sum(axis=1)[:, np.newaxis]
    
    # the background topic (D-BGround) is found equally probable in all the documents.
    D_BGround = np.ones(documents.shape)/documents.shape[0]
    
    # Construct U, V, and B for each topic k
    U = np.zeros((topics, measures))
    V = np.zeros((topics, measures))
    B = np.zeros((topics, measures))
   
    for k in range(topics):
        # KL = 0
        U[k, 0] = entropy(components[k,:], W_Uniform[k,:])
        V[k, 0] = entropy(components[k,:],  W_Vacuous[k,:])
        # Averaging distance of each document's word to D-BGround
        B[k, 0] = np.mean(entropy(np.array(documents[:,k]), D_BGround[:,k]))

        # COR
        U[k, 1] = np.correlate(components[k,:], W_Uniform[k,:])
        V[k, 1] = np.correlate(components[k,:],  W_Vacuous[k,:])
        # Averaging distance of each document's word to D-BGround
        B[k, 1] = np.mean(np.correlate(documents[:,k], D_BGround[:,k], mode='valid'))

        # COS
        U[k, 2] = cosine(components[k,:].reshape(-1,1), W_Uniform[k,:].reshape(-1,1))
        V[k, 2] = cosine(components[k,:].reshape(-1,1),  W_Vacuous[k,:].reshape(-1,1))
        # Averaging distance of each document's word to D-BGround
        B[k, 2] = np.mean(cosine(documents[:,k].reshape(-1,1),D_BGround[:,k].reshape(-1,1)))
    
    # 4.1 Standardization Proceedure
    # (10) and (11)
    U1 = np.zeros((topics, measures))
    V1 = np.zeros((topics, measures))
    B1 = np.zeros((topics, measures))
    U2 = np.zeros((topics, measures))
    V2 = np.zeros((topics, measures))
    B2 = np.zeros((topics, measures))
    for k in range(topics):
        for m in range(measures):
            # (10)
            U1[k,m] = U[k,m] * (1 - U[k,m]/np.sum(U[:,m], axis=0))
            V1[k,m] = V[k,m] * (1 - V[k,m]/np.sum(V[:,m], axis=0))
            B1[k,m] = B[k,m] * (1 - B[k,m]/np.sum(B[:,m], axis=0))
            # (11)
            U2[k,m] = (U[k,m] - np.min(U[:,m]))/(np.max(U[:,m]) - np.min(U[:,m]))
            V2[k,m] = (V[k,m] - np.min(V[:,m]))/(np.max(V[:,m]) - np.min(V[:,m]))
            B2[k,m] = (B[k,m] - np.min(B[:,m]))/(np.max(B[:,m]) - np.min(B[:,m]))
    
    # correction term to deal with all the same answer
    # only happens when C - Cmin = 0 since Cmax - Cmin = 0
    U2 = np.nan_to_num(U2)
    V2 = np.nan_to_num(V2)
    B2 = np.nan_to_num(B2)
    
    # 4.2 Intra-Criterion Weighted Linear Combination
    S1 = (U1 + V1 + B1)/3
    S2 = (U2 + V2 + B2)/3
    print(S1)
    # 4.3 Inter-Criterion Weighted Combination
    '''
        no indication of the proper way to mcalculate psi in the literature, except for the line: 
        These weights are assumed to sum to 1 so that the total score remains bounded between zero and one.
        so, setting them all equal to each other... 1/(topics*measures)
    '''
    psi = np.ones((topics, measures)) #/(3*measures*topics)
    S = S1[:,2]*(psi[:,0] * S1[:,0] + psi[:,1]*S1[:,1])  #(13)
    Psi = psi[:,0] * S2[:,0] + psi[:,1] * S2[:,1] + psi[:,2] * S2[:,2]
    
    # 4.4 The Final Topic Significance Score
    return S


In [2]:
# Data import on its own line
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
import re, nltk, gensim, spacy
from nltk.stem import WordNetLemmatizer
import pandas as pd


df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')


# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]



def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))



def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])


vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)
count_vector = data_vectorized
print(count_vector.shape)



(11314, 9751)


In [3]:
from sklearn.decomposition import LatentDirichletAllocation


model = LatentDirichletAllocation(n_components=40, verbose=1, max_iter=10)
documents = model.fit_transform(count_vector)
components = model.components_



iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [42]:
TSR = TopicSignificanceRanking(count_vector, components, documents)
TSR

[[             nan   3.34591603e+01   5.74417149e-01]
 [             nan   5.37353056e+01   5.14957269e-01]
 [             nan   4.54890736e+01   5.43129126e-01]
 [             nan   5.74552398e+01   4.89390633e-01]
 [             nan   6.46428529e+01   4.37645344e-01]
 [             nan   9.63954683e+01   5.32409917e-01]
 [             nan   7.11378810e+01   5.48403930e-01]
 [             nan   3.11593148e+01   5.47476426e-01]
 [             nan   2.80381483e+01   5.34398087e-01]
 [             nan   5.34543186e+01   5.13578470e-01]
 [             nan   8.92701513e+01   4.83968107e-01]
 [             nan   1.52621659e+01   5.63477738e-01]
 [             nan   1.00465634e+02   4.89613990e-01]
 [             nan   9.45216811e+01   5.21055735e-01]
 [             nan   5.61296589e+01   4.97313581e-01]
 [             nan   1.14031439e+02   4.35888880e-01]
 [             nan   4.12287157e+01   5.35294365e-01]
 [             nan   1.32125860e+01   5.62851895e-01]
 [             nan   3.45133

array([ nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
        nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
        nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
        nan,  nan,  nan,  nan,  nan,  nan,  nan])

In [23]:
# Derived Quantities
topics = components.shape[0]  # number of topics
measures = 3
measure = ['KL','COR','COS']

# Distributions...
# W-Uniform is a junk topic in which all the terms of the dictionary are equally probable
W_Uniform = np.ones(components.shape)/count_vector.shape[1]
# As an aside, would probability that word appears in the entire corpus be useful?

# the vacuous semantic distribution (WVacuous), is deﬁned to be the empirical distribution of the sample set
W_Vacuous = components / components.sum(axis=1)[:, np.newaxis]

# the background topic (D-BGround) is found equally probable in all the documents.
D_BGround = np.ones(count_vector.shape)/count_vector.shape[0]

# Construct U, V, and B for each topic k
U = np.zeros((topics, measures))
V = np.zeros((topics, measures))
B = np.zeros((topics, measures))

In [44]:
W_Uniform = np.ones(components.shape)/count_vector.shape[1]
W_Uniform

array([[ 0.00010255,  0.00010255,  0.00010255, ...,  0.00010255,
         0.00010255,  0.00010255],
       [ 0.00010255,  0.00010255,  0.00010255, ...,  0.00010255,
         0.00010255,  0.00010255],
       [ 0.00010255,  0.00010255,  0.00010255, ...,  0.00010255,
         0.00010255,  0.00010255],
       ..., 
       [ 0.00010255,  0.00010255,  0.00010255, ...,  0.00010255,
         0.00010255,  0.00010255],
       [ 0.00010255,  0.00010255,  0.00010255, ...,  0.00010255,
         0.00010255,  0.00010255],
       [ 0.00010255,  0.00010255,  0.00010255, ...,  0.00010255,
         0.00010255,  0.00010255]])

In [None]:
for i, topic in enumerate(TSR >= np.max(TSR)/3):
    if topic == True:
        print()
        print('TOPIC {}'.format(i))
        print(TSR[i])
        for j, val in enumerate(components[i,:] >= np.max(components[i,:])/2):
            if val == True:
                print(vectorizer.get_feature_names()[j], components[i,j])


### Validity Testing
First, test multiple runs of same number of topics (dif max_iter) then on dif topic numbers, to see if topic numbers have a greater effect on the mean than the 'actual' significance of the topics.

In [None]:
import matplotlib.pyplot as plt

X = [1+a for a in range(8)]
Y = []
Ym = {}
for x in X:
    model = LatentDirichletAllocation(n_components=40, max_iter=x)
    documents = model.fit_transform(count_vector)
    components = model.components_
    TSR = TopicSignificanceRanking(count_vector, components, documents)
    Ym[x] = np.sort(TSR)
    print(np.mean(TSR), TSR)
    Y.append(np.mean(TSR))

plt.plot(X, Y)
plt.xlabel('max_iterations')
plt.ylabel('mean TSR')
plt.show()
print(Ym)

In [None]:
documents.shape