### Topic Significance Ranking
Based on: [Topic Significance Ranking of LDA Generative
Models (Alsumait et al.)](https://mimno.infosci.cornell.edu/info6150/readings/ECML09_AlSumaitetal.pdf).

Data aquisition from [this website](https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#3importnewsgroupstextdata)


In [1]:
import numpy as np
from scipy.stats import entropy
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize


def TopicSignificanceRanking(count_vector, components, documents):
    '''
    This takes the topics from an LDA model (sklearn) and assigns TSR scores to it.
    
        count_vector: the count vector of the words used in the sklearn model.
    
        components:   the topics generated by the LDA model.
        
        RETURNS TSR for each topic
    
    It is highly probable that this could be done easier than below, and indeed, highly likely.  I was doing this as I read through the paper, and therefore was thinking in terms of matching the text, not in terms of efficiency, except where blindingly obvious.
    Darin LaSota, 5/30/2018
    '''
    # Derived Quantities
    topics = components.shape[0]  # number of topics
    measures = 3
    measure = ['KL','COR','COS']
    
    # Distributions...
    # is it a word in the corpus?
    uniform_distr = np.ones(components[0,:].shape)/count_vector.shape[1]
    # is it as common in the corpus as it is in the dataset?
    vacuous_distr = np.array(np.sum(count_vector, axis=0))[0]/np.mean(np.array(np.sum(count_vector, axis=0))[0])
    # is it as common in the dataset as any document?
    bground_distr = np.ones(count_vector[:,1].shape)/count_vector.shape[0]
    
    # Construct U, V, and B for each topic k
    U = np.zeros((topics, measures))
    V = np.zeros((topics, measures))
    B = np.zeros((topics, measures))
    for k in range(topics):
        # KL = 0
        U[k, 0] = entropy(components[k,:], uniform_distr)
        V[k, 0] = entropy(components[k,:], vacuous_distr)
        B[k, 0] = np.mean(entropy(np.array(documents[:,0]), bground_distr))
        # COR
        U[k, 1] = np.correlate(components[k,:], uniform_distr)
        V[k, 1] = np.correlate(components[k,:], vacuous_distr)
        B[k, 1] = np.mean(np.correlate(documents[:,k], bground_distr[:,0], mode='valid'))
        # COS
        U[k, 2] = cosine(components[k,:].reshape(-1,1), uniform_distr.reshape(-1,1))
        V[k, 2] = cosine(components[k,:].reshape(-1,1), vacuous_distr.reshape(-1,1))
        B[k, 2] = np.mean(cosine(documents[:,k].reshape(-1,1), bground_distr.reshape(-1,1)))
    
    # 4.1 Standardization Proceedure
    # (10) and (11)
    U1 = np.zeros((topics, measures))
    V1 = np.zeros((topics, measures))
    B1 = np.zeros((topics, measures))
    U2 = np.zeros((topics, measures))
    V2 = np.zeros((topics, measures))
    B2 = np.zeros((topics, measures))
    for k in range(topics):
        for m in range(measures):
            # (10)
            U1[k,m] = U[k,m] * (1 - U[k,m]/np.sum(U[:,m], axis=0))
            V1[k,m] = V[k,m] * (1 - V[k,m]/np.sum(V[:,m], axis=0))
            B1[k,m] = B[k,m] * (1 - B[k,m]/np.sum(B[:,m], axis=0))
            # (11)
            U2[k,m] = (U[k,m] - np.min(U[:,m]))/(np.max(U[:,m]) - np.min(U[:,m]))
            V2[k,m] = (V[k,m] - np.min(V[:,m]))/(np.max(V[:,m]) - np.min(V[:,m]))
            B2[k,m] = (B[k,m] - np.min(B[:,m]))/(np.max(B[:,m]) - np.min(B[:,m]))
    
    # correction term to deal with all the same answer
    # only happens when C - Cmin = 0 since Cmax - Cmin = 0
    U2 = np.nan_to_num(U2)
    V2 = np.nan_to_num(V2)
    B2 = np.nan_to_num(B2)
    
    # 4.2 Intra-Criterion Weighted Linear Combination
    S1 = (U1 + V1 + B1)/3
    S2 = (U2 + V2 + B2)/3

    # 4.3 Inter-Criterion Weighted Combination
    '''
        no indication of the proper way to mcalculate psi in the literature, except for the line: 
        These weights are assumed to sum to 1 so that the total score remains bounded between zero and one.
        so, setting them all equal to each other... 1/(topics*measures)
    '''
    psi = np.ones((topics, measures))/(3*measures*topics)
    S = S1[:,2]*(psi[:,0] * S1[:,0] + psi[:,1]*S1[:,1])  #(13)
    Psi = psi[:,0] * S2[:,0] + psi[:,1] * S2[:,1] + psi[:,2] * S2[:,2]
        
    # 4.4 The Final Topic Significance Score
    return Psi*S


In [32]:
import re, nltk, spacy, gensim

# Data import on its own line
import pandas as pd

df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())


# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

ImportError: No module named 'gensim'

In [3]:
from sklearn.decomposition import LatentDirichletAllocation

model = LatentDirichletAllocation(n_components=40, verbose=1, max_iter=10)
documents = model.fit_transform(count_vector)
components = model.components_



iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [4]:
TSR = TopicSignificanceRanking(count_vector, components, documents)
TSR



array([ 0.00196096,  0.00469532,  0.00567782,  0.0018698 ,  0.00224714,
        0.00194482,  0.00190373,  0.00177843,  0.00181902,  0.00235754,
        0.00177843,  0.00208114,  0.00226861,  0.00177843,  0.00217533,
        0.00482521,  0.00284637,  0.00227332,  0.00194099,  0.00179318,
        0.00244928,  0.00239068,  0.11279445,  0.00179696,  0.00272435,
        0.00177843,  0.00177843,  0.00177843,  0.00217827,  0.00177843,
        0.00177843,  0.00177843,  0.00179317,  0.00192145,  0.00239094,
        0.0023009 ,  0.00177843,  0.00177843,  0.00183746,  0.00183248])

(18846, 130107)

### Validity Testing
First, test multiple runs of same number of topics (dif max_iter) then on dif topic numbers, to see if topic numbers have a greater effect on the mean than the 'actual' significance of the topics.

In [13]:
import matplotlib.pyplot as plt

X = [1+a for a in range(8)]
Y = []
Ym = {}
for x in X:
    model = LatentDirichletAllocation(n_components=40, max_iter=x)
    documents = model.fit_transform(count_vector)
    components = model.components_
    TSR = TopicSignificanceRanking(count_vector, components, documents)
    Ym[x] = np.sort(TSR)
    print(np.max(TSR), np.mean(TSR))
    Y.append(np.mean(TSR))

plt.plot(X, Y)
plt.xlabel('max_iterations')
plt.ylabel('mean TSR')
plt.show()
print(Ym)



0.1137574447 0.00499861139469


KeyboardInterrupt: 

In [12]:
documents.shape

(18846, 40)