### Topic Significance Ranking
- Based on: [Topic Significance Ranking of LDA Generative Models (Alsumait et al.)](https://mimno.infosci.cornell.edu/info6150/readings/ECML09_AlSumaitetal.pdf).
- Data aquisition from [this website](https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#3importnewsgroupstextdata) verbatum, with prints removed.


In [1]:
import numpy as np
from scipy.stats import entropy
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize


def TopicSignificanceRanking(count_vector, components, documents):
    '''
    This takes the topics from an LDA model (sklearn) and assigns TSR scores to it.
    
        count_vector: the count vector of the words used in the sklearn model.
    
        components:   the topics generated by the LDA model.
        
        RETURNS TSR for each topic
    
    It is highly probable that this could be done easier than below, and indeed, highly likely.  I was doing this as I read through the paper, and therefore was thinking in terms of matching the text, not in terms of efficiency, except where blindingly obvious.
    Darin LaSota, 5/30/2018
    '''
    # Derived Quantities
    topics = components.shape[0]  # number of topics
    measures = 3
    measure = ['KL','COR','COS']
    
    # Distributions...
    # is it a word in the corpus?
    uniform_distr = np.ones(components[0,:].shape)/count_vector.shape[1]
    # is it as common in the corpus as it is in the dataset?
    vacuous_distr = np.array(np.sum(count_vector, axis=0))[0]/np.mean(np.array(np.sum(count_vector, axis=0))[0])
    # is it as common in the dataset as any document?
    bground_distr = np.ones(count_vector[:,1].shape)/count_vector.shape[0]
    
    # Construct U, V, and B for each topic k
    U = np.zeros((topics, measures))
    V = np.zeros((topics, measures))
    B = np.zeros((topics, measures))
    for k in range(topics):
        # KL = 0
        U[k, 0] = entropy(components[k,:], uniform_distr)
        V[k, 0] = entropy(components[k,:], vacuous_distr)
        B[k, 0] = np.mean(entropy(np.array(documents[:,0]), bground_distr))
        # COR
        U[k, 1] = np.correlate(components[k,:], uniform_distr)
        V[k, 1] = np.correlate(components[k,:], vacuous_distr)
        B[k, 1] = np.mean(np.correlate(documents[:,k], bground_distr[:,0], mode='valid'))
        # COS
        U[k, 2] = cosine(components[k,:].reshape(-1,1), uniform_distr.reshape(-1,1))
        V[k, 2] = cosine(components[k,:].reshape(-1,1), vacuous_distr.reshape(-1,1))
        B[k, 2] = np.mean(cosine(documents[:,k].reshape(-1,1), bground_distr.reshape(-1,1)))
    
    # 4.1 Standardization Proceedure
    # (10) and (11)
    U1 = np.zeros((topics, measures))
    V1 = np.zeros((topics, measures))
    B1 = np.zeros((topics, measures))
    U2 = np.zeros((topics, measures))
    V2 = np.zeros((topics, measures))
    B2 = np.zeros((topics, measures))
    for k in range(topics):
        for m in range(measures):
            # (10)
            U1[k,m] = U[k,m] * (1 - U[k,m]/np.sum(U[:,m], axis=0))
            V1[k,m] = V[k,m] * (1 - V[k,m]/np.sum(V[:,m], axis=0))
            B1[k,m] = B[k,m] * (1 - B[k,m]/np.sum(B[:,m], axis=0))
            # (11)
            U2[k,m] = (U[k,m] - np.min(U[:,m]))/(np.max(U[:,m]) - np.min(U[:,m]))
            V2[k,m] = (V[k,m] - np.min(V[:,m]))/(np.max(V[:,m]) - np.min(V[:,m]))
            B2[k,m] = (B[k,m] - np.min(B[:,m]))/(np.max(B[:,m]) - np.min(B[:,m]))
    
    # correction term to deal with all the same answer
    # only happens when C - Cmin = 0 since Cmax - Cmin = 0
    U2 = np.nan_to_num(U2)
    V2 = np.nan_to_num(V2)
    B2 = np.nan_to_num(B2)
    
    # 4.2 Intra-Criterion Weighted Linear Combination
    S1 = (U1 + V1 + B1)/3
    S2 = (U2 + V2 + B2)/3

    # 4.3 Inter-Criterion Weighted Combination
    '''
        no indication of the proper way to mcalculate psi in the literature, except for the line: 
        These weights are assumed to sum to 1 so that the total score remains bounded between zero and one.
        so, setting them all equal to each other... 1/(topics*measures)
    '''
    psi = np.ones((topics, measures))/(3*measures*topics)
    S = S1[:,2]*(psi[:,0] * S1[:,0] + psi[:,1]*S1[:,1])  #(13)
    Psi = psi[:,0] * S2[:,0] + psi[:,1] * S2[:,1] + psi[:,2] * S2[:,2]
        
    # 4.4 The Final Topic Significance Score
    return Psi*S


In [3]:
# Data import on its own line
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
import re, nltk, gensim, spacy
from nltk.stem import WordNetLemmatizer
import pandas as pd


df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')


# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]



def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))



def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])


vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)
count_vector = data_vectorized
print(count_vector.shape)

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']
ding!


In [59]:
from sklearn.decomposition import LatentDirichletAllocation


model = LatentDirichletAllocation(n_components=40, verbose=1, max_iter=10)
documents = model.fit_transform(count_vector)
components = model.components_



iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [60]:
TSR = TopicSignificanceRanking(count_vector, components, documents)
TSR

array([  2.63933507e-01,   1.33533406e-02,   6.10983757e-01,
         7.02936283e-01,   2.46915124e-01,   6.70662010e-01,
         1.09142404e-01,   4.68498627e-01,   5.28903571e-01,
         5.21815685e-01,   7.51324916e-01,   1.91439730e+00,
         5.11066012e-01,   2.34178734e-01,   3.54678009e-02,
         3.63118915e-02,   1.30741900e-01,   1.74564463e-02,
         1.25402261e-01,   8.12203702e-01,   9.53552281e-01,
         7.82635785e-02,   7.39812345e-03,   3.57561425e-02,
         3.23946929e-01,   6.05679727e-02,   1.67509197e-01,
         4.70191340e+00,   8.37923752e-03,   6.51108086e-02,
         1.26798709e+00,   8.06324794e-01,   9.69394624e-02,
         2.68846611e-01,   6.52305982e-03,   7.61508645e-03,
         2.42361176e-01,   2.34039220e+00,   1.43727585e-01,
         1.70499331e-03])

In [69]:
for i, topic in enumerate(TSR >= np.max(TSR)/3):
    if topic == True:
        print()
        print('TOPIC {}'.format(i))
        print(TSR[i])
        for j, val in enumerate(components[i,:] >= np.max(components[i,:])/2):
            if val == True:
                print(vectorizer.get_feature_names()[j], components[i,j])
    


TOPIC 11
1.91439730422
article 1769.06590181
good 1999.86240061
just 2259.86510284
know 2005.70989528
line 1438.34885061
make 1617.74722005
organization 1388.61242396
people 1282.41939792
problem 1245.40571663
say 1358.32480497
subject 1403.78671548
thing 1293.70938044
think 1715.40303975
time 1573.80259998
work 1264.17531147
write 2077.16930039

TOPIC 27
4.70191340046
article 3041.74114248
host 4085.2561238
line 5910.28352645
nntp 4058.46052453
organization 5588.77565848
post 5132.15069163
subject 5569.37318335
university 2976.03924753
write 3302.90625656

TOPIC 37
2.34039220363
believe 1598.32551594
christian 1626.79884448
god 2649.40885783
know 1708.16919911
people 2104.7721552
say 2942.33032433
think 1767.30105075


### Validity Testing
First, test multiple runs of same number of topics (dif max_iter) then on dif topic numbers, to see if topic numbers have a greater effect on the mean than the 'actual' significance of the topics.

In [None]:
import matplotlib.pyplot as plt

X = [1+a for a in range(8)]
Y = []
Ym = {}
for x in X:
    model = LatentDirichletAllocation(n_components=40, max_iter=x)
    documents = model.fit_transform(count_vector)
    components = model.components_
    TSR = TopicSignificanceRanking(count_vector, components, documents)
    Ym[x] = np.sort(TSR)
    print(np.mean(TSR), TSR)
    Y.append(np.mean(TSR))

plt.plot(X, Y)
plt.xlabel('max_iterations')
plt.ylabel('mean TSR')
plt.show()
print(Ym)



In [None]:
documents.shape