In [1]:
import requests

from math import log
from statistics import mean

import numpy as np
import pandas as pd
import statistics
import itertools

import nltk
import ssl

# NOTE(review): this replaces the process-wide default HTTPS context factory with
# an unverified one, i.e. certificate verification is switched off for stdlib
# HTTPS clients from here on. It is presumably a workaround for SSL errors during
# nltk.download on some machines -- confirm it is still needed before sharing.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no unverified-context factory; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Corpora/models used further down: WordNet (lemmatiser, synsets), the POS tagger,
# and the English stopword list read via nltk.corpus.stopwords. The stopword
# corpus was previously not downloaded, which raises LookupError on a machine
# that does not already have it.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

import gensim

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andrewbrkich/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/andrewbrkich/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andrewbrkich/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Import and Preprocess Data

Kaggle Dataset:
https://www.kaggle.com/datasets/abisheksudarshan/topic-modeling-for-research-articles?resource=download

In [2]:
# Load the downloaded Kaggle CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("project_data/Test.csv")

In [3]:
# English stopword list (a list of strings) consulted by get_tokens.
# Needs NLTK's 'stopwords' corpus to be available locally.
nltk_sw = nltk.corpus.stopwords.words('english')

def get_wordnet_tag(tag):
    """Translate a POS tag (as produced by nltk.pos_tag) to a WordNet POS constant.

    Only the first character of ``tag`` is inspected, case-insensitively:
    J -> adjective, N -> noun, V -> verb, R -> adverb. Any other initial
    falls back to noun.
    """
    wordnet = nltk.corpus.wordnet
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised initial resolve to noun.
    return wordnet.NOUN

def get_tokens(text):
    """Tokenise, POS-tag, lemmatise and stopword-filter one document.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatised tokens, in document order, with every token found in the
        module-level ``nltk_sw`` list removed. Tokens are not lower-cased here,
        so the stopword match is case-sensitive.
    """
    # Raw string: "\w" inside a normal literal is an invalid escape sequence
    # (a SyntaxWarning on recent Python versions). The pattern itself is unchanged.
    tokenizer = nltk.RegexpTokenizer(r"[\w']+")
    # Build the lemmatiser once per call instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Set membership avoids scanning the whole stopword list for every token.
    stopwords = set(nltk_sw)

    tagged_tokens = nltk.pos_tag(tokenizer.tokenize(text))
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_tag(tag))
        for word, tag in tagged_tokens
    ]
    return [lemma for lemma in lemmas if lemma not in stopwords]

def get_corpus(docs):
    """Run get_tokens over every document and return the result of ``docs.apply``.

    ``docs`` must provide ``.apply`` (the notebook passes a pandas Series of
    abstract strings).
    """
    tokenized = docs.apply(get_tokens)
    return tokenized

# corpus must be a sequence of token lists (one list of words per document)
def get_stopwords(corpus, tfidf=False, threshold=.50):
    """Score every word in the corpus and return the ones scoring above a cut-off.

    Parameters
    ----------
    corpus : sequence of token lists
        One list of words per document; must support iteration and ``len()``.
    tfidf : bool, default False
        Selects the score.
        * False: fraction of documents that contain the word.
        * True: total occurrences divided by the number of documents that
          contain the word (mean count per containing document). Note this is
          not the classic tf-idf weighting, despite the flag's name.
    threshold : float, default 0.50
        Words whose score is strictly greater than this value are returned.

    Returns
    -------
    pandas.Series
        Scores indexed by word, sorted ascending.
    """
    # word -> {document position -> occurrences in that document}
    doc_counts = {}
    for doc_idx, doc in enumerate(corpus):
        for word in doc:
            per_doc = doc_counts.setdefault(word, {})
            per_doc[doc_idx] = per_doc.get(doc_idx, 0) + 1

    n_docs = len(corpus)
    scores = {}
    for word, per_doc in doc_counts.items():
        # The score gets its own name: the original assigned it back to `tfidf`,
        # replacing the boolean flag with a float in the middle of the loop.
        if tfidf:
            scores[word] = sum(per_doc.values()) / len(per_doc)
        else:
            scores[word] = len(per_doc) / n_docs

    # Explicit dtype keeps an empty corpus from producing an object-dtype Series.
    s = pd.Series(scores, dtype=float)
    return s[s > threshold].sort_values()

def remove_corpus_stopwords(corpus):
    """Strip the words reported by get_stopwords(corpus) from every document.

    ``corpus`` must provide ``.apply`` (the notebook passes a pandas Series of
    token lists); the value returned is whatever ``corpus.apply`` yields.
    """
    sw = get_stopwords(corpus)

    def keep_content_words(tokens):
        # get_stopwords returns a Series indexed by word, and `in` on a Series
        # tests its index, so this is a membership check on the stopwords.
        return [word for word in tokens if word not in sw]

    return corpus.apply(keep_content_words)

In [4]:
# Tokenise the abstracts, drop corpus-wide stopwords, then build the gensim
# dictionary and the per-document bag-of-words vectors.
docs = df["ABSTRACT"]
corpus = remove_corpus_stopwords(
    get_corpus(docs)
)
dictionary = gensim.corpora.Dictionary(corpus)
word_freq = [dictionary.doc2bow(doc_tokens) for doc_tokens in corpus]

In [5]:
# Fit a 4-topic LDA model on the bag-of-words corpus; random_state is fixed so
# repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=word_freq,
                                            id2word=dictionary,
                                            num_topics=4,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto",
                                            per_word_topics=True)

# Plain loop rather than a list comprehension run only for its side effects:
# the comprehension made the cell echo a useless [None, None, None, None].
for topic_id, topic_terms in lda_model.print_topics(num_words=20):
    print(topic_id, '\n', topic_terms, '\n')

0 
 0.021*"hamiltonians" + 0.014*"hamiltonian" + 0.013*"1" + 0.012*"n" + 0.010*"2" + 0.009*"k" + 0.008*"manifold" + 0.008*"equation" + 0.008*"result" + 0.007*"bound" + 0.006*"function" + 0.006*"problem" + 0.006*"show" + 0.006*"give" + 0.006*"solution" + 0.006*"p" + 0.006*"prove" + 0.005*"time" + 0.005*"case" + 0.005*"group" 

1 
 0.016*"method" + 0.014*"model" + 0.009*"data" + 0.008*"sample" + 0.008*"algorithm" + 0.008*"propose" + 0.008*"network" + 0.007*"use" + 0.006*"problem" + 0.006*"base" + 0.006*"show" + 0.005*"learn" + 0.005*"result" + 0.005*"paper" + 0.005*"help" + 0.005*"performance" + 0.004*"system" + 0.004*"carlo" + 0.004*"monte" + 0.004*"time" 

2 
 0.013*"0" + 0.011*"1" + 0.010*"mass" + 0.009*"2" + 0.009*"galaxy" + 0.008*"star" + 0.006*"5" + 0.006*"3" + 0.006*"find" + 0.006*"high" + 0.005*"10" + 0.005*"observation" + 0.005*"_" + 0.005*"present" + 0.005*"stellar" + 0.004*"gas" + 0.004*"low" + 0.004*"4" + 0.004*"model" + 0.004*"cluster" 

3 
 0.013*"magnetic" + 0.012*"couple"

[None, None, None, None]

In [6]:
# helper function to convert topic word probabilities tuple into a dictionary for easy indexing
def tuples_to_dict(list_of_tuples):
    """Build a dict from an iterable of (key, value) pairs.

    Insertion order follows the input; when a key repeats, the last value wins.
    """
    return {key: value for key, value in list_of_tuples}

In [7]:
def topic_word_statistics(lda_topic_results):
    """Summarise how each word's probability is spread across the topics.

    Parameters
    ----------
    lda_topic_results : iterable
        One item per topic, shaped like ``(topic_id, [(word, probability), ...])``
        (e.g. ``lda_model.show_topics(formatted=False)``). Only ``item[1]`` is read.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with columns:
        * ``mean``       - mean probability over all topics, counting 0 for
                           topics whose list does not contain the word;
        * ``stdev``      - sample standard deviation of those same values
                           (NaN when there is only one topic);
        * ``num_topics`` - number of topics in which the probability is non-zero.
        Rows are sorted by ``num_topics`` descending; ties are sorted with a
        stable algorithm so the result is deterministic.
    """
    # One {word: probability} mapping per topic.
    topic_word_probs = [dict(topic[1]) for topic in lda_topic_results]

    # Sorted list of every distinct word seen in any topic.
    all_words = sorted(set().union(*topic_word_probs))

    records = []
    for word in all_words:
        vals = [float(topic_dict.get(word, 0.0)) for topic_dict in topic_word_probs]
        records.append({
            'mean': statistics.mean(vals),
            # statistics.stdev needs at least two data points.
            'stdev': statistics.stdev(vals) if len(vals) > 1 else float('nan'),
            'num_topics': int(np.count_nonzero(vals)),
        })

    # Build the frame in one go: numeric dtypes instead of row-by-row object cells.
    stats = pd.DataFrame(records, index=all_words,
                         columns=['mean', 'stdev', 'num_topics'])
    return stats.sort_values(by=['num_topics'], ascending=False, kind='mergesort')

In [8]:
# TODO: confirm the intended definition of mean and stdev. They are currently computed
# from each word's LDA topic-word probabilities across topics.
topic_word_statistics(lda_model.show_topics(num_words=25, formatted=False))

Unnamed: 0,mean,stdev,num_topics
result,0.004872,0.003421,3.0
time,0.002294,0.002686,2.0
model,0.004539,0.006603,2.0
problem,0.003108,0.003589,2.0
show,0.002984,0.003452,2.0
...,...,...,...
give,0.001534,0.003068,1.0
generalized,0.001187,0.002375,1.0
gas,0.001086,0.002171,1.0
galaxy,0.002145,0.004291,1.0


In [None]:
# insert code for removing values under a certain mean/stdev threshold

In [9]:
# Method 1 - topic cohesiveness using WordNet path_similarity

def method_1(lda_topic_results):
    """Score each topic's cohesiveness with WordNet path similarity.

    For every pair of words in a topic, the path similarity of every
    (sense of word 1, sense of word 2) combination is computed and the smallest
    value is kept. A topic's score is the mean of those per-pair minima.

    Parameters
    ----------
    lda_topic_results : iterable
        One item per topic, shaped like ``(topic_id, [(word, probability), ...])``
        (e.g. ``lda_model.show_topics(formatted=False)``). Only the words in
        ``item[1]`` are used.

    Returns
    -------
    dict
        ``"Topic <position>"`` -> mean of the per-pair minimum similarities, or
        NaN when no word pair in the topic had any comparable senses.

    Side effects
    ------------
    Prints the sorted list of words for which WordNet returned no synsets.
    """
    result = {}
    words_without_senses = set()
    synsets_by_word = {}

    def lookup_senses(word):
        # Every word takes part in many pairs, so fetch its synsets only once.
        if word not in synsets_by_word:
            synsets_by_word[word] = nltk.corpus.wordnet.synsets(str(word))
            if not synsets_by_word[word]:
                words_without_senses.add(word)
        return synsets_by_word[word]

    # calculate cohesiveness score for each topic
    for topic, topic_entry in enumerate(lda_topic_results):
        # dict() de-duplicates words while keeping their original order.
        topic_words = list(dict(topic_entry[1]))
        pair_minima = []

        for word1, word2 in itertools.combinations(topic_words, 2):
            word1_senses = lookup_senses(word1)
            word2_senses = lookup_senses(word2)

            sense_similarities = []
            for syn1 in word1_senses:
                for syn2 in word2_senses:
                    similarity = syn1.path_similarity(syn2)
                    # path_similarity may return None when no path connects the
                    # two senses; None cannot be compared by min().
                    if similarity is not None:
                        sense_similarities.append(similarity)
            if sense_similarities:
                pair_minima.append(min(sense_similarities))

        # statistics.mean raises on an empty list; report NaN for such topics.
        result["Topic " + str(topic)] = (
            statistics.mean(pair_minima) if pair_minima else float("nan")
        )

    print("Topic words omitted because no WordNet sense was found:")
    print(sorted(words_without_senses))
    return result
        
        

In [10]:
method_1(lda_model.show_topics(num_words=25, formatted=False))

Topic words omitted because no WordNet sense was found:
['_', 'carlo', 'crystallographic', 'hamiltonian', 'hamiltonians', 'hmc', 'magnetocrystalline', 'skx']


{'Topic 0': 0.06164714244613044,
 'Topic 1': 0.06368722392688668,
 'Topic 2': 0.06754333609100402,
 'Topic 3': 0.07356070916606322}

In [112]:
# Method 2 - reduce each topic's words to a shared WordNet ancestor (lowest common hypernym)
# Not done yet: trying combinations/permutations of all possible senses (only the first sense of each word is used)

def method_2(lda_topic_results):
    """Collapse each topic's words to a shared WordNet ancestor, printing each step.

    Starting from the first synset of every topic word, adjacent entries are
    replaced by their lowest common hypernyms. Duplicates and empty results are
    dropped, and the process repeats until at most one candidate remains.

    Parameters
    ----------
    lda_topic_results : iterable
        One item per topic, shaped like ``(topic_id, [(word, probability), ...])``
        (e.g. ``lda_model.show_topics(formatted=False)``). Only the words in
        ``item[1]`` are used.

    Returns
    -------
    dict
        ``"Topic <position>"`` -> the last remaining list of synsets, or ``[]``
        when every candidate was filtered out. For a topic with a single word
        this is simply that word's synsets.

    Side effects
    ------------
    Prints the topic position and, for every reduction round, the synsets looked
    up, the raw ancestors and the filtered list carried into the next round.
    """
    roots = {}

    for topic, topic_entry in enumerate(lda_topic_results):
        print("topic: ", topic)
        topic_words = dict(topic_entry[1]).keys()

        # get all the senses of each word
        word_senses = [nltk.corpus.wordnet.synsets(str(word)) for word in topic_words]

        # `> 1` rather than `!= 1`: the candidate list can shrink to zero entries
        # (e.g. no word has a synset, or no pair shares a hypernym), and with
        # `!= 1` an empty list would keep the loop running forever.
        while len(word_senses) > 1:
            # Use the first sense of each entry; entries without any synset are skipped.
            # TODO: consider combinations/permutations of all senses.
            words_to_look_up = [synsets[0] for synsets in word_senses if synsets]
            print("Words to look up: ", words_to_look_up)

            # Lowest common hypernyms of each pair of neighbouring synsets.
            ancestors = [
                current.lowest_common_hypernyms(following)
                for current, following in zip(words_to_look_up, words_to_look_up[1:])
            ]
            print("Ancestors: ", ancestors)

            # Keep unique entries (lists are unhashable, hence no set) and drop blanks.
            unique_ancestors = []
            for item in ancestors:
                if item not in unique_ancestors:
                    unique_ancestors.append(item)
            res = list(filter(None, unique_ancestors))
            print("Filtered list to use next iteration: ", res)

            # Each round yields fewer entries than it consumed, so this terminates.
            word_senses = res

        roots["Topic " + str(topic)] = word_senses[0] if word_senses else []

    return roots


        

In [113]:
method_2(lda_model.show_topics(num_words=25, formatted=False))

topic:  0
Words to look up:  [Synset('one.n.01'), Synset('nitrogen.n.01'), Synset('two.n.01'), Synset('kelvin.n.01'), Synset('manifold.n.01'), Synset('equation.n.01'), Synset('consequence.n.01'), Synset('boundary.n.02'), Synset('function.n.01'), Synset('problem.n.01'), Synset('show.n.01'), Synset('give.n.01'), Synset('solution.n.01'), Synset('phosphorus.n.01'), Synset('prove.v.01'), Synset('time.n.01'), Synset('case.n.01'), Synset('group.n.01'), Synset('space.n.01'), Synset('generalize.v.01'), Synset('riemann.n.01'), Synset('number.n.01'), Synset('ten.n.01')]
Ancestors:  [[Synset('abstraction.n.06')], [Synset('abstraction.n.06')], [Synset('definite_quantity.n.01')], [Synset('entity.n.01')], [Synset('entity.n.01')], [Synset('entity.n.01')], [Synset('entity.n.01')], [Synset('abstraction.n.06')], [Synset('abstraction.n.06')], [Synset('abstraction.n.06')], [Synset('abstraction.n.06')], [Synset('abstraction.n.06')], [Synset('substance.n.01')], [], [], [Synset('case.n.01')], [Synset('abstrac