In [5]:
import requests

from math import log
from statistics import mean

# import numpy as np
import pandas as pd

import nltk
import ssl

# Make unverified HTTPS the process-wide default, presumably so nltk.download
# works on machines whose certificate store is not set up.
# NOTE(review): this turns off certificate verification for every stdlib HTTPS
# client (urllib / http.client) used later in this kernel, not just NLTK's
# downloader. Prefer fixing the local certificates when possible.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose ssl._create_unverified_context;
    # leave the default (verifying) context in place.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# NLTK data packages needed by the preprocessing code in this notebook.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
# nltk.corpus.stopwords.words('english') is read further down; without this
# package a fresh environment fails with a LookupError.
nltk.download('stopwords')

import gensim

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ethanpotthoff/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ethanpotthoff/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ethanpotthoff/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### Import and Preprocess Data

Kaggle Dataset:
https://www.kaggle.com/datasets/abisheksudarshan/topic-modeling-for-research-articles?resource=download

In [6]:
# Load the Kaggle research-articles CSV; the path is relative to the
# notebook's working directory.
df = pd.read_csv("project_data/Test.csv")

In [39]:
# English stopword list from NLTK's stopwords corpus (requires the NLTK
# 'stopwords' data package to be installed locally); read by get_tokens.
nltk_sw = nltk.corpus.stopwords.words('english')

def get_wordnet_tag(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected (case-insensitively):
    J -> ADJ, N -> NOUN, V -> VERB, R -> ADV.

    Args:
        tag: POS tag string, e.g. the second element of a pair returned by
            ``nltk.pos_tag``.

    Returns:
        The WordNet POS constant for the tag, or ``wordnet.NOUN`` when the
        first character is not one of J/N/V/R or the tag is empty.
    """
    wordnet = nltk.corpus.wordnet
    tag_map = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # tag[:1] instead of tag[0]: an empty tag falls through to the NOUN
    # default rather than raising IndexError.
    return tag_map.get(tag[:1].upper(), wordnet.NOUN)

def get_tokens(text):
    """Tokenize, POS-tag, lemmatize, and stopword-filter one document.

    Args:
        text: Document text as a string.

    Returns:
        list[str]: Lemmas in their original order, excluding any lemma that
        appears in the module-level ``nltk_sw`` list. Tokens are not
        case-folded, so the stopword comparison is case-sensitive.
    """
    # Raw string: "\w" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    tokenizer = nltk.RegexpTokenizer(r"[\w']+")
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Set membership is O(1); nltk_sw itself is a list.
    stopword_set = set(nltk_sw)

    tagged = nltk.pos_tag(tokenizer.tokenize(text))
    lemmas = (
        lemmatizer.lemmatize(word, get_wordnet_tag(tag)) for word, tag in tagged
    )
    return [lemma for lemma in lemmas if lemma not in stopword_set]

def get_corpus(docs):
    """Run ``get_tokens`` over every document.

    Args:
        docs: Object exposing ``.apply`` (the notebook passes a pandas
            Series of document strings).

    Returns:
        Whatever ``docs.apply(get_tokens)`` yields; for a Series, a Series
        of token lists with the same index.
    """
    tokenized_docs = docs.apply(get_tokens)
    return tokenized_docs

# corpus must be a sized iterable of token lists (one list of words per document)
def get_stopwords(corpus, tfidf=False, threshold=0.50):
    """Score every word in ``corpus`` and return those scoring above ``threshold``.

    Args:
        corpus: Sized iterable of documents, each an iterable of hashable
            tokens (e.g. a list of lists, or a pandas Series of lists).
        tfidf: Selects the score.
            * ``False`` (default): fraction of documents that contain the word.
            * ``True``: total occurrences divided by the number of documents
              containing the word. Despite the flag's name this is not a
              TF-IDF weight; it is always >= 1, so with the default
              threshold every word is returned.
        threshold: Words are kept only when their score is strictly greater
            than this value.

    Returns:
        pandas.Series: word -> score for the retained words, sorted by score
        in ascending order. Empty when ``corpus`` is empty.
    """
    n_docs = len(corpus)

    # word -> {document position -> occurrences of the word in that document}
    occurrences = {}
    for doc_idx, doc in enumerate(corpus):
        for word in doc:
            per_doc = occurrences.setdefault(word, {})
            per_doc[doc_idx] = per_doc.get(doc_idx, 0) + 1

    scores = {}
    for word, per_doc in occurrences.items():
        doc_freq = len(per_doc)
        if tfidf:
            # Kept in its own variable so the boolean flag is never overwritten.
            scores[word] = sum(per_doc.values()) / doc_freq
        else:
            scores[word] = doc_freq / n_docs

    # Explicit dtype keeps an empty result numeric instead of object-typed.
    s = pd.Series(scores, dtype=float)
    return s[s > threshold].sort_values()

def remove_corpus_stopwords(corpus):
    """Drop corpus-derived stopwords from every document.

    The stopwords are the index labels of ``get_stopwords(corpus)`` called
    with its default arguments.

    Args:
        corpus: Object exposing ``.apply`` whose elements are token lists
            (the notebook passes a pandas Series).

    Returns:
        Result of ``corpus.apply``: each document with the stopwords removed,
        token order preserved.
    """
    # The words are the Series' index labels; collect them once for lookup.
    blocked = frozenset(get_stopwords(corpus).index)

    def _strip(tokens):
        return [token for token in tokens if token not in blocked]

    return corpus.apply(_strip)

In [40]:
docs = df["ABSTRACT"]

# Tokenize each abstract, then strip the corpus-level stopwords.
corpus = get_corpus(docs)
corpus = remove_corpus_stopwords(corpus)

# Build the gensim vocabulary and convert every document to its
# bag-of-words representation via Dictionary.doc2bow.
dictionary = gensim.corpora.Dictionary(corpus)
word_freq = [dictionary.doc2bow(doc_tokens) for doc_tokens in corpus]

In [41]:
# Fit a 4-topic LDA model on the bag-of-words corpus; random_state is fixed
# so repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=word_freq,
    id2word=dictionary,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
    per_word_topics=True,
)

# Show each topic id followed by its 20 highest-weighted terms. A plain loop
# is used because print() is called only for its side effect; a list
# comprehension here would leave a stray [None, None, ...] as the cell output.
for topic_id, topic_terms in lda_model.print_topics(num_words=20):
    print(topic_id, '\n', topic_terms, '\n')

0 
 0.021*"hamiltonians" + 0.014*"hamiltonian" + 0.013*"1" + 0.012*"n" + 0.010*"2" + 0.009*"k" + 0.008*"manifold" + 0.008*"equation" + 0.008*"result" + 0.007*"bound" + 0.006*"function" + 0.006*"problem" + 0.006*"show" + 0.006*"give" + 0.006*"solution" + 0.006*"p" + 0.006*"prove" + 0.005*"time" + 0.005*"case" + 0.005*"group" 

1 
 0.016*"method" + 0.014*"model" + 0.009*"data" + 0.008*"sample" + 0.008*"algorithm" + 0.008*"propose" + 0.008*"network" + 0.007*"use" + 0.006*"problem" + 0.006*"base" + 0.006*"show" + 0.005*"learn" + 0.005*"result" + 0.005*"paper" + 0.005*"help" + 0.005*"performance" + 0.004*"system" + 0.004*"carlo" + 0.004*"monte" + 0.004*"time" 

2 
 0.013*"0" + 0.011*"1" + 0.010*"mass" + 0.009*"2" + 0.009*"galaxy" + 0.008*"star" + 0.006*"5" + 0.006*"3" + 0.006*"find" + 0.006*"high" + 0.005*"10" + 0.005*"observation" + 0.005*"_" + 0.005*"present" + 0.005*"stellar" + 0.004*"gas" + 0.004*"low" + 0.004*"4" + 0.004*"model" + 0.004*"cluster" 

3 
 0.013*"magnetic" + 0.012*"couple"

[None, None, None, None]

In [36]:
import operator
def get_num_topics_per_word(lda_result, num_topics):
    """Count how often each word appears across the first ``num_topics`` topics.

    Args:
        lda_result: Indexable sequence whose items hold a topic's string
            representation at position 1, e.g. the ``(topic_id, '0.021*"w" + ...')``
            pairs printed earlier in this notebook from
            ``lda_model.print_topics``.
        num_topics: Number of leading entries of ``lda_result`` to scan.
            Raises IndexError if it exceeds ``len(lda_result)``.

    Returns:
        dict[str, int]: word -> number of occurrences across the scanned
        topic strings, ordered by count descending (ties keep first-seen
        order). Tokens made up solely of digits are ignored, which drops the
        split weight fragments ("0", "021") as well as purely numeric terms.
    """
    # Raw string: "\w" inside a normal literal is an invalid escape sequence.
    # Built once rather than once per topic.
    tokenizer = nltk.RegexpTokenizer(r"[\w']+")

    word_counts = {}
    for topic in range(num_topics):
        for token in tokenizer.tokenize(lda_result[topic][1]):
            # The tokenizer only emits word characters and apostrophes, so a
            # leading '-' can never occur; isdigit() alone filters numbers.
            if token.isdigit():
                continue
            word_counts[token] = word_counts.get(token, 0) + 1

    return dict(sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True))

In [43]:
# Tally word occurrences across the 40 highest-weighted terms of each of the
# 4 topics, then display the tally as the cell output.
result = get_num_topics_per_word(
    lda_model.print_topics(num_words=40),
    4,
)
result

{'result': 4,
 'study': 3,
 'model': 3,
 'problem': 2,
 'show': 2,
 'time': 2,
 'x': 2,
 'also': 2,
 'paper': 2,
 'sample': 2,
 'use': 2,
 'help': 2,
 'system': 2,
 'two': 2,
 'high': 2,
 'present': 2,
 'state': 2,
 'hamiltonians': 1,
 'hamiltonian': 1,
 'n': 1,
 'k': 1,
 'manifold': 1,
 'equation': 1,
 'bound': 1,
 'function': 1,
 'give': 1,
 'solution': 1,
 'p': 1,
 'prove': 1,
 'case': 1,
 'group': 1,
 'space': 1,
 'generalized': 1,
 'riemann': 1,
 'number': 1,
 'metric': 1,
 'obtain': 1,
 'construction': 1,
 'non': 1,
 'g': 1,
 'point': 1,
 'q': 1,
 'r': 1,
 'u': 1,
 'modified': 1,
 'dimension': 1,
 'c': 1,
 'method': 1,
 'data': 1,
 'algorithm': 1,
 'propose': 1,
 'network': 1,
 'base': 1,
 'learn': 1,
 'performance': 1,
 'carlo': 1,
 'monte': 1,
 'test': 1,
 'idea': 1,
 'hmc': 1,
 'image': 1,
 'task': 1,
 'behind': 1,
 'achieve': 1,
 'provide': 1,
 'metropolis': 1,
 'computational': 1,
 'information': 1,
 'well': 1,
 'technique': 1,
 'neural': 1,
 'mass': 1,
 'galaxy': 1,
 'star'