In [1]:
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem.porter import PorterStemmer

# Allows me to speed up tagging without using pos_tag_sents
# as per https://stackoverflow.com/questions/33676526/pos-tagger-is-incredibly-slow
from nltk.tag.perceptron import PerceptronTagger

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from nltk.corpus import wordnet

import nltk

import sys
import os
import re
import math

import gensim
import gensim.corpora as corpora

NUM_CORES = 1

class LDAModelBuilder:
    """Pre-process raw text documents and train/export a GenSim LDA topic model.

    Configuration codes, as interpreted by the methods of this class:

    * ``vectorModel``: ``'B'`` binary counts, ``'T'`` TF-IDF, ``'t'`` raw term
      frequency (any other value prints a warning and falls back to raw counts).
    * ``useToken``: ``'A'`` keep everything except single non-word characters,
      ``'a'`` as ``'A'`` but strip non-word characters out of mixed tokens,
      ``'N'`` keep only tokens without non-word characters, ``'n'`` as ``'N'``
      but also drop tokens containing no ASCII letter. Any other value drops
      every token.
    * ``usePOS``: ``'A'`` all parts of speech, ``'F'`` nouns/verbs/adjectives/
      adverbs, ``'N'`` nouns and adjectives, ``'n'`` nouns only. Any other
      value drops every token.
    * ``useStemLemma``: ``'N'`` none, ``'B'`` Porter stemming, ``'L'`` WordNet
      lemmatization.
    * ``stopwordsFile``: ``None`` (no stopwords), ``'nltk'`` (NLTK English
      list) or a path to a file with one stopword per line.

    Attributes with a trailing underscore (``LDAmodel_``, ``id2word_``,
    ``_corpus_``) only exist after ``trainLDA`` succeeded (or, for the first
    two, after ``loadModel``); methods with a trailing underscore require them.
    """

    # Stemmer or lemmatizer; created lazily on first use and then stored on
    # the instance.
    _stemLemmaTool = None
    # word -> stem memo. Defined on the class, so every instance shares it.
    _stemDictionary = {}
    # POS tagger built once at class-definition time and shared by instances.
    _tagger = PerceptronTagger()

    def __init__(self, numTopics, vectorModel, alpha, useToken, usePOS, useStemLemma, stopwordsFile, outputFile, verbose=True):
        """Store the configuration and load the stopword set.

        ``outputFile`` is a path prefix; the save methods append their own
        suffixes (``.model``, ``_<topicID>.topic``, ``.dt``) to it.
        """
        # verbose must be set first: __initStopWords reads it.
        self.verbose = verbose
        self._numTopics = numTopics
        self._vectorModel = vectorModel
        self._alpha = alpha
        self._useToken = useToken
        self._usePOS = usePOS
        self._useStemLemma = useStemLemma
        self.__initStopWords(stopwordsFile)
        self._outputFile = outputFile

    def __initStopWords(self, stopwordsFile):
        """Populate ``self._stopwords`` from ``None``, ``'nltk'`` or a file path."""
        if stopwordsFile is None:
            if self.verbose:
                print("\tSelected 'none' for stopwords.")
            self._stopwords = set()
        elif stopwordsFile == 'nltk':
            if self.verbose:
                print("\tSelected NLTK stopwords.")
            self._stopwords = set(nltk_stopwords.words('english'))
        else:
            if self.verbose:
                print("\tReading custom stopwords from file.")
            # Context manager so the file handle is closed deterministically.
            with open(stopwordsFile) as stopwordsHandle:
                self._stopwords = set(line.strip().lower()
                                      for line in stopwordsHandle)

    def getStopwordSet(self):
        """Return the set of (lower-cased) stopwords in use."""
        return self._stopwords

    def __stemOrLemmatizeDocument(self, document):
        """Stem, lemmatize or pass through the token list per ``useStemLemma``.

        Raises:
            ValueError: if ``useStemLemma`` is not one of ``'N'``, ``'B'``,
                ``'L'``. Returning ``None`` here would only fail later with an
                unrelated ``TypeError`` when the caller iterates the result.
        """
        if self._useStemLemma == 'N':
            # Note: no part-of-speech filtering happens on this path.
            return document
        if self._useStemLemma == 'B':
            return self.__stemDocument(document)
        if self._useStemLemma == 'L':
            return self.__lemmatizeDoc(document)
        raise ValueError(
            "Unsupported stem/lemmatization setting given in config file.")

    def __stemDocument(self, documentTokens):
        """POS-filter the tokens, then Porter-stem the survivors (memoized)."""
        # Filter by part of speech first; the lemmatization path applies the
        # same filter while lemmatizing.
        toStem = [
            word
            for word, tag in self._tagger.tag(documentTokens)
            if self.__keepPartOfSpeech(self.__getWordnetTag(tag))
        ]

        if self._stemLemmaTool is None:
            self._stemLemmaTool = PorterStemmer()

        # Pass items through the stemmer, consulting the shared memo first.
        toReturn = []
        for word in toStem:
            if word not in self._stemDictionary:
                self._stemDictionary[word] = self._stemLemmaTool.stem(word)
            toReturn.append(self._stemDictionary[word])
        return toReturn

    def __lemmatizeDoc(self, documentTokens):
        """POS-tag the tokens and return WordNet lemmas of the kept ones."""
        if self._stemLemmaTool is None:
            self._stemLemmaTool = WordNetLemmatizer()

        toReturn = []
        for word, tag in self._tagger.tag(documentTokens):
            wntag = self.__getWordnetTag(tag)
            if not self.__keepPartOfSpeech(wntag):
                continue
            if wntag is None:
                # No WordNet equivalent for this tag: use the default POS.
                toReturn.append(self._stemLemmaTool.lemmatize(word))
            else:
                toReturn.append(self._stemLemmaTool.lemmatize(word, pos=wntag))
        return toReturn

    def __keepPartOfSpeech(self, pos):
        """Return True when a token with WordNet tag ``pos`` passes ``usePOS``."""
        if self._usePOS == 'A':
            return True
        if self._usePOS == 'F':
            return pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV)
        if self._usePOS == 'N':
            return pos in (wordnet.NOUN, wordnet.ADJ)
        if self._usePOS == 'n':
            return pos == wordnet.NOUN
        return False

    def __getWordnetTag(self, tag):
        """Map a Penn Treebank tag to a WordNet POS constant, or ``None``."""
        # Source for this method: https://stackoverflow.com/a/15590384
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return None

    def __isStopWord(self, word):
        """Return True when the lower-cased word is in the stopword set."""
        return word.lower() in self._stopwords

    def __processWordToKeep(self, word):
        """Apply the ``useToken`` filter; return the token to keep or ``None``."""
        mode = self._useToken
        hasSymbol = re.search(r'\W', word) is not None

        if mode in ('A', 'a'):
            if len(word) == 1 and hasSymbol:
                # Single non-word character
                return None
            if mode == 'A':
                return word
            # Mode 'a': strip non-word characters out of mixed tokens. A token
            # made only of such characters (e.g. '...') strips to '', which
            # must be dropped rather than kept as an empty token.
            stripped = re.sub(r'\W', '', word)
            return stripped if stripped else None

        if mode in ('N', 'n'):
            if hasSymbol:
                return None
            if mode == 'n' and not re.search(r'[a-zA-Z]', word):
                # No ASCII letter at all (e.g. a pure number)
                return None
            return word

        # Unknown mode: keep nothing.
        return None

    def preProcessDocument(self, doc):
        """Tokenize a raw document string and run the full token pipeline."""
        return self.__preProcessDocument(word_tokenize(doc))

    def getBagOfWords(self, tokens):
        """Convert tokens to a bag-of-words using the trained dictionary."""
        return self.id2word_.doc2bow(tokens)

    def __preProcessDocument(self, tokens):
        """Filter, normalise and stem/lemmatize an already tokenized document."""
        firstPass = []
        for word in tokens:
            # Tokens shorter than three characters are always dropped.
            if len(word) < 3:
                continue
            word = word.lower()
            keepWord = self.__processWordToKeep(word)
            if keepWord is not None and not self.__isStopWord(keepWord):
                firstPass.append(keepWord)
        firstPass = self.__stemOrLemmatizeDocument(firstPass)
        # Stemming/lemmatization can turn a word into a stopword, so the
        # stopword filter runs a second time.
        return [word for word in firstPass if not self.__isStopWord(word)]

    def __buildGensimCorpus(self, documents):
        """Return ``(corpus, dictionary)`` for the given raw documents."""
        if self.verbose:
            print("\tBuilding GenSim corpus...")
        processedDocuments = [self.preProcessDocument(doc) for doc in documents]

        wordIDs = corpora.Dictionary(processedDocuments)
        corpus = [wordIDs.doc2bow(text) for text in processedDocuments]

        if self._vectorModel == 'B':
            # Binary model: replace every positive frequency by 1, in place.
            for document in corpus:
                document[:] = [(wordID, 1 if freq > 0 else 0)
                               for (wordID, freq) in document]

        elif self._vectorModel == 'T':
            # TF-IDF model
            tfidf = gensim.models.TfidfModel(corpus)
            corpus = tfidf[corpus]

        # 't' is the raw term-frequency model and needs no adjustment; any
        # other value is reported and treated like 't'.
        elif not self._vectorModel == 't':
            print("Unsupported vector model passed in config file.")

        if self.verbose:
            print("\tBuilt GenSim corpus.")
        return (corpus, wordIDs)

    def __buildLDAModel(self, corpus, id2word):
        """Train and return a GenSim ``LdaMulticore`` model."""
        return gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=self._numTopics,
            alpha=self._alpha,
            workers=NUM_CORES
        )

    def trainLDA(self, documents):
        """Train the LDA model on an iterable of raw document strings.

        Returns:
            True when a model was trained; False when pre-processing left an
            empty vocabulary, in which case no model attributes are set.
        """
        trainSuccess = False
        if self.verbose:
            print("\tBuilding GenSim LDA topic model...")
        corpus, wordIDs = self.__buildGensimCorpus(documents)
        if len(wordIDs) > 0:
            self.LDAmodel_ = self.__buildLDAModel(corpus, wordIDs)
            self.id2word_ = wordIDs
            self._corpus_ = corpus
            trainSuccess = True
            if self.verbose:
                print("\tBuilt GenSim LDA topic model.")
        return trainSuccess

    def saveModel_(self):
        """Save the trained model to ``<outputFile>.model``."""
        if self.verbose:
            print("\tSaving LDA topic model...")
        # Naming note: the SKLearn-like trailing underscore marks members that
        # are only usable after training.
        self.LDAmodel_.save(self._outputFile + '.model')
        if self.verbose:
            print("\tSaved LDA topic model.")

    def loadModel(self, fromFile):
        """Load a previously saved model and its dictionary from ``fromFile``."""
        self.LDAmodel_ = gensim.models.ldamulticore.LdaMulticore.load(fromFile)
        # getTopic/getBagOfWords need the dictionary; without this a loaded
        # model could not be queried through this class.
        self.id2word_ = self.LDAmodel_.id2word

    def getTopic(self, topicID, n=10):
        """Return the top ``n`` ``(word, probability)`` pairs of a topic."""
        terms = self.LDAmodel_.get_topic_terms(topicid=topicID, topn=n)
        # Transform word IDs back to the original word
        return [(self.id2word_[wordID], prob) for (wordID, prob) in terms]

    def saveTopics_(self, n=10):
        """Write each topic's top ``n`` words to ``<outputFile>_<id>.topic``.

        Every line of a topic file is ``<word> <probability>``.
        """
        if self.verbose:
            print("\tSaving LDA topics...")
        for topicID in range(self._numTopics):
            topic = self.getTopic(topicID=topicID, n=n)
            with open(self._outputFile + '_' + str(topicID) + '.topic', 'w') as writer:
                writer.writelines(
                    word + ' ' + str(prob) + '\n' for (word, prob) in topic)
        if self.verbose:
            print("\tSaved LDA topics.")

    def generateAndSaveDocTopics_(self, fileNames):
        """Write per-document topic probabilities to ``<outputFile>.dt``.

        ``fileNames`` is paired positionally with the training corpus; each
        output line is the file name followed by one probability per topic.
        """
        if self.verbose:
            print("\tGenerating and saving document topics...")
        with open(self._outputFile + '.dt', 'w') as writer:
            # Iterate over documents / filenames simultaneously
            for document, fileName in zip(self._corpus_, fileNames):
                docTopics = self.LDAmodel_.get_document_topics(
                    document, minimum_probability=0)
                writer.write(fileName)
                writer.write(' ')
                writer.write(
                    ''.join(str(prob) + ' ' for (topicID, prob) in docTopics))
                writer.write('\n')
        # Honour the verbose flag like every other progress message.
        if self.verbose:
            print("\tGenerated and saved document topics.")


def getJaccard(s, t):
    """Return the Jaccard coefficient |S ∩ T| / |S ∪ T| of two word sets.

    ``s`` must be a set; ``t`` may be any iterable accepted by
    ``set.union`` / ``set.intersection``. When the union is empty the
    result is the integer 0.
    """
    unionSize = len(s.union(t))
    if unionSize <= 0:
        # Both inputs empty: define the coefficient as 0 instead of dividing.
        return 0
    sharedSize = len(s.intersection(t))
    return sharedSize / unionSize


def getTopicSim(t1, t2):
    """Return the Jaccard similarity between the word sets of two topics.

    ``t1`` and ``t2`` are iterables of ``(word, probability)`` pairs. The
    two topics are walked in lockstep, so only as many entries as the
    shorter topic has are taken from each; probabilities are ignored.
    """
    pairedEntries = list(zip(t1, t2))
    firstWords = {w1 for ((w1, _p1), (_w2, _p2)) in pairedEntries}
    secondWords = {w2 for ((_w1, _p1), (w2, _p2)) in pairedEntries}
    return getJaccard(firstWords, secondWords)


def getTopicSetSim(tprime, uprime):
    """Greedily match every topic in ``tprime`` to its most similar topic in ``uprime``.

    For each topic ``t`` the topic ``u`` with the highest ``getTopicSim(t, u)``
    is selected (the earliest one wins ties) and its similarity is added to a
    running sum.

    Args:
        tprime: sized collection of topics (each an iterable of
            ``(word, probability)`` pairs).
        uprime: iterable of topics in the same format.

    Returns:
        A tuple ``(coverage, simSum)``. ``coverage`` is ``None`` when the
        number of distinct selected ``uprime`` topics equals ``len(tprime)``
        (a one-to-one match); otherwise it is that number divided by
        ``len(tprime)``. ``simSum`` is the sum of the best similarities.
        With an empty ``uprime`` nothing can be selected and ``simSum`` is 0.
    """
    # Materialise once: uprime is scanned again for every t.
    candidates = list(uprime)

    selectedUvals = set()  # indices into uprime that were chosen at least once
    simSum = 0
    for t in tprime:
        sims = [getTopicSim(t, u) for u in candidates]
        if not sims:
            # Nothing to match against; contributes no similarity.
            continue
        # max() returns the first maximal index, matching a strict '>' scan.
        # Indexing by position avoids the hand-maintained counter, which
        # previously lagged one behind and recorded index 1 as index 0.
        bestTopicIndex = max(range(len(sims)), key=sims.__getitem__)
        selectedUvals.add(bestTopicIndex)
        simSum += sims[bestTopicIndex]

    if len(selectedUvals) == len(tprime):
        # Perfect match was found
        return (None, simSum)
    # Did not find a perfect match:
    # first term is number of selected topics / number of topics in T
    return (len(selectedUvals) / len(tprime), simSum)


def getWordnetTag(tag):
    """Translate a Penn Treebank POS tag into the matching WordNet constant.

    Tags starting with J, V, N or R map to ``wordnet.ADJ``, ``wordnet.VERB``,
    ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; every other tag maps
    to ``None``.
    """
    # Source for this mapping: https://stackoverflow.com/a/15590384
    prefixToWordnetName = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, wordnetName in prefixToWordnetName:
        if tag.startswith(prefix):
            return getattr(wordnet, wordnetName)
    return None

In [2]:
# Root directory of the dataset; later cells build every input path
# (metadata.csv and the pdf_json folders) by appending to this prefix.
cord19Path = './../scratch/CORD19Data/'
import pandas as pd
import numpy as np
import json
import os
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling for the whole notebook.
sns.set_style('whitegrid')
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline

In [4]:
metadata_path = cord19Path + 'metadata.csv'
metadata = pd.read_csv(metadata_path)
metadata.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263.0,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001.0,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350.0,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",Genome Biol,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506.0,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944.0,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


In [5]:
def JsonToDataFrame(filepath):
    """Read one JSON article and return it as a long-format DataFrame.

    The JSON document must provide ``paper_id``, ``abstract`` (a list of
    objects with a ``text`` field) and ``body_text`` (a list of objects with
    ``section`` and ``text`` fields).

    Args:
        filepath (str): path to the JSON file.

    Returns:
        pandas.DataFrame with columns ``paper_id``, ``section`` and ``text``.
        The first row holds the abstract (its parts joined by newlines, under
        the section name ``'abstract'``); one row per ``body_text`` entry
        follows, in file order.

    Raises:
        KeyError: if one of the required fields is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, encoding='utf-8') as json_data:
        data = json.load(json_data)

    paper_id = data['paper_id']

    # Collect plain dicts and build the frame once. Appending to a DataFrame
    # row by row copies the whole frame every time, and DataFrame.append no
    # longer exists in pandas 2.x.
    rows = [{
        'paper_id': paper_id,
        'section': 'abstract',
        'text': '\n'.join(part['text'] for part in data['abstract']),
    }]
    rows.extend(
        {
            'paper_id': paper_id,
            'section': part['section'],
            'text': part['text'],
        }
        for part in data['body_text']
    )

    return pd.DataFrame(rows, columns=['paper_id', 'section', 'text'])
    
        
# Directories holding the parsed-PDF JSON files of each subset
biorxiv_medrxiv    = cord19Path + 'biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/'
comm_use_subset    = cord19Path + 'comm_use_subset/comm_use_subset/pdf_json/'
noncomm_use_subset = cord19Path + 'noncomm_use_subset/noncomm_use_subset/pdf_json/'


def _listJsonFiles(directory):
    # Return the paths of all .json files in `directory`. os.listdir gives no
    # ordering guarantee, and later cells identify documents by position, so
    # the names are sorted to make document IDs reproducible between runs.
    # NOTE(review): artifacts written by earlier runs used the unsorted order.
    return [directory + name
            for name in sorted(os.listdir(directory))
            if name.endswith('.json')]


def _loadArticleFrames(files):
    # Parse every file and concatenate once; growing a DataFrame inside the
    # loop is quadratic and DataFrame.append was removed in pandas 2.x.
    frames = [JsonToDataFrame(f) for f in files]
    if not frames:
        # pd.concat rejects an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


biorxiv_medrxiv_files       = _listJsonFiles(biorxiv_medrxiv)
comm_use_subset_files       = _listJsonFiles(comm_use_subset)
noncomm_use_subset_files    = _listJsonFiles(noncomm_use_subset)

# read the three subsets
biomed_df      = _loadArticleFrames(biorxiv_medrxiv_files)
comm_use_df    = _loadArticleFrames(comm_use_subset_files)
noncomm_use_df = _loadArticleFrames(noncomm_use_subset_files)

# NOTE: the index is not reset here, so index labels repeat across the three
# subsets; address rows by position (.iloc) rather than by label.
full_corpus = pd.concat([biomed_df, comm_use_df, noncomm_use_df])

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the combined corpus (paper_id, section, text).
full_corpus.head()

In [None]:
# Count how many rows carry each section title.
full_corpus['section'].value_counts()

In [None]:
import re
import string

# Translation table that deletes every character of string.punctuation
punct_table = str.maketrans(dict.fromkeys(string.punctuation))

# Strip punctuation, then lower-case, in a single pass over the column
full_corpus['text'] = full_corpus['text'].map(
    lambda rawText: rawText.translate(punct_table).lower()
)

full_corpus.head()

In [None]:
len(full_corpus)  # number of rows (one per article section, each treated as a document below) is almost 500k

In [2]:
# Seek to find groups of topics that match this prompt.
# The prompt is modelled with the same LDA pipeline as the corpus documents,
# so its topics can be compared against theirs.

promptText = '''
What do we know about virus genetics, origin, and evolution? What do we know about the virus origin and management measures at the human-animal interface?
Specifically, we want to know what the literature reports about:
Real-time tracking of whole genomes and a mechanism for coordinating the rapid dissemination of that information to inform the development of diagnostics and therapeutics and to track variations of the virus over time.
Access to geographic and temporal diverse sample sets to understand geographic distribution and genomic differences, and determine whether there is more than one strain in circulation. Multi-lateral agreements such as the Nagoya Protocol could be leveraged.
Evidence that livestock could be infected (e.g., field surveillance, genetic sequencing, receptor binding) and serve as a reservoir after the epidemic appears to be over.
Evidence of whether farmers are infected, and whether farmers could have played a role in the origin.
Surveillance of mixed wildlife- livestock farms for SARS-CoV-2 and other coronaviruses in Southeast Asia.
Experimental infections to test host range for this pathogen.
Animal host(s) and any evidence of continued spill-over to humans
Socioeconomic and behavioral risk factors for this spill-over
Sustainable risk reduction strategies
'''

# trainLDA expects a collection of documents, so wrap the single prompt.
promptDocs = [promptText]

# Directory prefix under which every topic-model artifact is written
baseModelOutputPath = '../scratch/CORD19_Topic_Models/'

# Shared LDAModelBuilder configuration (codes as interpreted by that class)
ldaParams = dict(
    numTopics = 15,           # 15 topics
    vectorModel = 't',        # raw term-frequency vectors
    alpha = 5,                # document-topic prior handed to gensim as-is
                              # NOTE(review): original intent was "sharp topic
                              # detection"; confirm a large alpha achieves that
    useToken = 'n',           # strictest token filtering: no symbols, no pure numbers
    usePOS = 'N',             # keep nouns and adjectives only
    useStemLemma = 'L',       # lemmatization
    stopwordsFile = 'nltk',   # NLTK English stopword list
    outputFile = baseModelOutputPath + 'prompt_base_model',
    verbose=False
)

topicsToMatch = LDAModelBuilder(
    **ldaParams
)

# trainLDA returns False when pre-processing leaves no vocabulary; in that
# case no model exists and saveTopics_ would fail with an obscure error.
if not topicsToMatch.trainLDA(promptDocs):
    raise RuntimeError("Prompt produced an empty vocabulary; no topic model was trained.")
topicsToMatch.saveTopics_()

k = 15 # number of words per topic
promptTopicSet = []
for i in range(ldaParams['numTopics']):
    topic = topicsToMatch.getTopic(topicID=i, n=k)
    print(topic)
    promptTopicSet.append(topic)

[('virus', 0.033842884), ('evidence', 0.030650547), ('geographic', 0.025779253), ('origin', 0.020722691), ('surveillance', 0.019986538), ('risk', 0.018991755), ('host', 0.018385258), ('livestock', 0.017767552), ('farmer', 0.01750036), ('southeast', 0.017277053), ('strategy', 0.017105252), ('access', 0.016847553), ('field', 0.01659914), ('information', 0.016258717), ('range', 0.016197411)]
[('virus', 0.030424263), ('evidence', 0.025495466), ('farmer', 0.024568291), ('risk', 0.022742197), ('host', 0.021322254), ('livestock', 0.020946043), ('geographic', 0.020675456), ('origin', 0.019096063), ('sample', 0.017763996), ('surveillance', 0.017684273), ('southeast', 0.017310897), ('sustainable', 0.01693573), ('dissemination', 0.016599562), ('genetic', 0.016481493), ('access', 0.016452314)]
[('virus', 0.034175847), ('evidence', 0.028878147), ('host', 0.023692703), ('origin', 0.02229092), ('risk', 0.021507619), ('livestock', 0.021420738), ('surveillance', 0.020662595), ('farmer', 0.02055464), ('

In [None]:
from tqdm import tqdm

# Now get topics for each document: one LDA model per corpus row, kept in
# memory (keyed by row position) and with its topics written to disk.
ldaModels = dict()

numEmpty = 0      # documents whose vocabulary was empty after pre-processing
numProcessed = 0  # documents visited

minDocLength = 10  # NOTE: defined but not applied anywhere in this cell

for i, text in enumerate(full_corpus['text']):
    # Copy the shared configuration instead of overwriting its outputFile,
    # so ldaParams is not left pointing at the last document.
    docParams = dict(ldaParams,
                     outputFile=baseModelOutputPath + '_model_doc_' + str(i))
    ldaModels[i] = LDAModelBuilder(**docParams)
    if ldaModels[i].trainLDA([text]):
        # Save if train successful
        ldaModels[i].saveTopics_()
    else:
        numEmpty += 1
    numProcessed += 1
    if i % 100 == 0:
        print(i)

# Report a count, not the last index (which is one less and undefined for an
# empty corpus).
print("Processed", numProcessed, "documents.")
print("Found", numEmpty,"empty.")

In [None]:
# multiprocessing for above code
from multiprocessing import Pool, Lock, Process
from multiprocessing.sharedctypes import Array


# NOTE: the parallel path only writes topic files; ldaModels stays empty.
ldaModels = dict()

numEmpty=0

minDocLength = 10  # NOTE: defined but not applied anywhere in this cell


def processLDA(lowerIndex, upperIndex, documents, emptyCounter=None):
    """Train and save one LDA model per document of a corpus slice.

    Args:
        lowerIndex: global document ID of the first entry of ``documents``.
        upperIndex: one past the global ID of the last entry.
        documents: list of document strings for this slice.
        emptyCounter: optional ``multiprocessing.Value`` integer; when given,
            the number of documents that could not be trained is added to it
            so the parent process can report a total.

    Returns:
        The number of documents in this slice for which ``trainLDA`` returned
        False (empty vocabulary after pre-processing).
    """
    numFailed = 0
    for i, doc in zip(range(lowerIndex, upperIndex), documents):
        # Per-document copy of the configuration; the shared dict is not
        # modified.
        docParams = dict(ldaParams,
                         outputFile=baseModelOutputPath + '_model_doc_' + str(i))
        model = LDAModelBuilder(**docParams)
        if model.trainLDA([doc]):
            model.saveTopics_()
        else:
            numFailed += 1

    if emptyCounter is not None:
        with emptyCounter.get_lock():
            emptyCounter.value += numFailed
    return numFailed



if __name__ == '__main__':
    from multiprocessing import Value

    numCores = 36
    totalDocs = len(full_corpus)

    # A plain global cannot be updated from child processes, so the count of
    # empty documents travels through shared memory instead.
    emptyCounter = Value('i', 0)

    processes = []
    splits = np.array_split(full_corpus, numCores)

    lower = 0

    for i, split in enumerate(splits):
        # spawn process with docs
        docs = split.text.tolist()
        p = Process(target=processLDA,
                    args=(lower, lower+len(docs), docs, emptyCounter))
        p.start()
        print("Started process", i)
        processes.append(p)

        lower += len(docs)

    for p in processes:
        p.join()
        print("Joined process.")

    numEmpty = emptyCounter.value

    # Report the document count, not the index of the last worker.
    print("Processed", totalDocs, "documents.")
    print("Found", numEmpty,"empty.")



In [30]:
# Read in calculated topic models
from tqdm import tqdm
import os.path
from os import path

import numpy

# Upper bound on the document IDs to look for on disk; used for both the
# array size and the loop so the two cannot drift apart.
numDocs = 500000

# ldaDocumentTopics[i] becomes a list with one entry per topic; each entry is
# a list of (word, weight) tuples, empty when the topic file was unreadable.
ldaDocumentTopics = numpy.empty(numDocs, dtype=object)
print("alloc array")

numFailed = 0  # topic files that could not be opened/read

numTopics = 15

for i in range(numDocs):
    readFile = baseModelOutputPath + '_model_doc_' + str(i)

    docTopics = []
    for t in range(numTopics):
        topics = []
        try:
            with open(readFile + '_' + str(t) + '.topic', 'r') as topicFile:
                for topicLine in topicFile:
                    items = topicLine.split()
                    if len(items) < 2:
                        # Blank or malformed line: nothing to parse.
                        continue
                    topics.append((items[0], float(items[1])))
        except OSError:
            # Documents without a trained model have no topic files; record
            # the miss instead of discarding it silently.
            numFailed += 1
        docTopics.append(topics)
    ldaDocumentTopics[i] = docTopics

    if i % 100 == 0:
        print("read in ", i, "models.")

print("Could not read", numFailed, "topic files.")

alloc array
read in  0 models.
read in  100 models.
read in  200 models.
read in  300 models.
read in  400 models.
read in  500 models.
read in  600 models.
read in  700 models.
read in  800 models.
read in  900 models.
read in  1000 models.
read in  1100 models.
read in  1200 models.
read in  1300 models.
read in  1400 models.
read in  1500 models.
read in  1600 models.
read in  1700 models.
read in  1800 models.
read in  1900 models.
read in  2000 models.
read in  2100 models.
read in  2200 models.
read in  2300 models.
read in  2400 models.
read in  2500 models.
read in  2600 models.
read in  2700 models.
read in  2800 models.
read in  2900 models.
read in  3000 models.
read in  3100 models.
read in  3200 models.
read in  3300 models.
read in  3400 models.
read in  3500 models.
read in  3600 models.
read in  3700 models.
read in  3800 models.
read in  3900 models.
read in  4000 models.
read in  4100 models.
read in  4200 models.
read in  4300 models.
read in  4400 models.
read in  4

KeyboardInterrupt: 

In [6]:
# Read saved topic models from disk.
# Per the preview below, the table holds one row per
# (doc_id, topic_id, word) with that word's weight in the topic.
import pandas as pd
models = pd.read_csv('topic_models_mthread.csv')
models.head(50)

Unnamed: 0,doc_id,topic_id,word,weight
0,0,0,expert,0.048942
1,0,0,dynamic,0.046465
2,0,0,result,0.045898
3,0,0,license,0.044178
4,0,0,current,0.043589
5,0,0,preprint,0.041319
6,0,0,model,0.04043
7,0,0,authorfunder,0.031455
8,0,0,virus,0.031241
9,0,0,display,0.030412


In [9]:
# Summary statistics of the numeric columns of the topic-model table.
models.describe()

Unnamed: 0,doc_id,topic_id,weight
count,66968550.0,66968550.0,66968550.0
mean,224879.1,7.0,0.04915377
std,129717.4,4.320494,0.04923616
min,0.0,0.0,0.0
25%,112618.0,3.0,0.02631349
50%,224823.0,7.0,0.03767874
75%,337108.0,11.0,0.05726257
max,449883.0,14.0,1.0


In [None]:
# Now find the n most similar documents
k = 30  # number of words requested per document topic


def getTopicsForDocument(docID):
    """Return every topic of document ``docID`` as a list of word lists.

    Looks the document's builder up in the global ``ldaModels`` and asks it
    for the top ``k`` ``(word, probability)`` pairs of each of the
    ``ldaParams['numTopics']`` topics.
    """
    topicCount = ldaParams['numTopics']
    return [
        ldaModels[docID].getTopic(topicID=topicIndex, n=k)
        for topicIndex in range(0, topicCount)
    ]

# Score every document against the prompt topics. getTopicSetSim returns a
# (coverage, simSum) tuple; the summed similarity is the per-document score.
# Scores are collected first and written as one column afterwards: assigning
# inside the loop would target the whole column on every iteration rather
# than the current row.
aggSimScores = []
for i in range(len(full_corpus)):
    docTopics = getTopicsForDocument(i)
    _coverage, simSum = getTopicSetSim(docTopics, promptTopicSet)
    aggSimScores.append(simSum)
    if i % 1000 == 0:
        print(i)

# A list is assigned positionally, so row i receives the score of document i.
full_corpus['aggSimScore'] = aggSimScores
        


In [None]:
# Sort by highest sim scores (sort_values is ascending by default, which
# would list the least similar documents first)
full_corpus.sort_values('aggSimScore', ascending=False)

In [None]:
# new sim metric from
# Wang, Xi. (2019). Evaluating Similarity Metrics for Latent Twitter Topics. 

import gensim

# Load pre-trained word vectors stored in binary word2vec format.
# NOTE(review): the file name suggests the Google News vectors — confirm.
model = gensim.models.KeyedVectors.load_word2vec_format('./../scratch/GoogleNewsVectors.gz', binary=True)  

def we_basedSimScore(topicSet1, topicSet2):
    