In [63]:
import os
import pickle
import re
import numpy as np
import pandas as pd
import src.resources.utils.utils as ut

In [64]:
# Names shared between the cells of this notebook.
# NOTE: a `global` statement at module (cell) level has no effect in Python;
# it is kept purely as a declaration of which names are treated as shared state.
global out_dir_root, data_root_dir, top_nterms_from_topic, section_name

In [72]:
# --- Run configuration ---------------------------------------------------
# Root folder of the input data, and the project's 'out' folder from which
# model results are read.
data_root_dir = ut.getPath('ROOT')
res_root_dir = os.path.join(ut.get_project_root(), 'out')
# File name of the pickled data dictionary (read by loadDataDict).
f_data_dict_name = ut.getPath('CORPUS_NAME')

# Key used to pick one section out of the data dictionary.
section_name = 'am-i-infected-what-do-i-do'
# Number of top terms kept per topic; a falsy value makes the driver cell
# fall back to loading all terms from the stored model.
top_nterms_from_topic = 20

# Metric / LDA-variant combinations the driver cell iterates over.
# NOTE(review): 'LDA3' is listed here although loadCorpus still carries a
# "TODO: Add support for LDA3" -- confirm it is meant to be evaluated.
evaluate_ldas = ['LDA1', 'LDA2', 'LDA3']
evaluate_metrics = ['PERPLEXITY', 'COHERENCE', 'INTERCORPUSRATIO', 'AGGREGATED']

In [65]:
def loadDataDict():
    """
    Read the pickled data dictionary from the post-processed data folder.

    The file is ``<data_root_dir>/post-processed-data/<f_data_dict_name>``;
    both names are module-level settings of this notebook.

    Returns
    -------
    object
        Whatever object was pickled into that file.
    """
    pkl_path = os.path.join(data_root_dir, 'post-processed-data', f_data_dict_name)
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    with open(pkl_path, 'rb') as handle:
        return pickle.load(handle)

In [66]:
def loadCorpus(lda_type, f_model):
    """
    Load the pickled documents/corpus that belong to one LDA variant.

    Parameters
    ----------
    lda_type : str
        Name of the LDA variant (e.g. 'LDA1', 'LDA2'). It selects the
        sub-folder ``<data_root_dir>/documents/<lda_type>`` and the file
        name prefix.
    f_model : str
        Path of the model file; only checked for existence here.

    Returns
    -------
    object
        For 'LDA1': whatever was pickled in ``LDA1_documents.pkl``
        (callers unpack it as three values).
        Otherwise: a tuple ``(documents, corpus, None)`` built by merging the
        ``*_log_documents.pkl`` and ``*_text_documents.pkl`` files.

    Raises
    ------
    FileNotFoundError
        If ``f_model`` does not exist, or a documents pickle is missing.
    """
    # TODO: Add support for LDA3
    if not os.path.isfile(f_model):
        raise FileNotFoundError('Fail to find document file %s' % f_model)

    # Use the function argument, not the notebook-global loop variable `lda`,
    # so the function also works when called outside the driver loop.
    doc_dir = os.path.join(data_root_dir, 'documents', lda_type)

    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    if lda_type == 'LDA1':
        f = os.path.join(doc_dir, '{}_documents.pkl'.format(lda_type))
        with open(f, 'rb') as handle:
            return pickle.load(handle)

    # Load documents for both log and text. Only the file name is formatted,
    # so braces in the directory path cannot break str.format.
    loaded = {}
    for kind in ('log', 'text'):
        f = os.path.join(doc_dir, '{}_{}_documents.pkl'.format(lda_type, kind))
        with open(f, 'rb') as handle:
            loaded[kind] = pickle.load(handle)
    log_documents, log_corpus, log_dictionary = loaded['log']
    text_documents, text_corpus, _ = loaded['text']

    # Merge log/text documents pairwise.
    documents = [x + y for x, y in zip(log_documents, text_documents)]

    # Append the text terms to the log corpus rows, shifting the text term ids
    # past the log vocabulary so the two id ranges do not collide.
    corpus = log_corpus
    log_num_terms = len(log_dictionary)
    for idx, text_corpus_row in enumerate(text_corpus):
        corpus[idx] += [(tp[0] + log_num_terms, tp[1]) for tp in text_corpus_row]

    # Do not need dictionary, it can be captured from model.id2word
    return documents, corpus, None

In [67]:
def loadModel(f_model, corpus):
    """
    Load a pickled LDA model and find the dominant topic of every document.

    Parameters
    ----------
    f_model : str
        Path of the pickled model. The model must support ``model[corpus]``
        (yielding, per document, a sequence of ``(topic_id, weight)`` pairs)
        and ``model.show_topic(topic_id, topn=...)``.
        NOTE(review): presumably a gensim LDA/Mallet model -- confirm.
    corpus : iterable
        Corpus passed to ``model[corpus]``.

    Returns
    -------
    dict
        Maps document index -> dict with keys 'Topic' (dominant topic id),
        'TopTermsRough' (top terms as returned by the model), 'TopTerms'
        (the same terms with 'TEXT:' / 'LOG:' / ';TOPIC:<n>' markers removed)
        and 'Theta' (weight of the dominant topic). Entries are ordered by
        'Theta', highest first. Documents with no topic assignment are
        omitted.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    with open(f_model, 'rb') as pkl:
        ldamodel = pickle.load(pkl)

    topic_per_doc = {}
    for doc_idx, row in enumerate(ldamodel[corpus]):
        row = list(row)
        if not row:
            # No topic assigned to this document: nothing to record.
            continue
        # Dominant topic = highest weight; on ties the first one wins, which
        # matches a stable descending sort followed by taking element 0.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num, topn=top_nterms_from_topic)
        top_topic_keywords = [word for word, _ in wp]
        topic_per_doc[doc_idx] = {
            'Topic': topic_num,
            'TopTermsRough': top_topic_keywords,
            # Raw string: '\d' in a normal string literal is an invalid escape.
            'TopTerms': [re.sub(r';TOPIC:\d+', '', x.replace('TEXT:', '').replace('LOG:', ''))
                         for x in top_topic_keywords],
            'Theta': prop_topic,
        }

    # Sort by topic-document matching probability
    return dict(sorted(topic_per_doc.items(), key=lambda tup: tup[1]['Theta'], reverse=True))

In [68]:
def load_topic_from_txt(postIdMathching, top_num_terms=10):
    """
    Find the dominant topic of every document from the text output files.

    Reads ``topickeys.txt`` and ``doctopics.txt`` from
    ``<res_root_dir>/<metric>/model/<lda>`` (``metric`` and ``lda`` are the
    notebook-level loop variables).

    Parameters
    ----------
    postIdMathching : sequence
        Post id of each document, indexed by the document's row number in
        ``doctopics.txt``.
    top_num_terms : int, optional
        Maximum number of top terms kept per topic (a topic line may hold
        fewer terms than requested).

    Returns
    -------
    pandas.DataFrame
        One row per document with columns 'Topic', 'TopTermsRough',
        'TopTerms', 'PostId' and 'Theta', sorted by 'Theta' descending.
        Empty when ``doctopics.txt`` holds no data rows.

    Raises
    ------
    FileNotFoundError
        If one of the two input files does not exist.
    """
    model_dir = os.path.join(res_root_dir, metric, 'model', lda)
    f_doc_topics = os.path.join(model_dir, 'doctopics.txt')
    f_topic_keys = os.path.join(model_dir, 'topickeys.txt')

    # Read top terms per topic.
    # Noted some topic has less than 10 terms (default top N words from mallet output is 20)
    lst_topic_terms = []
    with open(f_topic_keys, 'r') as fr:
        for line in fr:
            if line.strip() == '':
                continue
            # The first two whitespace-separated fields are skipped; the rest are terms.
            top_terms = line.strip('\n').split()[2:][:top_num_terms]
            lst_topic_terms.append({
                # Position among the non-blank lines, so the id always agrees with
                # the doctopics column this entry is looked up by below.
                'Topic': len(lst_topic_terms),
                'TopTermsRough': top_terms,
                # Raw string: '\d' in a normal string literal is an invalid escape.
                'TopTerms': [re.sub(r';TOPIC:\d+', '', x.replace('TEXT:', '').replace('LOG:', ''))
                             for x in top_terms],
            })

    num_topics = len(lst_topic_terms)
    # Load document-topic weights (columns 0-1 are skipped). ndmin=2 keeps the
    # array two-dimensional even when the file holds a single document.
    mt_doc_topics = np.loadtxt(f_doc_topics, dtype="float",
                               usecols=range(2, 2 + num_topics), ndmin=2)
    if mt_doc_topics.size == 0:
        return pd.DataFrame([])
    max_theta_indexes = np.argmax(mt_doc_topics, axis=1)

    res = []
    for doc_id, topic_id in enumerate(max_theta_indexes):
        # TODO: Collect the matching HTML files from remote server
        res.append({
            **lst_topic_terms[topic_id],
            'PostId': postIdMathching[doc_id],
            'Theta': mt_doc_topics[doc_id, topic_id],
        })
    res.sort(key=lambda x: x['Theta'], reverse=True)
    return pd.DataFrame(res)

In [69]:
def load_topic_from_model(postIdMathching):
    """
    Derive the dominant topic per document from the stored model and write
    the result to a CSV file.

    This path is slower than reading the text output, so it is used as the
    fallback. ``metric`` and ``lda`` are the notebook-level loop variables.

    Parameters
    ----------
    postIdMathching : sequence
        Post id of each document, indexed by document number.

    Returns
    -------
    None
        The function only prints a message when the model file is missing;
        otherwise the table is written to disk.
    """
    model_path = os.path.join(res_root_dir, metric, 'model', lda, 'mallet_model.pkl')
    # Nothing to do when the model file is absent.
    if not os.path.isfile(model_path):
        print('Fail to find file %s, continue' % model_path)
        return

    # Corpus first, then the per-document dominant topics from the model.
    documents, corpus, _ = loadCorpus(lda_type=lda, f_model=model_path)
    dominant_topics = loadModel(model_path, corpus)

    # Attach the post id to every record (the records are updated in place).
    rows = []
    for doc_idx, record in dominant_topics.items():
        record['PostId'] = postIdMathching[doc_idx]
        rows.append(record)

    # Output folder is named after the number of top terms, or 'all'.
    if top_nterms_from_topic:
        terms_dir = 'top' + str(top_nterms_from_topic)
    else:
        terms_dir = 'all'
    csv_path = os.path.join(data_root_dir, 'result', 'matching', terms_dir,
                            'top_terms_per_doc_to_html_%s_%s.csv' % (lda, metric))
    csv_dir = os.path.dirname(csv_path)
    if not os.path.isdir(csv_dir):
        os.makedirs(csv_dir)

    pd.DataFrame(rows).to_csv(csv_path, index=False)
    print('Top terms per doc are written into %s' % csv_path)

    # out_dir = os.path.join(*[res_root_dir, 'HTMLs', lda])

    # if isShowHTML:
    #     highlightTopicTokensByDocs(data_dict, topic_per_doc, ntop=top_nterms_from_topic)
    #
    # # postIdToDocTerms is the third item of data_dict which stores the mapping
    # for section in data_dict.keys():
    #     postIds = data_dict[section][2].keys()
    #     saveToMd(documents, postIds, topic_per_doc, 10, isShowHTML)

In [None]:
# Driver: for every metric / LDA-variant pair, write the dominant topic of each
# post to CSV -- from the text output when possible, otherwise from the model.
data_dict = loadDataDict()
postIdMathching = list(data_dict[section_name][0].keys())
isShowHTML = False
# NOTE: the loop variables must stay named `metric` and `lda`; the loader
# functions read them as notebook globals.
for metric in evaluate_metrics:
    for lda in evaluate_ldas:
        print('Processing {}'.format(lda))
        out_dir_root = os.path.join(res_root_dir, metric, '{}', lda)

        if not top_nterms_from_topic:
            print('Number of top terms not specified. Will load all terms per topic from model.')
            load_topic_from_model(postIdMathching)
            continue

        if top_nterms_from_topic > 20:
            print('Cannot load more than top 20 words from TXT. Now load from model.')
            load_topic_from_model(postIdMathching)
            continue

        try:
            df_topic_top_terms = load_topic_from_txt(postIdMathching=postIdMathching,
                                                     top_num_terms=top_nterms_from_topic)
            f_output = os.path.join(data_root_dir, 'result', 'matching',
                                    'top_terms_per_doc_to_html_%s_%s.csv' % (lda, metric))
            output_dir = os.path.dirname(f_output)
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            df_topic_top_terms.to_csv(f_output, index=False)
            print('Writing to %s' % f_output)
        except FileNotFoundError:
            # Text output missing: fall back to the (slower) model-based path.
            print('Fail to load from TXT. Now load from model.')
            load_topic_from_model(postIdMathching)
print('Finished')
