# Training a Unigram `LdaMulticore` model

## Importing modules & defining globals


In [1]:
import os
import pickle
import numpy as np
import pandas as pd

from ast import literal_eval
from gensim.models import LdaMulticore, CoherenceModel

# Import the project helper by temporarily switching the working directory to
# the parent folder (presumably where `lda_utils` lives; this relies on the
# current directory being searched for imports).
cwd = os.getcwd()
os.chdir("..")
try:
    from lda_utils import get_top_n_topics
finally:
    # Always return to the original directory, even when the import fails.
    # Otherwise the kernel stays one level up, every re-run of this cell climbs
    # further, and the relative data paths below point at the wrong place.
    os.chdir(cwd)

# Data locations, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/raw"
PKL_DATA_PATH = "../data/pickles"


## Loading preprocessed dataset


In [2]:
# Read the cleaned corpus, indexed by article id.
corpus_csv = os.path.join(RAW_DATA_PATH, "corpus_clean.csv")
corpus_frame = pd.read_csv(corpus_csv, index_col=["ArticleID"])

# Each "document" cell is parsed from its string form back into a Python
# object with literal_eval (token lists, judging by the preview below).
corpus = corpus_frame["document"].apply(literal_eval)
corpus.head()


ArticleID
34153941    [stable, coordination, variability, overground...
34153942    [weak, hip, strength, increase, dynamic, knee,...
34153964    [current, future, projection, amyotrophic, lat...
34153968    [disparity, asian, non, asian, thrombolyzed, a...
34153978    [maternal, factor, predict, loss, follow, newb...
Name: document, dtype: object

## Loading hyperparameters


In [3]:
# Hyperparameters are stored as a NumPy array on disk: position 0 is used as
# the number of topics `k` (cast to int), position 1 as the prior `a`.
hyperparams_file = os.path.join(RAW_DATA_PATH, "hyperparameters.npy")
h_params = np.load(hyperparams_file)
k, a = int(h_params[0]), h_params[1]
(k, a)


(7, 0.56)

## Load arguments required by the model

The model requires two arguments: the id-to-word dictionary (`id2word`) and the bag-of-words corpus (`bow`).


In [4]:
# Load every pickled artefact found under PKL_DATA_PATH into `params`, keyed by
# the file name without its ".pkl" extension (e.g. "bow.pkl" -> params["bow"]).
#
# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# point this at files produced by this project.
# NOTE(review): the last cell of this notebook writes "lda_model.pkl" into this
# same directory, so on a re-run that file is picked up here as well. Confirm
# that is intended.
params = {}
for dirpath, _, filenames in os.walk(PKL_DATA_PATH):
    for fname in filenames:
        if not fname.endswith(".pkl"):
            continue
        # os.walk descends into sub-directories, so the file must be joined
        # with the directory currently being walked, not with the root.
        path = os.path.join(dirpath, fname)
        # splitext drops only the trailing extension; a ".pkl" occurring
        # elsewhere in the name is kept in the key.
        key = os.path.splitext(fname)[0]
        # The context manager closes the handle even if unpickling fails.
        with open(path, "rb") as file_ref:
            params[key] = pickle.load(file_ref)


## Building a unigram model


In [5]:
# Model settings gathered in one place: topic count and alpha come from the
# loaded hyperparameters; the fixed random_state keeps runs repeatable.
lda_settings = {
    "num_topics": k,
    "id2word": params["id2word"],
    "eta": "auto",
    "alpha": a,
    "random_state": 42,
}

# Fit the unigram LDA model on the bag-of-words corpus.
lda_model = LdaMulticore(params["bow"], **lda_settings)


## Assigning topics

Now we use the model to assign topics to each record (document/article).


In [6]:
# How many topics to report per document (passed through to the helper).
n = 3

# get_top_n_topics is a project helper from lda_utils; it receives the
# bag-of-words corpus, the fitted model and `n`.
bow_corpus = params["bow"]
df_topics = get_top_n_topics(bow_corpus, lda_model, n)
df_topics.head()


Unnamed: 0_level_0,topic_1,topic_1_prop,topic_2,topic_2_prop,topic_3,topic_3_prop
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
34153941,5,0.249494,1.0,0.210527,0.0,0.196882
34153942,5,0.45984,2.0,0.289519,4.0,0.101766
34153964,3,0.65111,0.0,0.111941,1.0,0.082597
34153968,0,0.90958,1.0,0.030253,5.0,0.024919
34153978,4,0.552593,0.0,0.296178,3.0,0.087066


## Saving findings


In [7]:
# Persist the per-document topic assignments as CSV.
topics_csv = os.path.join(RAW_DATA_PATH, "topics.csv")
df_topics.to_csv(topics_csv)


## Evaluating model


In [8]:
# Score the fitted model with the "u_mass" coherence measure, using the parsed
# documents and the same id2word dictionary the model was built with.
co_model = CoherenceModel(
    model=lda_model,
    texts=corpus,
    dictionary=params["id2word"],
    coherence="u_mass",
)
co_model.get_coherence()


-1.5014013547438871

## Saving the model


In [9]:
# Store the trained model alongside the other pickled artefacts.
model_path = os.path.join(PKL_DATA_PATH, "lda_model.pkl")
lda_model.save(model_path)
