In [81]:
import os
import ast

# # Add ../src to Python path
# import sys
# project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '../src'))
# sys.path.insert(0, project_root)

import pandas as pd
from joblib import Parallel, delayed

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore, HdpModel
from gensim.models.coherencemodel import CoherenceModel

In [98]:
# config
# Root folder of the project data. The default is the original hard-coded
# location; set the NEWSFIN_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
local_data_path = (
	os.environ.get("NEWSFIN_DATA_PATH")
	or "/Users/xwpeng/Projects/Erdos_NewsFin/data"
)
# Vocabulary pruning thresholds, forwarded to Dictionary.filter_extremes.
dict_no_below = 10
dict_no_above = 0.5
dict_keep_n = 10000
# Values passed as the HDP truncation argument `T`; one model is fitted per entry.
topics = [300]
# NOTE(review): `topn` is not referenced by any visible cell — confirm it is still needed.
topn = 100

# Number of articles sampled from the concatenated data set.
keep = 100000

In [83]:
# load data
# One CSV per year, all stored in the same folder below the data root.
dataset = ["2016.csv", "2017.csv", "2018.csv"]
news_dir = os.path.join(local_data_path, "data_clean/All_news_years")
yearly_frames = [
	pd.read_csv(os.path.join(news_dir, csv_name))
	for csv_name in dataset
]
# Stack the yearly frames into one frame with a fresh 0..n-1 index.
data_all = pd.concat(yearly_frames, ignore_index=True)

In [84]:
# sample data
# Draw a reproducible random subset of at most `keep` articles. The size is
# capped at the number of available rows because DataFrame.sample (without
# replacement) raises ValueError when asked for more rows than exist.
data = data_all.sample(n=min(keep, len(data_all)), random_state=42)

In [85]:
# Select the columns
def _parse_tokens(raw):
	"""Return the tokens of one article as a list of ``str``.

	``raw`` is either the string form of a Python literal (as read from the
	CSV) or an already-parsed sequence of tokens. Accepting both keeps this
	cell safe to re-run: ``ast.literal_eval`` would raise on a value that has
	already been converted to a list.
	"""
	tokens = ast.literal_eval(raw) if isinstance(raw, str) else raw
	return [str(token) for token in tokens]


# Keep only the publication date and the token column, ordered by date and
# re-indexed from 0.
data = (
	data[["date", "token"]]
	.sort_values(by="date")
	.reset_index(drop=True)
)
data["token"] = data["token"].apply(_parse_tokens)

In [86]:
# Training data
# NOTE(review): the name `dict` shadows the Python builtin for the rest of the
# notebook; it is kept here because later cells refer to it by this name.
token_docs = data["token"]
dict = Dictionary(token_docs)
# Prune the vocabulary using the thresholds from the config cell.
dict.filter_extremes(no_below=dict_no_below, no_above=dict_no_above, keep_n=dict_keep_n)
# Bag-of-words representation of every document, in the same order as `data`.
corpus = list(map(dict.doc2bow, token_docs))

In [108]:
# loop over different pre-defined number of topics
# For every value in `topics`: fit an HDP model with that truncation level,
# convert it to the LDA model gensim suggests, record its c_v coherence, and
# summarise how many documents fall under each dominant topic.
coherence_score = []
describe = []
for t in topics:
	# fit the HDP model
	# random_state seeds gensim's RNG (same value as the sampling cell) so
	# that repeated runs of this cell are comparable.
	hdp_model = HdpModel(
		corpus = corpus,
		id2word = dict,
		T = t,
		alpha = 1,
		gamma = 50,
		random_state = 42)
	model = hdp_model.suggested_lda_model()
	# compute the coherence score
	coherence_model = CoherenceModel(
		model = model,
		texts = data["token"],
		corpus = corpus,
		dictionary = dict,
		coherence = 'c_v')
	coherence_score.append(round(coherence_model.get_coherence(), 4))
	# predict the topics
	topics_inference = model.get_document_topics(corpus)
	# Dominant topic of a document = id of the (topic_id, probability) pair
	# with the highest probability. max() returns the first maximum, which is
	# the pair a stable descending sort would place first, so no per-document
	# sort (or process pool) is needed. Documents for which no topic is
	# returned (empty list) are skipped instead of raising IndexError.
	key_topic = [
		max(doc_topics, key=lambda pair: pair[1])[0]
		for doc_topics in topics_inference
		if doc_topics
	]
	# number of documents per dominant topic, then summary statistics of
	# those cluster sizes
	topic_sizes = pd.DataFrame({"key_topic": key_topic}).groupby("key_topic").size()
	describe.append(topic_sizes.describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]))



In [88]:
# c_v coherence scores rounded to 4 decimals, one per entry of `topics` in loop order.
# NOTE(review): the saved output lists six scores, but the config cell defines a
# single-entry `topics` — this output appears to come from an earlier configuration.
coherence_score

[0.3232, 0.3157, 0.3129, 0.3185, 0.3131, 0.3123]

In [89]:
# Summary statistics of dominant-topic cluster sizes, one column per fitted model.
pd.concat(describe, axis="columns")

Unnamed: 0,0,1,2,3,4,5
count,16.0,24.0,33.0,67.0,85.0,99.0
mean,6250.0,4166.666667,3030.30303,1492.537313,1176.470588,1010.10101
std,10960.74849,6959.316069,5173.085398,4811.539366,3333.903306,2782.151372
min,1.0,1.0,1.0,1.0,1.0,1.0
10%,1.0,1.0,1.0,1.0,1.0,1.0
20%,1.0,1.0,1.0,1.0,1.0,1.0
30%,1.0,1.0,1.6,1.0,1.0,1.0
40%,8.0,1.2,2.0,1.0,1.0,1.0
50%,15.0,2.0,3.0,1.0,1.0,2.0
60%,33.0,2.0,4.4,2.0,2.0,2.0


In [95]:
# c_v coherence scores rounded to 4 decimals, one per entry of `topics` in loop order.
# NOTE(review): the saved output lists three scores, but the config cell defines a
# single-entry `topics` — this output appears to come from an earlier configuration.
coherence_score

[0.3107, 0.3133, 0.3173]

In [96]:
# Summary statistics of dominant-topic cluster sizes, one column per fitted model.
pd.concat(describe, axis="columns")

Unnamed: 0,0,1,2
count,20.0,53.0,93.0
mean,5000.0,1886.792453,1075.268817
std,7722.54671,4326.121421,3033.715859
min,1.0,1.0,1.0
10%,1.0,1.0,1.0
20%,1.0,1.0,1.0
30%,1.0,1.0,1.0
40%,1.6,1.0,1.0
50%,3.0,1.0,2.0
60%,11.4,2.0,3.0


In [109]:
# c_v coherence scores rounded to 4 decimals, one per entry of `topics` in loop order.
coherence_score

[0.3127]

In [112]:
# Summary statistics of dominant-topic cluster sizes, one column per fitted model.
pd.concat(describe, axis="columns")

Unnamed: 0,0
count,49.0
mean,2040.816327
std,4131.089675
min,1.0
10%,1.0
20%,1.0
30%,1.0
40%,1.0
50%,2.0
60%,3.8


In [106]:
# c_v coherence scores rounded to 4 decimals, one per entry of `topics` in loop order.
# NOTE(review): this cell's execution count (106) precedes that of the model-fitting
# loop (108), so the saved output presumably reflects an earlier run — confirm.
coherence_score

[0.313]

In [107]:
# Summary statistics of dominant-topic cluster sizes, one column per fitted model.
pd.concat(describe, axis="columns")

Unnamed: 0,0
count,76.0
mean,1315.789474
std,3518.176362
min,1.0
10%,1.0
20%,1.0
30%,1.0
40%,1.0
50%,1.0
60%,2.0
