In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import operator

In [None]:
# Load the train and test tables; the first column of each file becomes the row index.
# Later cells treat the columns as vocabulary words and the index as document/file names.
# NOTE(review): the files have a .tsv extension but no `sep` is passed, so read_csv
# parses them with its default comma separator — confirm the files are comma-delimited.
term_document_train = pd.read_csv("assets/td_train.tsv", index_col=0)
term_document_test = pd.read_csv("assets/td_test.tsv", index_col=0)

In [None]:
def top_words_per_topic(lda, feature_names=None, n_top=10):
    """Print the highest-weighted words of every topic of a fitted LDA model.

    Words are listed from the heaviest to the lightest, the same "best first"
    order that ``top_documents_per_topic`` uses for documents.

    Parameters
    ----------
    lda : object
        Fitted model exposing ``components_`` (one row of word weights per topic).
    feature_names : sequence, optional
        Word belonging to each column of ``components_``. When omitted, the
        columns of the notebook-level ``term_document_train`` frame are used.
    n_top : int, default 10
        Number of words printed per topic.

    Returns
    -------
    None
        The function only prints.
    """
    if feature_names is None:
        # Fall back to the vocabulary the notebook trains on.
        feature_names = term_document_train.columns
    words = list(feature_names)
    for key, topic in enumerate(lda.components_):
        print(f"Топ {n_top} слов для топика {key}:")
        # argsort is ascending, so reverse it before cutting to get heaviest-first.
        ranked = np.argsort(topic)[::-1][:n_top]
        print([words[index] for index in ranked])
    print("")

In [None]:
def top_documents_per_topic(lda, pred, document_names=None, n_top=5):
    """Print, for every topic, the documents with the largest topic weight.

    The printed table has one row per topic (labelled ``Topic0``, ``Topic1``, ...)
    and one column per rank (``'1'`` is the strongest document).

    Parameters
    ----------
    lda : object
        Model exposing ``n_components``; used to label the topic rows.
    pred : array-like
        Document-topic weights, one row per document and one column per topic
        (the function ranks along the document axis).
    document_names : sequence, optional
        Name of each row of ``pred``. When omitted, the index of the
        notebook-level ``term_document_test`` frame is used.
    n_top : int, default 5
        Number of documents listed per topic; reduced automatically when
        ``pred`` holds fewer documents, instead of failing.

    Returns
    -------
    None
        The function only prints.
    """
    if document_names is None:
        # Fall back to the test set the notebook transforms.
        document_names = term_document_test.index
    names = np.asarray(document_names, dtype=object)
    weights = np.asarray(pred)

    # Never ask for more ranks than there are documents.
    n_top = max(0, min(n_top, weights.shape[0]))

    # Sort each topic column ascending, flip to descending, keep the first n_top rows;
    # transposing afterwards yields one row of document positions per topic.
    order = np.argsort(weights, axis=0)[::-1][:n_top].T

    df_topic_document = pd.DataFrame(
        names[order],
        index=['Topic' + str(i) for i in range(lda.n_components)],
        columns=[str(rank) for rank in range(1, n_top + 1)],
    )
    print(df_topic_document.to_string(), end='\n\n')

In [None]:
# Rounded test-set perplexity keyed by number of topics; filled in by calculate_LDA.
perplexities = {}


def calculate_LDA(n, random_state=0):
    """Fit an ``n``-topic LDA on the training table and report it on the test table.

    Side effects: prints the test perplexity, stores it (rounded to two decimals)
    in the module-level ``perplexities`` dict under key ``n``, then prints the top
    words and top documents of every topic.

    Parameters
    ----------
    n : int
        Number of topics (``n_components``).
    random_state : int or None, default 0
        Seed handed to the model so that repeated runs, and runs with different
        ``n``, are reproducible and comparable. Pass ``None`` for an unseeded fit.

    Returns
    -------
    None
    """
    lda = LDA(n_components=n, random_state=random_state)
    lda.fit(term_document_train)
    pred = lda.transform(term_document_test)
    perplexity = lda.perplexity(term_document_test)
    print(f"Topic number: {lda.n_components}, perplexity: {perplexity}")
    perplexities[n] = round(perplexity, 2)

    top_words_per_topic(lda)
    top_documents_per_topic(lda, pred)

In [None]:
# 2-topic run: prints perplexity and the top words/documents, records perplexities[2].
calculate_LDA(2)

In [None]:
# 5-topic run: prints perplexity and the top words/documents, records perplexities[5].
calculate_LDA(5)

In [None]:
# 10-topic run: prints perplexity and the top words/documents, records perplexities[10].
calculate_LDA(10)

In [None]:
# 20-topic run: prints perplexity and the top words/documents, records perplexities[20].
calculate_LDA(20)

In [None]:
# 40-topic run: prints perplexity and the top words/documents, records perplexities[40].
calculate_LDA(40)

In [None]:
# Display the collected {number of topics: rounded test perplexity} mapping.
perplexities

In [None]:
# Plot test perplexity against the number of topics.
# The dict keeps insertion order, i.e. the order in which the sweep cells were run;
# sorting by topic count keeps the curve left-to-right even after out-of-order runs.
n_topics = sorted(perplexities)
perplexity = [perplexities[n] for n in n_topics]
plt.figure(figsize=(16, 10))
plt.plot(n_topics, perplexity)
plt.xlabel("Number of topics")
plt.ylabel("Perplexity (test set)")
plt.title("LDA perplexity vs. number of topics")
plt.grid(True)
plt.show()

In [None]:
# Choose the polynomial degree whose fit to (n_topics, perplexity) has the highest R².
# The running best is kept in `best_r2` rather than `max`, so the builtin `max` is not
# shadowed for the rest of the kernel session.
# NOTE(review): R² is measured on the same points the polynomial is fitted to, so it
# cannot decrease as the degree grows — this selection favours the highest degree tried.
best_r2 = 0.0
best_degree = 3  # fallback used when no candidate degree beats an R² of 0
# A polynomial of degree >= number of points is not uniquely determined (polyfit
# reports a rank-deficient fit), so only try degrees up to len(points) - 1, at most 5.
highest_degree = min(5, len(n_topics) - 1)
for degree in range(1, highest_degree + 1):
    model = np.poly1d(np.polyfit(n_topics, perplexity, degree))
    r2 = r2_score(perplexity, model(n_topics))
    if r2 > best_r2:
        best_r2 = r2
        best_degree = degree
print(best_degree)

In [None]:
# Refit with the selected degree and draw the fitted curve over the measured points.
model = np.poly1d(np.polyfit(n_topics, perplexity, best_degree))
# Dense grid from 1 to slightly past the largest topic count that was tried
# (equals the former fixed range 1..42 when the largest run has 40 topics).
line = np.linspace(1, np.max(n_topics) + 2)
plt.scatter(n_topics, perplexity, label="measured")
plt.plot(line, model(line), label=f"polynomial fit (degree {best_degree})")
plt.xlabel("Number of topics")
plt.ylabel("Perplexity (test set)")
plt.legend()
plt.show()

In [None]:
# Compare 20-topic models trained with different iteration budgets and keep the
# document-topic matrix of the one with the lowest (rounded) test perplexity.
# Loop-local names are chosen so that neither the builtin `iter` nor the `perplexity`
# list used by the curve-fitting cells is overwritten.
runs = []
for n_iter in [5, 10, 20]:
    # Fixed seed: differences between runs then come from max_iter, not from the
    # random initialisation.
    lda_model = LDA(n_components=20, max_iter=n_iter, random_state=0)
    lda_model.fit(term_document_train)
    test_perplexity = lda_model.perplexity(term_document_test)
    print(f"N_components: {lda_model.n_components}, max_iter: {n_iter}, perplexity: {test_perplexity}")
    doc_topics = lda_model.transform(term_document_test)
    runs.append((round(test_perplexity, 2), doc_topics, n_iter))
# Tuples are (rounded perplexity, document-topic matrix, max_iter), sorted best first;
# the sort is stable, so on ties the smaller max_iter is kept.
result = sorted(runs, key=operator.itemgetter(0))
print(f"Best perplexity = {result[0][0]} ({result[0][2]} iter)")
best_pred = result[0][1]

In [None]:
# Export the best model's document-topic weights: one line per test document,
# "<document name>\t<w0>\t<w1>...", each weight rounded to three decimals.
# The encoding is pinned to UTF-8 so non-ASCII document names are written the same
# way on every platform instead of depending on the locale default.
with open("assets/test_topics.tsv", 'w', encoding="utf-8") as file:
    for predict, filename in zip(best_pred, term_document_test.index):
        weights = '\t'.join(str(round(weight, 3)) for weight in predict)
        # str() keeps this working when the index labels are not strings.
        file.write(str(filename) + '\t' + weights + '\n')