In [None]:
from load_data.utils import process_all_documents
from load_data import load_dataframe, process_text
from plots import plot_counter_lemmas
from plots.topics import interactive_exploration, plot_topics, plot_similarities
from topic_modeling.utils import check_coherence_for_topics_num
from topic_modeling.lda_model import find_best_model, find_best_topics_num
from topic_modeling.topic_probs import (
    get_similarities,
    get_topic_probs,
    calculate_distance_matrix,
    calculate_linkage_matrix,
    topic_probs_by_column_binded,
    tsne_dim_reduction,
)

import pandas as pd
from collections import Counter


In [None]:
# Short section label -> list of heading spellings grouped under that label
# (one label may have several spellings, e.g. "MI" vs "minimum income").
# NOTE(review): presumably these are the headings searched for in the report
# texts by process_all_documents — confirm against that helper.
paragraphs_names = {
    "Summary": ["Summary"],
    "National schemes": [
        "Description of national schemes providing minimum income support",
    ],
    "Links with labour market activation": [
        "Links with labour market activation",
    ],
    "Links to social services": [
        "Links to social services and integrated provision of targeted social services",
    ],
    "Governance mechanisms": ["Governance mechanisms"],
    "Impact of minimum income schemes": [
        "Impact of minimum income schemes",
        "Impact of MI schemes",
    ],
    "Sources": ["Sources"],
}

In [None]:
# Paths used by the extraction step and by the CSV round-trip below.
# NOTE(review): the two 'ue_raports/...' values look like an input folder and a
# folder for extracted text files — confirm against process_all_documents.
REPORTS_DIR = 'ue_raports/'
TXT_FILES_DIR = 'ue_raports/txt_files/'
DOCUMENTS_CSV = 'documents.csv'

# Build the documents frame. 'Sources' is also the last key of paragraphs_names.
# NOTE(review): presumably it marks the closing section of each report — confirm.
df = process_all_documents(
    REPORTS_DIR,
    paragraphs_names,
    TXT_FILES_DIR,
    'Sources',
)

# Write the frame to CSV, read it back through the project loader, then run the
# text-processing step on the reloaded frame.
df.to_csv(DOCUMENTS_CSV)
dft = load_dataframe(DOCUMENTS_CSV)
processed_df = process_text(dft)

In [None]:
# Column -> required value. Later cells keep only the rows of processed_df
# whose 'paragraph' column equals 'National schemes'.
filter_dict = dict(paragraph="National schemes")

# Lemma-count plot for the rows selected by filter_dict.
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# Boolean mask: True for rows whose columns match EVERY key/value pair in
# filter_dict (the comparison aligns the Series index with the column names).
row_matches = (processed_df[list(filter_dict)] == pd.Series(filter_dict)).all(axis=1)

# Select rows and the "lemmas" column in a single .loc call (no chained indexing).
filtered_lemmas = processed_df.loc[row_matches, "lemmas"]

# Count lemmas by streaming over each row's lemma sequence rather than
# concatenating all rows with Series.sum(): repeated concatenation is quadratic
# in the total number of lemmas, and on an empty selection Series.sum() returns
# 0, which Counter() cannot iterate (TypeError).
# assumes every "lemmas" entry is an iterable of lemma strings — TODO confirm
lemma_counts = Counter(
    lemma for doc_lemmas in filtered_lemmas for lemma in doc_lemmas
)

# The 8 most frequent lemmas as (lemma, count) pairs, and just the lemmas.
counter = lemma_counts.most_common(8)
common_words = [word for word, _ in counter]

# Pair of topic-count bounds handed to the coherence/model-selection helpers.
# NOTE(review): whether the upper bound (11) is inclusive depends on those helpers.
topic_numbers_range = (2, 11)

In [None]:
# Run the coherence check over topic_numbers_range for the rows selected by
# filter_dict, passing along the most frequent lemmas found above.
# NOTE(review): common_words is presumably used as an exclusion list — confirm.
coherence_results = check_coherence_for_topics_num(
    processed_df, filter_dict, common_words, topic_numbers_range
)

# The helper returns a 5-tuple. Note that this rebinds filtered_lemmas, which
# was already assigned in an earlier cell.
filtered_lemmas, models, encoded_docs, lemmas_dictionary, cvs = coherence_results


In [None]:
# Choose the number of topics from the coherence results (cvs) computed over
# topic_numbers_range.
num_topics = find_best_topics_num(
    cvs,
    topic_numbers_range,
)

# Select the LDA model for the same coherence results and topic range.
# NOTE(review): presumably the model matching num_topics — confirm in find_best_model.
lda_model = find_best_model(
    encoded_docs,
    lemmas_dictionary,
    cvs,
    topic_numbers_range,
)

In [None]:
# Interactive view of the selected model; kept as the last expression of the
# cell so that whatever the helper returns is displayed.
interactive_exploration(
    lda_model,
    encoded_docs,
    lemmas_dictionary,
)

In [None]:
# Half of the topic count, truncated to an int; used both as a positional
# argument and to scale the figure height (5 units per step).
# NOTE(review): if the two numeric arguments are a rows x 2-columns grid, an odd
# num_topics yields one panel fewer than topics — confirm against plot_topics.
half_topics = int(num_topics / 2)
figure_size = (12, 5 * half_topics)

fig = plot_topics(
    lda_model,
    filtered_lemmas,
    half_topics,
    2,
    " ",
    figure_size,
)

In [None]:
# Topic probabilities for the documents selected by filter_dict; the helper
# returns a pair (modeling_results, topic_probs).
modeling_results, topic_probs = get_topic_probs(
    processed_df,
    filter_dict,
    lda_model,
    num_topics,
    encoded_docs,
)

# Three structures derived from the same topic_probs, used by later cells.
linkage = calculate_linkage_matrix(topic_probs)
similarities = get_similarities(topic_probs)
dist_matrix = calculate_distance_matrix(topic_probs)

In [None]:
# Plot the similarities computed from topic_probs together with the linkage
# matrix; kept as the last expression of the cell so its result is displayed.
plot_similarities(
    similarities,
    topic_probs,
    linkage,
)

In [None]:
# Topic probabilities combined per value of the 'country' column.
# NOTE(review): the exact aggregation is defined in topic_probs_by_column_binded.
topics_by_country = topic_probs_by_column_binded(
    modeling_results,
    num_topics,
    column='country',
)

# t-SNE dimensionality reduction of the per-country result, with perplexity 10.
tsne_result = tsne_dim_reduction(
    topics_by_country,
    num_topics,
    perplexity=10,
)