In [1]:
from load_data import load_processed_data
from topic_modeling.model_optimizer import ModelOptimizer, save_data_for_app
from necp_reports import COMMON_WORDS, PARAGRAPHS, STOPWORDS, DIMENSIONS

from sentence_topic_analyser import SentenceTopicAnalyser
import json
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the report data, given relative to the notebook's working directory.
data_path = './necp_reports/'
# Load the already-processed data, handing the project's STOPWORDS to the loader.
# NOTE(review): presumably returns a pandas DataFrame (it is later passed to
# ModelOptimizer as its first argument) — confirm against load_data.
processed_df = load_processed_data(data_path, stop_words=STOPWORDS)

In [3]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values into plain Python equivalents.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps``.

    Conversions performed by :meth:`default`:
        * ``np.bool_``    -> ``bool``
        * ``np.integer``  -> ``int``
        * ``np.floating`` -> ``float``
        * ``np.ndarray``  -> nested ``list`` (via ``ndarray.tolist()``)

    Any other unsupported object is delegated to the base class, which raises
    ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Called by the ``json`` machinery only for objects it cannot serialise
        on its own.
        """
        # np.bool_ is neither a Python bool nor an np.integer, so without this
        # branch a NumPy boolean would fall through and raise TypeError.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            # str(obj) would be an alternative if a textual value is preferred.
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Let the base implementation raise the standard TypeError.
        return super().default(obj)

In [5]:
save_path="./necp_results/"
lda_alpha = 100

# Subsections whose text is summarised in the per-dimension export.
SUMMARY_SUBSECTIONS = ["National Objectives and Targets", "Policies and Measures"]


def _save_country_essentials(model_optimizer, df_to_summarize, label, path, alpha):
    """Process the per-country token documents and dump the result as JSON.

    Args:
        model_optimizer: The ModelOptimizer handed to SentenceTopicAnalyser.
        df_to_summarize: Frame with at least ``country`` and ``tokens`` columns.
        label: Dimension or paragraph name; spaces become underscores in the
            output file name.
        path: Output directory prefix (concatenated as-is, so it must end with
            a path separator).
        alpha: Value used as the file-name prefix.

    Returns:
        dict mapping each country to its entry from ``process_documents``.
    """
    sentence_topic_analyser = SentenceTopicAnalyser(model_optimizer)

    # One document per country: the tokens of all its rows, concatenated.
    tokens_by_country = df_to_summarize.groupby("country")['tokens'].sum()
    sentences_processed = sentence_topic_analyser.process_documents(tokens_by_country)

    # Take the labels from the grouped Series itself. groupby() sorts its keys
    # (and drops NaN), whereas ``country.unique()`` keeps order of appearance,
    # so zipping with unique() can attach sentences to the wrong country.
    # NOTE(review): assumes process_documents returns one result per input
    # document, in input order — confirm in sentence_topic_analyser.
    country_sentence_dict = dict(zip(tokens_by_country.index, sentences_processed))

    output_file = path + f"{alpha}_{label.replace(' ', '_')}_essentials.json"
    # Context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as handle:
        json.dump(country_sentence_dict, handle, cls=NpEncoder)
    return country_sentence_dict


# NOTE(review): (5, 6) is ModelOptimizer's fourth positional argument and
# perplexity=20 goes to save_data_for_app; their exact meaning is defined in
# topic_modeling.model_optimizer — consider naming them as constants.

# One model + export per energy-union dimension, summarising only the selected subsections.
for dimension_number, dimension in enumerate(DIMENSIONS):
    filter_dict = {'energy_union_dimension': dimension}
    model_optimizer = ModelOptimizer(processed_df, filter_dict, COMMON_WORDS[dimension], (5,6), lda_alpha=lda_alpha)
    save_data_for_app(model_optimizer, path=save_path, perplexity=20)

    df_to_summarize = model_optimizer.data.loc[model_optimizer.data['subsection'].isin(SUMMARY_SUBSECTIONS)]
    country_sentence_dict = _save_country_essentials(model_optimizer, df_to_summarize, dimension, save_path, lda_alpha)

# One model + export per paragraph (subsection), summarising all of the model's data.
for paragraph_number, paragraph in enumerate(PARAGRAPHS):
    filter_dict = {'subsection': paragraph}
    model_optimizer = ModelOptimizer(processed_df, filter_dict, COMMON_WORDS[paragraph], (5,6), lda_alpha=lda_alpha)
    save_data_for_app(model_optimizer, path=save_path, perplexity=20)

    df_to_summarize = model_optimizer.data
    country_sentence_dict = _save_country_essentials(model_optimizer, df_to_summarize, paragraph, save_path, lda_alpha)

100%|██████████| 1/1 [00:19<00:00, 19.57s/it]
100%|██████████| 1/1 [00:34<00:00, 34.57s/it]


[t-SNE] Computing 24 nearest neighbors...
[t-SNE] Indexed 25 samples in 0.000s...
[t-SNE] Computed neighbors for 25 samples in 0.020s...
[t-SNE] Computed conditional probabilities for sample 25 / 25
[t-SNE] Mean sigma: 0.562333
[t-SNE] KL divergence after 250 iterations with early exaggeration: 47.040001
[t-SNE] KL divergence after 1000 iterations: 0.428693


100%|██████████| 1/1 [00:12<00:00, 12.51s/it]
100%|██████████| 1/1 [00:26<00:00, 26.39s/it]


[t-SNE] Computing 25 nearest neighbors...
[t-SNE] Indexed 26 samples in 0.000s...
[t-SNE] Computed neighbors for 26 samples in 0.016s...
[t-SNE] Computed conditional probabilities for sample 26 / 26
[t-SNE] Mean sigma: 0.281031
[t-SNE] KL divergence after 250 iterations with early exaggeration: 42.263252
[t-SNE] KL divergence after 800 iterations: 0.295649


100%|██████████| 1/1 [00:10<00:00, 10.18s/it]
100%|██████████| 1/1 [00:13<00:00, 13.98s/it]


[t-SNE] Computing 25 nearest neighbors...
[t-SNE] Indexed 26 samples in 0.000s...
[t-SNE] Computed neighbors for 26 samples in 0.009s...
[t-SNE] Computed conditional probabilities for sample 26 / 26
[t-SNE] Mean sigma: 0.131894
[t-SNE] KL divergence after 250 iterations with early exaggeration: 42.790466
[t-SNE] KL divergence after 900 iterations: 0.012158


100%|██████████| 1/1 [00:12<00:00, 12.58s/it]
100%|██████████| 1/1 [00:16<00:00, 16.09s/it]


[t-SNE] Computing 25 nearest neighbors...
[t-SNE] Indexed 26 samples in 0.000s...
[t-SNE] Computed neighbors for 26 samples in 0.008s...
[t-SNE] Computed conditional probabilities for sample 26 / 26
[t-SNE] Mean sigma: 0.191513
[t-SNE] KL divergence after 250 iterations with early exaggeration: 41.921574
[t-SNE] KL divergence after 800 iterations: 0.014935


100%|██████████| 1/1 [00:08<00:00,  8.60s/it]
100%|██████████| 1/1 [00:10<00:00, 10.03s/it]


[t-SNE] Computing 25 nearest neighbors...
[t-SNE] Indexed 26 samples in 0.000s...
[t-SNE] Computed neighbors for 26 samples in 0.009s...
[t-SNE] Computed conditional probabilities for sample 26 / 26
[t-SNE] Mean sigma: 0.231719
[t-SNE] KL divergence after 250 iterations with early exaggeration: 45.500134
[t-SNE] KL divergence after 1000 iterations: 0.012732


