In [1]:
from load_data import load_processed_data
from topic_modeling.model_optimizer import ModelOptimizer, save_data_for_app
from social_reports import COMMON_WORDS, PARAGRAPHS, STOPWORDS

from sentence_topic_analyser import SentenceTopicAnalyser
import json
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the social report data handed to the loader.
data_path = './social_reports/'
# Load the processed reports, forwarding the project's STOPWORDS as `stop_words`.
# `processed_df` is the input to every ModelOptimizer built in the export loop below.
# NOTE(review): presumably a pandas DataFrame — confirm against load_processed_data.
processed_df = load_processed_data(data_path, stop_words=STOPWORDS)

In [3]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python values.

    Use it as ``json.dump(obj, fp, cls=NpEncoder)`` or
    ``json.dumps(obj, cls=NpEncoder)``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Args:
            obj: A value the standard encoder could not serialize.

        Returns:
            ``bool`` for NumPy booleans, ``int`` for NumPy integers, ``float``
            for NumPy floating-point scalars, and a (nested) ``list`` for
            ``np.ndarray``.

        Raises:
            TypeError: If ``obj`` is none of the supported NumPy types (raised
                by the base-class ``default``).
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it NumPy booleans fall through to the base class and raise TypeError.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Let the base class raise the standard TypeError for unsupported types.
        return super().default(obj)

In [4]:
# Output directory. File names are appended by plain string concatenation,
# so the trailing slash must be kept.
save_path = "./social_results/"
# Forwarded to ModelOptimizer and used as the prefix of each essentials file name.
lda_alpha = 100

# For every paragraph type: fit/optimize a model on the matching subset, save the
# data for the app, then write one "<alpha>_<paragraph>_essentials.json" file that
# maps each country to its processed sentences.
# `paragraph_number` is not used in the body; it is kept so the name stays defined
# after the loop, as it was before.
for paragraph_number, paragraph in enumerate(PARAGRAPHS):
    filter_dict = {'paragraph': paragraph}
    # NOTE(review): (5, 6) is presumably the topic-count range to search — confirm
    # against ModelOptimizer.
    model_optimizer = ModelOptimizer(processed_df, filter_dict, COMMON_WORDS[paragraph], (5, 6), lda_alpha=lda_alpha)
    # NOTE(review): `perplexity` looks like the t-SNE perplexity (the cell output
    # shows t-SNE logs) — confirm against save_data_for_app.
    save_data_for_app(model_optimizer, path=save_path, perplexity=16)

    sentence_topic_analyser = SentenceTopicAnalyser(model_optimizer)
    df_to_summarize = model_optimizer.data

    # Concatenate the tokens of each country. groupby sorts its keys (and drops
    # NaN keys) by default, so the country labels must come from this result's
    # index; `country.unique()` is in order of first appearance and would pair
    # sentences with the wrong country whenever the frame is not already sorted.
    tokens_by_country = df_to_summarize.groupby("country")['tokens'].sum()
    # Assumes process_documents returns one result per input document, in input
    # order — the same assumption the zip below has always relied on.
    sentences_processed = sentence_topic_analyser.process_documents(tokens_by_country)
    country_sentence_dict = dict(zip(tokens_by_country.index, sentences_processed))

    essentials_path = save_path + f"{lda_alpha}_{paragraph.replace(' ', '_')}_essentials.json"
    # Context manager guarantees the file is flushed and closed on every iteration.
    with open(essentials_path, 'w') as essentials_file:
        json.dump(country_sentence_dict, essentials_file, cls=NpEncoder)

100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
100%|██████████| 1/1 [00:02<00:00,  2.41s/it]


[t-SNE] Computing 24 nearest neighbors...
[t-SNE] Indexed 25 samples in 0.001s...
[t-SNE] Computed neighbors for 25 samples in 0.136s...
[t-SNE] Computed conditional probabilities for sample 25 / 25
[t-SNE] Mean sigma: 0.004333
[t-SNE] KL divergence after 250 iterations with early exaggeration: 43.454979
[t-SNE] KL divergence after 1000 iterations: 0.040317


100%|██████████| 1/1 [00:02<00:00,  2.04s/it]
100%|██████████| 1/1 [00:02<00:00,  2.65s/it]


[t-SNE] Computing 23 nearest neighbors...
[t-SNE] Indexed 24 samples in 0.001s...
[t-SNE] Computed neighbors for 24 samples in 0.005s...
[t-SNE] Computed conditional probabilities for sample 24 / 24
[t-SNE] Mean sigma: 0.028853
[t-SNE] KL divergence after 250 iterations with early exaggeration: 43.908348
[t-SNE] KL divergence after 1000 iterations: 0.202091


100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
100%|██████████| 1/1 [00:02<00:00,  2.53s/it]


[t-SNE] Computing 21 nearest neighbors...
[t-SNE] Indexed 22 samples in 0.000s...
[t-SNE] Computed neighbors for 22 samples in 0.006s...
[t-SNE] Computed conditional probabilities for sample 22 / 22
[t-SNE] Mean sigma: 0.008642
[t-SNE] KL divergence after 250 iterations with early exaggeration: 40.165207
[t-SNE] KL divergence after 1000 iterations: 0.457917


100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
100%|██████████| 1/1 [00:02<00:00,  2.53s/it]


[t-SNE] Computing 19 nearest neighbors...
[t-SNE] Indexed 20 samples in 0.000s...
[t-SNE] Computed neighbors for 20 samples in 0.006s...
[t-SNE] Computed conditional probabilities for sample 20 / 20
[t-SNE] Mean sigma: 0.005130
[t-SNE] KL divergence after 250 iterations with early exaggeration: 48.973068
[t-SNE] KL divergence after 1000 iterations: 0.005609


100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
100%|██████████| 1/1 [00:02<00:00,  2.41s/it]


[t-SNE] Computing 24 nearest neighbors...
[t-SNE] Indexed 25 samples in 0.000s...
[t-SNE] Computed neighbors for 25 samples in 0.005s...
[t-SNE] Computed conditional probabilities for sample 25 / 25
[t-SNE] Mean sigma: 0.004651
[t-SNE] KL divergence after 250 iterations with early exaggeration: 45.617977
[t-SNE] KL divergence after 1000 iterations: 0.054337


100%|██████████| 1/1 [00:01<00:00,  1.49s/it]
100%|██████████| 1/1 [00:02<00:00,  2.51s/it]


[t-SNE] Computing 18 nearest neighbors...
[t-SNE] Indexed 19 samples in 0.001s...
[t-SNE] Computed neighbors for 19 samples in 0.006s...
[t-SNE] Computed conditional probabilities for sample 19 / 19
[t-SNE] Mean sigma: 0.007214
[t-SNE] KL divergence after 250 iterations with early exaggeration: 41.889931
[t-SNE] KL divergence after 1000 iterations: -0.058471
