In [None]:
from load_data.utils import process_all_documents
from load_data import load_dataframe, process_text
from plots import plot_counter_lemmas
from plots.topics import interactive_exploration, plot_topics, plot_similarities
from topic_modeling.utils import check_coherence_for_topics_num, tsne_dim_reduction, umap_dim_reduction, _topics_df
from topic_modeling.model_optimizer import ModelOptimizer, save_data_for_app

import pandas as pd
from collections import Counter
import pickle 

In [None]:
# Load the processed NECP table; the first CSV column becomes the index.
necp_processed = pd.read_csv('./necp_reports/necp_processed.csv', index_col=0)

# Drop the start/end page and start/end text columns.
boundary_columns = ['start_page', 'end_page', 'start_text', 'end_text']
necp_processed = necp_processed.drop(columns=boundary_columns)

# Drop every row whose index label belongs to a row with a missing "text" value.
missing_text_labels = necp_processed.index[necp_processed["text"].isnull()]
necp_processed = necp_processed.drop(index=missing_text_labels)

In [None]:
# Country names and nationality adjectives that are treated as stop words.
countries_stop_words = ['Austria', 'Austrian', 'Belgium', 'Belgian', 'Bulgaria', 'Bulgarian', 'Czech', 'Cyprus', 'Cypriot', 'Germany', 'German',
                      'Denmark', 'Danish', 'Estonia', 'Estonian', 'Croatia', 'Croatian', 'Finland', 'Finnish', 'France', 'French', 'Malta', 'Maltese',
                      'Luxembourg', 'Lithuania', 'Lithuanian', 'Latvia', 'Latvian', 'Italy', 'Italian', 'Ireland', 'Irish', 'Hungary', 'Hungarian',
                      'Greece', 'Greek', 'Spain', 'Spanish', 'Netherlands', 'Dutch', 'Poland', 'Polish', 'Portugal', 'Portuguese', 'Romania', 'Romanian',
                      'Sweden', 'Swedish', 'Slovenia', 'Slovenian', 'Slovakia', 'Slovak']

# Additional words treated as stop words. Each word is listed exactly once
# ("plan" was previously present twice).
extra_stop_words = ['energy', 'figure', 'table', 'plan', "necp", 'national', 'use', "measure", "sector", "climate",
                    "dimension", "integrated", "section", "republic", "measures", "policies", "target", "objective", "policy",
                    "projection", "assessment", "federal", "government"]

# Final stop-word list: lower-cased country terms followed by the extra words.
# NOTE(review): presumably process_text compares these against lower-cased
# tokens/lemmas - confirm in load_data.
stop_words = [country_term.lower() for country_term in countries_stop_words] + extra_stop_words

In [4]:
# Sub-sections that keep their own name as the paragraph label; every other row is
# labelled with its "energy_union_dimension" value instead.
standalone_subsections = ["Overview and Process for Establishing the Plan",
                          "Impact Assessment of Planned Policies and Measures"]

# Frame handed to process_text. Only "paragraph", "country" and "text" are filled here;
# "text_path", "tokens" and "lemmas" stay empty.
# NOTE(review): presumably process_text fills "tokens"/"lemmas" - confirm in load_data.
df = pd.DataFrame(columns=["paragraph", "country", "text_path", "text", "tokens", "lemmas"])
df["country"] = necp_processed["country"]
df["text"] = necp_processed["text"]

# Vectorised replacement for the former row-by-row iterrows() scan: keep "subsection"
# where it is one of the standalone sub-sections, otherwise use "energy_union_dimension".
# .tolist() keeps the assignment positional, exactly as the original list assignment was.
df["paragraph"] = necp_processed["subsection"].where(
    necp_processed["subsection"].isin(standalone_subsections),
    necp_processed["energy_union_dimension"],
).tolist()

processed_df = process_text(df, spacy_model="en_core_web_md", stop_words=stop_words)

In [None]:
# How many of the most frequent lemmas to collect for each paragraph type.
# The insertion order of this mapping defines the order of `paragraphs`.
top_n_by_paragraph = {
    "Overview and Process for Establishing the Plan": 1,
    "Impact Assessment of Planned Policies and Measures": 3,
    "Decarbonisation": 5,
    "Energy efficiency": 3,
    "Energy security": 2,
    "Internal market": 4,
    "R&I and Competitiveness": 3,
}
paragraphs = list(top_n_by_paragraph)


def most_common_lemmas(frame, paragraph, top_n):
    """Return the ``top_n`` most frequent lemmas for one paragraph type.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a "paragraph" column and a "lemmas" column whose cells are
        iterables of lemma strings.
    paragraph : str
        Value of the "paragraph" column that selects the rows to count.
    top_n : int
        Number of lemmas to return.

    Returns
    -------
    list of str
        Lemmas ordered from most to least frequent; ties keep first-seen order.
        Empty when no row matches ``paragraph``.
    """
    lemma_lists = frame.loc[frame["paragraph"] == paragraph, "lemmas"]
    # Count lemma by lemma instead of concatenating all lists with Series.sum(),
    # which copies the growing list for every row and yields 0 (not a list)
    # when the selection is empty.
    lemma_counts = Counter(lemma for lemmas in lemma_lists for lemma in lemmas)
    return [word for word, _ in lemma_counts.most_common(top_n)]


# Paragraph type -> its most frequent lemmas.
common_words = {
    paragraph: most_common_lemmas(processed_df, paragraph, top_n)
    for paragraph, top_n in top_n_by_paragraph.items()
}

In [None]:
# Build the optimizer for a single paragraph type: the second entry of `paragraphs`.
selected_paragraph = paragraphs[1]
filter_dict = {'paragraph': selected_paragraph}
model_optimizer = ModelOptimizer(
    processed_df,
    filter_dict,
    common_words[selected_paragraph],  # most frequent lemmas of the selected paragraph
    (4, 6),  # NOTE(review): presumably the range of topic counts to try - confirm in ModelOptimizer
)


In [None]:
# Pass the optimizer built in the previous cell to save_data_for_app.
# NOTE(review): presumably this writes the data the companion app loads - confirm
# what is written and where in topic_modeling.model_optimizer.
save_data_for_app(model_optimizer)