In [1]:
from load_data.utils import process_all_documents
from load_data import load_dataframe, process_text
from plots import plot_counter_lemmas
from plots.topics import interactive_exploration, plot_topics, plot_similarities
from topic_modeling.utils import check_coherence_for_topics_num, tsne_dim_reduction, umap_dim_reduction, _topics_df
from topic_modeling.lda_model import find_best_model, find_best_topics_num
from topic_modeling.topic_probs import (
    get_similarities,
    get_topic_probs,
    calculate_distance_matrix,
    calculate_linkage_matrix,
    topic_probs_by_column_binded,
)

import pandas as pd
from collections import Counter
import pickle 

ImportError: cannot import name 'ArrayObject' from 'PyPDF2.generic' (unknown location)

In [None]:
# Load the pre-processed NECP report sections, discard the page/text boundary
# columns and keep only rows that actually have text.
# Built as one chain (no inplace mutation) so re-running the cell always starts
# from the CSV. dropna(subset=...) removes exactly the null-text rows, whereas
# dropping by index label would also remove non-null rows sharing a label if the
# index were not unique.
necp_processed = (
    pd.read_csv('./necp_reports/necp_processed.csv', index_col=0)
    .drop(columns=['start_page', 'end_page', 'start_text', 'end_text'])
    .dropna(subset=["text"])
)

In [None]:
# Assemble the frame expected by process_text. Columns not assigned here
# ("text_path", "tokens", "lemmas") start out empty.
df = pd.DataFrame(columns=["paragraph", "country", "text_path", "text", "tokens", "lemmas"])
df["country"] = necp_processed["country"]
df["text"] = necp_processed["text"]

# The two stand-alone subsections keep their own name as the paragraph label;
# every other row is labelled with its energy-union dimension.
standalone_subsections = [
    "Overview and Process for Establishing the Plan",
    "Impact Assessment of Planned Policies and Measures",
]
is_standalone = necp_processed["subsection"].isin(standalone_subsections)
df["paragraph"] = necp_processed["subsection"].where(
    is_standalone, necp_processed["energy_union_dimension"]
)

processed_df = process_text(df, spacy_model="en_core_web_md")

In [None]:
# Section labels used as values of processed_df["paragraph"]. The first two are
# the stand-alone subsection names; the remaining five are presumably the
# energy_union_dimension values found in the data — verify against the CSV.
paragraphs = ["Overview and Process for Establishing the Plan",
              "Impact Assessment of Planned Policies and Measures",
              "Decarbonisation",
              "Energy efficiency",
              "Energy security",
              "Internal market",
              "R&I and Competitiveness"]
# Maps each section label to its list of most frequent lemmas. Filled in by the
# per-section cells below and later passed to check_coherence_for_topics_num
# (presumably as words to exclude from modelling — confirm in that helper).
common_words = {}

### Paragraph: Overview and Process for Establishing the Plan

In [None]:
# Lemma-count plot restricted to the "Overview and Process for Establishing the Plan" section.
filter_dict = dict(paragraph=paragraphs[0])
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# Lemma lists of all documents in this section. The section is taken from
# `paragraphs` directly so the filter and the key written below cannot disagree
# if cells are run out of order.
filtered_lemmas = processed_df.loc[processed_df["paragraph"] == paragraphs[0], "lemmas"]
# Stream over the per-document lists (assumed to be iterables of lemma strings)
# instead of Series.sum(), which concatenates lists pairwise (quadratic) and
# returns 0 for an empty selection, making Counter raise.
counter = Counter(lemma for doc_lemmas in filtered_lemmas for lemma in doc_lemmas).most_common(3)
# Keep the 3 most frequent lemmas for this section.
common_words[paragraphs[0]] = [word for word, _ in counter]

### Paragraph: Impact Assessment of Planned Policies and Measures

In [None]:
# Lemma-count plot restricted to the "Impact Assessment of Planned Policies and Measures" section.
filter_dict = dict(paragraph=paragraphs[1])
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# Lemma lists of all documents in this section. The section is taken from
# `paragraphs` directly so the filter and the key written below cannot disagree
# if cells are run out of order.
filtered_lemmas = processed_df.loc[processed_df["paragraph"] == paragraphs[1], "lemmas"]
# Stream over the per-document lists (assumed to be iterables of lemma strings)
# instead of Series.sum(), which concatenates lists pairwise (quadratic) and
# returns 0 for an empty selection, making Counter raise.
counter = Counter(lemma for doc_lemmas in filtered_lemmas for lemma in doc_lemmas).most_common(4)
# Keep the 4 most frequent lemmas for this section.
common_words[paragraphs[1]] = [word for word, _ in counter]

### Paragraph: Decarbonisation

In [None]:
# Lemma-count plot restricted to the "Decarbonisation" section.
filter_dict = dict(paragraph=paragraphs[2])
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# Lemma lists of all documents in this section. The section is taken from
# `paragraphs` directly so the filter and the key written below cannot disagree
# if cells are run out of order.
filtered_lemmas = processed_df.loc[processed_df["paragraph"] == paragraphs[2], "lemmas"]
# Stream over the per-document lists (assumed to be iterables of lemma strings)
# instead of Series.sum(), which concatenates lists pairwise (quadratic) and
# returns 0 for an empty selection, making Counter raise.
counter = Counter(lemma for doc_lemmas in filtered_lemmas for lemma in doc_lemmas).most_common(5)
# Keep the 5 most frequent lemmas for this section.
common_words[paragraphs[2]] = [word for word, _ in counter]

### Paragraph: Energy efficiency

In [None]:
# Lemma-count plot restricted to the "Energy efficiency" section.
filter_dict = dict(paragraph=paragraphs[3])
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# Lemma lists of all documents in this section. The section is taken from
# `paragraphs` directly so the filter and the key written below cannot disagree
# if cells are run out of order.
filtered_lemmas = processed_df.loc[processed_df["paragraph"] == paragraphs[3], "lemmas"]
# Stream over the per-document lists (assumed to be iterables of lemma strings)
# instead of Series.sum(), which concatenates lists pairwise (quadratic) and
# returns 0 for an empty selection, making Counter raise.
counter = Counter(lemma for doc_lemmas in filtered_lemmas for lemma in doc_lemmas).most_common(7)
# Keep the 7 most frequent lemmas for this section.
common_words[paragraphs[3]] = [word for word, _ in counter]

### Paragraph: Energy security

In [None]:
# Lemma-count plot restricted to the "Energy security" section.
filter_dict = dict(paragraph=paragraphs[4])
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# Lemma lists of all documents in this section. The section is taken from
# `paragraphs` directly so the filter and the key written below cannot disagree
# if cells are run out of order.
filtered_lemmas = processed_df.loc[processed_df["paragraph"] == paragraphs[4], "lemmas"]
# Stream over the per-document lists (assumed to be iterables of lemma strings)
# instead of Series.sum(), which concatenates lists pairwise (quadratic) and
# returns 0 for an empty selection, making Counter raise.
counter = Counter(lemma for doc_lemmas in filtered_lemmas for lemma in doc_lemmas).most_common(7)
# Keep the 7 most frequent lemmas for this section.
common_words[paragraphs[4]] = [word for word, _ in counter]

### Paragraph: Internal market

In [None]:
# Lemma-count plot restricted to the "Internal market" section.
filter_dict = dict(paragraph=paragraphs[5])
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# Lemma lists of all documents in this section. The section is taken from
# `paragraphs` directly so the filter and the key written below cannot disagree
# if cells are run out of order.
filtered_lemmas = processed_df.loc[processed_df["paragraph"] == paragraphs[5], "lemmas"]
# Stream over the per-document lists (assumed to be iterables of lemma strings)
# instead of Series.sum(), which concatenates lists pairwise (quadratic) and
# returns 0 for an empty selection, making Counter raise.
counter = Counter(lemma for doc_lemmas in filtered_lemmas for lemma in doc_lemmas).most_common(5)
# Keep the 5 most frequent lemmas for this section.
common_words[paragraphs[5]] = [word for word, _ in counter]

### Paragraph: R&I and Competitiveness

In [None]:
# Lemma-count plot restricted to the "R&I and Competitiveness" section.
filter_dict = dict(paragraph=paragraphs[6])
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# Lemma lists of all documents in this section. The section is taken from
# `paragraphs` directly so the filter and the key written below cannot disagree
# if cells are run out of order.
filtered_lemmas = processed_df.loc[processed_df["paragraph"] == paragraphs[6], "lemmas"]
# Stream over the per-document lists (assumed to be iterables of lemma strings)
# instead of Series.sum(), which concatenates lists pairwise (quadratic) and
# returns 0 for an empty selection, making Counter raise.
counter = Counter(lemma for doc_lemmas in filtered_lemmas for lemma in doc_lemmas).most_common(5)
# Keep the 5 most frequent lemmas for this section.
common_words[paragraphs[6]] = [word for word, _ in counter]

In [None]:
# Range of candidate topic counts handed to the coherence search and to the
# best-topic-count / best-model selection inside do_pipeline.
# NOTE(review): whether the upper bound is inclusive is decided by those helpers — confirm.
topic_numbers_range = (3, 10)

In [None]:
def do_pipeline(par, alpha):
    """Fit, persist and post-process the LDA topic model for one section and one alpha.

    Reads the notebook-level ``processed_df``, ``common_words`` and
    ``topic_numbers_range``. Writes five artefacts to the current working
    directory, each named ``<alpha>_<par with spaces as underscores>_<suffix>``:
    encoded docs (CSV), dictionary, LDA model, per-country topic probabilities
    with 2-D embeddings (CSV) and topic words (CSV).

    Args:
        par: Section label; used as the 'paragraph' filter value and as the
            key into ``common_words``.
        alpha: Forwarded as ``alpha`` to the coherence search and to model selection.

    Returns:
        None. Results are only written to disk.
    """
    print(f"Pipeline for {par} with alpha={alpha} started")
    section_filter = {'paragraph': par}

    # Coherence search over the candidate topic counts; the fitted candidate
    # models returned in second position are not used here.
    coherence_outputs = check_coherence_for_topics_num(
        processed_df,
        section_filter,
        common_words[par],
        topic_numbers_range,
        alpha,
    )
    filtered_lemmas, _candidate_models, encoded_docs, lemmas_dictionary, cvs = coherence_outputs

    num_topics = find_best_topics_num(cvs, topic_numbers_range)
    print(f"Best number of topics found: {num_topics}")
    lda_model = find_best_model(
        encoded_docs, lemmas_dictionary, cvs, topic_numbers_range, random_state=42, alpha=alpha
    )

    # Every artefact of this run shares the same "<alpha>_<section>" prefix.
    file_prefix = str(alpha) + "_" + par.replace(" ", "_")
    encoded_docs.to_csv(file_prefix + "_encoded_docs.csv")
    lemmas_dictionary.save(file_prefix + "_dictionary.dict")
    lda_model.save(file_prefix + "_lda_model.model")
    print("Best model found and saved")

    topic_words = _topics_df(lda_model, filtered_lemmas, 30)
    modeling_results, topic_probs = get_topic_probs(
        processed_df, section_filter, lda_model, num_topics, encoded_docs
    )
    topics_by_country = topic_probs_by_column_binded(modeling_results, num_topics, column='country')

    # NOTE(review): the width num_topics * 3 presumably reflects three
    # subsections per section; confirm it is also right for the sections that
    # have a single subsection.
    n_feature_columns = num_topics * 3
    result = tsne_dim_reduction(topics_by_country, n_feature_columns, perplexity=10)
    umap_input = result.iloc[:, :(n_feature_columns + 1)]
    umap_coords = umap_dim_reduction(umap_input, n_feature_columns, random_state=42)
    result[["u1", "u2"]] = umap_coords[["c1", "c2"]]

    result.to_csv(file_prefix + "_probs.csv")
    topic_words.to_csv(file_prefix + "_topic_words.csv")

In [None]:
# Run the pipeline for every (alpha, section) pair: alpha is the outer loop,
# sections are visited in the order of `paragraphs`.
for alpha in (50, 100, 150, 200, 250):
    for par in paragraphs:
        do_pipeline(par, alpha)

In [None]:
from topic_modeling.topic_names import _generate_prompt, _generate_title
import openai
import os
# Take the OpenAI key from the environment so no secret is stored in the notebook.
# os.getenv returns None when OPENAI_API_KEY is unset; nothing here checks for that.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Model name and sampling temperature passed to _generate_title in the titling loop.
# NOTE(review): "text-davinci-002" may no longer be served by OpenAI — confirm before re-running.
gpt3_model = "text-davinci-002"
temperature = 0.6

In [None]:
import glob 
import numpy as np
from tqdm import tqdm
# glob.glob returns matches in arbitrary, filesystem-dependent order; sort them so
# any positional use of this list is reproducible across runs and machines.
result_files_list = sorted(glob.glob("../climate_results/*probs.csv"))

In [None]:
import time

In [None]:
# Replace the numeric topic column names of every *probs.csv with generated titles.
# Each file is rewritten in place, so a file whose leading topic columns no longer
# parse as integers has already been titled and is skipped. This makes the cell
# safe to re-run after an interruption, instead of relying on a hand-tuned slice
# of the glob result (whose order is not guaranteed).
MAX_KEYWORDS = 20  # at most this many keyword rows per topic go into the prompt
SUBSECTION_PREFIXES = ["NO&T ", "P&M ", "CS&RP "]

for result_file in tqdm(result_files_list):
    topic_df = pd.read_csv(result_file)
    topic_keywords = pd.read_csv(result_file.replace("probs", "topic_words"))
    colnames = topic_df.columns.to_list()
    # Topic columns are everything except the first column and the last four.
    topic_colnames = colnames[1:-4]

    # "Overview" / "Impact" files hold one block of topic columns; all other
    # files hold the same topics three times, once per subsection prefix.
    one_section_flag = "Overview" in result_file or "Impact" in result_file
    n_topics = len(topic_colnames) if one_section_flag else len(topic_colnames) // 3

    try:
        topic_ids = [int(colname) for colname in topic_colnames[:n_topics]]
    except ValueError:
        # Column names are already titles: this file was processed earlier.
        continue

    titles = []
    for topic_id in topic_ids:
        time.sleep(1)  # pause between title requests
        # Filter once and cap the rows; head() already copes with fewer rows.
        topic_rows = topic_keywords[topic_keywords["topic_id"] == topic_id].head(MAX_KEYWORDS)
        keywords = topic_rows["word"].to_list()
        weights = topic_rows["importance"].to_list()
        prompt = _generate_prompt(keywords, weights)
        titles.append(_generate_title(prompt, gpt3_model, temperature))

    if one_section_flag:
        topic_colnames = titles
    else:
        # Same titles repeated for each subsection, prefixed with its abbreviation.
        topic_colnames = [prefix + title for prefix in SUBSECTION_PREFIXES for title in titles]

    colnames[1:-4] = topic_colnames
    topic_df.columns = colnames
    topic_df.to_csv(result_file, index=False)

In [None]:
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
import ast
import pyLDAvis
# The gensim adapter is a submodule that `import pyLDAvis` does not load by itself.
import pyLDAvis.gensim_models

# For every results file, reload the model artefacts saved next to it and export
# the pyLDAvis visualisation as an HTML string.
for result_file in tqdm(result_files_list):
    # Artefact names must match what do_pipeline writes, i.e. the suffixes
    # "_lda_model.model" and "_dictionary.dict" including the leading underscore.
    lda_model = LdaModel.load(result_file.replace("_probs.csv", "_lda_model.model"))
    lemmas_dictionary = Dictionary.load(result_file.replace("_probs.csv", "_dictionary.dict"))

    # The encoded documents were stored as text in the "lemmas" column; parse
    # each cell back into a Python object.
    encoded_docs = pd.read_csv(
        result_file.replace("_probs.csv", "_encoded_docs.csv"), index_col=0
    )["lemmas"].apply(ast.literal_eval)

    vis = pyLDAvis.gensim_models.prepare(lda_model, encoded_docs, lemmas_dictionary)
    vis_html_string = pyLDAvis.prepared_data_to_html(vis)
    # Explicit encoding so the HTML is written the same way on every platform.
    with open(result_file.replace("_probs.csv", "_vis.txt"), "w", encoding="utf-8") as text_file:
        text_file.write(vis_html_string)