In [None]:
import ast
import glob
import os
import pickle
import time
from collections import Counter

import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models  # submodule used for the LDA visualisation; not loaded by `import pyLDAvis` alone
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from tqdm import tqdm

from load_data import load_dataframe, process_text
from load_data import load_processed_data
from load_data.utils import process_all_documents
from plots import plot_counter_lemmas

In [None]:
# Short section name -> list of heading strings associated with that section.
# Handed to process_all_documents; insertion order is also the order in which
# the pipeline later iterates over the sections.
paragraphs_names = {
    'Summary': ['Summary'],
    'National schemes': [
        'Description of national schemes providing minimum income support',
    ],
    'Links with labour market activation': [
        'Links with labour market activation',
    ],
    'Links to social services': [
        'Links to social services and integrated provision of targeted social services',
    ],
    'Governance mechanisms': ['Governance mechanisms'],
    'Impact of minimum income schemes': [
        'Impact of minimum income schemes',
        'Impact of MI schemes',
    ],
    'Sources': ['Sources'],
}

In [None]:
# Build the documents table from 'social_reports/' using the section map above.
# NOTE(review): the roles of the text-file folder and the trailing 'Sources'
# argument are defined by process_all_documents — confirm there.
df = process_all_documents('social_reports/', paragraphs_names, 'social_reports/txt_files/', 'Sources')
# Persist the table, then reload it through the project loader.
df.to_csv('documents.csv')
dft = load_dataframe('documents.csv')
# NOTE(review): presumably adds the "lemmas" column read by the cells below — confirm.
processed_df = process_text(dft)

# Paragraph name -> list of its most frequent lemmas; filled by the
# "Common words" cells and read by do_pipeline.
common_words = {}

In [None]:
# NOTE(review): `data_path` and `STOPWORDS` are not defined in any cell visible
# above, so this line raises NameError on a fresh kernel, and `df_p` is not read
# by any cell visible below. Define both names in a config cell or drop this
# cell — confirm intent.
df_p = load_processed_data(data_path, stop_words=STOPWORDS)

## Common words

### Summary

In [None]:
# Section under inspection; `par_name` and `filter_dict` are reused by the next cell.
par_name = 'Summary'
filter_dict = dict(paragraph=par_name)
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# How many of the most frequent lemmas to record for this section.
number_words_filtered = 5

# Keep the rows whose columns match every key/value pair in filter_dict
# (set in the previous cell), then take their "lemmas" entries.
row_mask = (processed_df[list(filter_dict)] == pd.Series(filter_dict)).all(axis=1)
filtered_lemmas = processed_df.loc[row_mask, "lemmas"]

# Count row by row instead of Counter(filtered_lemmas.sum()): summing the Series
# concatenates the lists pairwise (quadratic) and yields 0 when no row matches,
# which Counter cannot iterate.
lemma_counts = Counter()
for doc_lemmas in filtered_lemmas:
    lemma_counts.update(doc_lemmas)

# (lemma, count) pairs, most frequent first; only the lemmas are stored.
# common_words[par_name] is later passed to the coherence search in do_pipeline.
counter = lemma_counts.most_common(number_words_filtered)
common_words[par_name] = [word for word, _ in counter]

### National schemes

In [None]:
# Section under inspection; `par_name` and `filter_dict` are reused by the next cell.
par_name = 'National schemes'
filter_dict = dict(paragraph=par_name)
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# How many of the most frequent lemmas to record for this section.
number_words_filtered = 5

# Keep the rows whose columns match every key/value pair in filter_dict
# (set in the previous cell), then take their "lemmas" entries.
row_mask = (processed_df[list(filter_dict)] == pd.Series(filter_dict)).all(axis=1)
filtered_lemmas = processed_df.loc[row_mask, "lemmas"]

# Count row by row instead of Counter(filtered_lemmas.sum()): summing the Series
# concatenates the lists pairwise (quadratic) and yields 0 when no row matches,
# which Counter cannot iterate.
lemma_counts = Counter()
for doc_lemmas in filtered_lemmas:
    lemma_counts.update(doc_lemmas)

# (lemma, count) pairs, most frequent first; only the lemmas are stored.
# common_words[par_name] is later passed to the coherence search in do_pipeline.
counter = lemma_counts.most_common(number_words_filtered)
common_words[par_name] = [word for word, _ in counter]

### Links with labour market activation

In [None]:
# Section under inspection; `par_name` and `filter_dict` are reused by the next cell.
par_name = 'Links with labour market activation'
filter_dict = dict(paragraph=par_name)
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# How many of the most frequent lemmas to record for this section.
number_words_filtered = 10

# Keep the rows whose columns match every key/value pair in filter_dict
# (set in the previous cell), then take their "lemmas" entries.
row_mask = (processed_df[list(filter_dict)] == pd.Series(filter_dict)).all(axis=1)
filtered_lemmas = processed_df.loc[row_mask, "lemmas"]

# Count row by row instead of Counter(filtered_lemmas.sum()): summing the Series
# concatenates the lists pairwise (quadratic) and yields 0 when no row matches,
# which Counter cannot iterate.
lemma_counts = Counter()
for doc_lemmas in filtered_lemmas:
    lemma_counts.update(doc_lemmas)

# (lemma, count) pairs, most frequent first; only the lemmas are stored.
# common_words[par_name] is later passed to the coherence search in do_pipeline.
counter = lemma_counts.most_common(number_words_filtered)
common_words[par_name] = [word for word, _ in counter]

### Links to social services

In [None]:
# Section under inspection; `par_name` and `filter_dict` are reused by the next cell.
par_name = 'Links to social services'
filter_dict = dict(paragraph=par_name)
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# How many of the most frequent lemmas to record for this section.
number_words_filtered = 7

# Keep the rows whose columns match every key/value pair in filter_dict
# (set in the previous cell), then take their "lemmas" entries.
row_mask = (processed_df[list(filter_dict)] == pd.Series(filter_dict)).all(axis=1)
filtered_lemmas = processed_df.loc[row_mask, "lemmas"]

# Count row by row instead of Counter(filtered_lemmas.sum()): summing the Series
# concatenates the lists pairwise (quadratic) and yields 0 when no row matches,
# which Counter cannot iterate.
lemma_counts = Counter()
for doc_lemmas in filtered_lemmas:
    lemma_counts.update(doc_lemmas)

# (lemma, count) pairs, most frequent first; only the lemmas are stored.
# common_words[par_name] is later passed to the coherence search in do_pipeline.
counter = lemma_counts.most_common(number_words_filtered)
common_words[par_name] = [word for word, _ in counter]

### Governance mechanisms

In [None]:
# Section under inspection; `par_name` and `filter_dict` are reused by the next cell.
par_name = 'Governance mechanisms'
filter_dict = dict(paragraph=par_name)
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# How many of the most frequent lemmas to record for this section.
number_words_filtered = 6

# Keep the rows whose columns match every key/value pair in filter_dict
# (set in the previous cell), then take their "lemmas" entries.
row_mask = (processed_df[list(filter_dict)] == pd.Series(filter_dict)).all(axis=1)
filtered_lemmas = processed_df.loc[row_mask, "lemmas"]

# Count row by row instead of Counter(filtered_lemmas.sum()): summing the Series
# concatenates the lists pairwise (quadratic) and yields 0 when no row matches,
# which Counter cannot iterate.
lemma_counts = Counter()
for doc_lemmas in filtered_lemmas:
    lemma_counts.update(doc_lemmas)

# (lemma, count) pairs, most frequent first; only the lemmas are stored.
# common_words[par_name] is later passed to the coherence search in do_pipeline.
counter = lemma_counts.most_common(number_words_filtered)
common_words[par_name] = [word for word, _ in counter]

### Impact of minimum income schemes

In [None]:
# Section under inspection; `par_name` and `filter_dict` are reused by the next cell.
par_name = 'Impact of minimum income schemes'
filter_dict = dict(paragraph=par_name)
plot_counter_lemmas(processed_df, filter_dict)

In [None]:
# How many of the most frequent lemmas to record for this section.
number_words_filtered = 6

# Keep the rows whose columns match every key/value pair in filter_dict
# (set in the previous cell), then take their "lemmas" entries.
row_mask = (processed_df[list(filter_dict)] == pd.Series(filter_dict)).all(axis=1)
filtered_lemmas = processed_df.loc[row_mask, "lemmas"]

# Count row by row instead of Counter(filtered_lemmas.sum()): summing the Series
# concatenates the lists pairwise (quadratic) and yields 0 when no row matches,
# which Counter cannot iterate.
lemma_counts = Counter()
for doc_lemmas in filtered_lemmas:
    lemma_counts.update(doc_lemmas)

# (lemma, count) pairs, most frequent first; only the lemmas are stored.
# common_words[par_name] is later passed to the coherence search in do_pipeline.
counter = lemma_counts.most_common(number_words_filtered)
common_words[par_name] = [word for word, _ in counter]

## Pipeline

In [None]:
# Pair of bounds for the candidate topic counts, passed to the coherence search
# and the model selection helpers.
# NOTE(review): whether the upper bound is inclusive is decided by those helpers — confirm.
topic_numbers_range = (3, 10)
# Forwarded unchanged as `alpha` to the coherence search and to find_best_model;
# presumably the LDA document-topic prior — confirm.
alpha = 100
# Results folder; the same path is searched for "*probs.csv" files further down.
results_folder = "./social_results/lda/"

In [None]:
def do_pipeline(par, alpha):
    """Select, fit and persist an LDA model for one paragraph type.

    Runs the coherence search over ``topic_numbers_range`` for the rows of
    ``processed_df`` whose ``paragraph`` column equals ``par``, picks the topic
    count via ``find_best_topics_num``, fits the model via ``find_best_model``
    and writes every artefact into ``results_folder`` (created if missing),
    which is where the later cells look for the ``*probs.csv`` files.

    Files written, all prefixed with ``<alpha>_<par with spaces as underscores>``:
    ``_encoded_docs.csv``, ``_dictionary.dict``, ``_lda_model.model``,
    ``_probs.csv``, ``_mapping.csv`` and ``_topic_words.csv``.

    Args:
        par: Paragraph name; must be a key of ``common_words``.
        alpha: Value forwarded as ``alpha`` to the coherence search and to
            ``find_best_model``; also part of every output file name.

    Returns:
        None. Results are only written to disk; progress is printed.

    Raises:
        KeyError: If ``par`` has no entry in ``common_words``.

    NOTE(review): the helper functions called here
    (``check_coherence_for_topics_num``, ``find_best_topics_num``,
    ``find_best_model``, ``_topics_df``, ``get_topic_probs``,
    ``topic_probs_by_column_binded``, ``tsne_dim_reduction``,
    ``umap_dim_reduction``) are not imported in the cells visible here —
    confirm where they come from.
    """
    print(f"Pipeline for {par} with alpha={alpha} started")
    filter_dict = {'paragraph': par}
    # The second element of the result is not used by this function.
    (filtered_lemmas, _models, encoded_docs, lemmas_dictionary, cvs) = check_coherence_for_topics_num(
        processed_df,
        filter_dict,
        common_words[par],
        topic_numbers_range,
        alpha
    )
    num_topics = find_best_topics_num(cvs, topic_numbers_range)
    print(f"Best number of topics found: {num_topics}")
    lda_model = find_best_model(encoded_docs, lemmas_dictionary, cvs, topic_numbers_range, random_state=42, alpha=alpha)

    # Write next to where the downstream cells read, not into the working directory.
    os.makedirs(results_folder, exist_ok=True)
    out_prefix = os.path.join(results_folder, str(alpha) + "_" + par.replace(" ", "_"))

    encoded_docs.to_csv(out_prefix + "_encoded_docs.csv")
    lemmas_dictionary.save(out_prefix + "_dictionary.dict")
    lda_model.save(out_prefix + "_lda_model.model")
    print("Best model found and saved")

    topic_words = _topics_df(lda_model, filtered_lemmas, 30)
    # The second return value is not used by this function.
    modeling_results, _topic_probs = get_topic_probs(processed_df, filter_dict, lda_model, num_topics, encoded_docs)
    topics_by_country = topic_probs_by_column_binded(modeling_results, num_topics, column='country')
    topics_by_country.to_csv(out_prefix + "_probs.csv")

    # Two low-dimensional mappings of the per-country table, joined into one frame.
    tsne_mapping = tsne_dim_reduction(topics_by_country, num_topics * 3, perplexity=10)
    umap_mapping = umap_dim_reduction(topics_by_country, num_topics * 3, random_state=42)
    mappings = tsne_mapping.join(umap_mapping)
    mappings.to_csv(out_prefix + "_mapping.csv")
    topic_words.to_csv(out_prefix + "_topic_words.csv")

In [None]:
# Run the pipeline once per section. "Sources" is skipped: no common-word list
# is built for it in the cells above.
for key in paragraphs_names:
    if key != "Sources":
        do_pipeline(key, alpha)

In [None]:
# Take the API key from the environment so it never lands in the notebook.
# NOTE(review): `openai` is not imported in the import cell visible here — confirm
# where it is brought into scope.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Model name and sampling temperature handed to _generate_title below.
gpt3_model = "text-davinci-002"
temperature = 0.6

In [None]:
# All "*probs.csv" tables in the results folder. Sorted because glob.glob
# returns matches in arbitrary order, which would make the processing order
# differ between runs and machines.
result_files_list = sorted(glob.glob(os.path.join(results_folder, "*probs.csv")))

In [None]:
# Rename the numeric topic columns of every probabilities table to generated
# titles and write the table back in place.
# NOTE(review): `_generate_prompt` and `_generate_title` are not defined or
# imported in the cells visible here — confirm where they come from.
for result_file in tqdm(result_files_list):
    topic_df = pd.read_csv(result_file)
    topic_keywords = pd.read_csv(result_file.replace("probs", "topic_words"))
    colnames = topic_df.columns.to_list()
    # Topic columns are everything between the first column and the last four.
    # NOTE(review): this layout is assumed by the slice — confirm against the
    # frame produced by topic_probs_by_column_binded.
    topic_colnames = colnames[1:-4]

    try:
        topic_ids = [int(colname) for colname in topic_colnames]
    except ValueError:
        # Headers are not numeric ids, most likely because an earlier run of
        # this cell already replaced them with titles. Skipping makes the cell
        # re-runnable and avoids issuing title requests before failing.
        print(f"Skipping {result_file}: topic columns are not numeric ids")
        continue

    for i, topic_id in enumerate(topic_ids):
        time.sleep(1)  # pause before each title request; presumably rate limiting
        # At most 25 keyword rows of this topic, in file order.
        topic_rows = topic_keywords[topic_keywords["topic_id"] == topic_id].head(25)
        keywords = topic_rows.word.to_list()
        weights = topic_rows.importance.to_list()
        prompt = _generate_prompt(keywords, weights)
        title = _generate_title(prompt, gpt3_model, temperature)
        topic_colnames[i] = title

    colnames[1:-4] = topic_colnames
    topic_df.columns = colnames
    topic_df.to_csv(result_file, index=False)

In [None]:
# Build a pyLDAvis page for every saved model and store its HTML next to the
# other result files (as "<prefix>_vis.txt").
for result_file in tqdm(result_files_list):
    lda_model = LdaModel.load(result_file.replace("_probs.csv", "_lda_model.model"))

    # The saved index comes back as the "Unnamed: 0" column; restore it as an
    # unnamed index.
    encoded_docs_table = pd.read_csv(result_file.replace("_probs.csv", "_encoded_docs.csv")).set_index("Unnamed: 0")
    encoded_docs_table.index.name = None
    # Each "lemmas" cell holds the text of a Python literal; parse it back into
    # an object. literal_eval only accepts literals, so no code is executed.
    encoded_docs = encoded_docs_table.lemmas.apply(ast.literal_eval)

    lemmas_dictionary = Dictionary.load(result_file.replace("_probs.csv", "_dictionary.dict"))

    # pyLDAvis.gensim_models is a submodule that `import pyLDAvis` alone does not load.
    vis = pyLDAvis.gensim_models.prepare(lda_model, encoded_docs, lemmas_dictionary)
    vis_html_string = pyLDAvis.prepared_data_to_html(vis)

    # Explicit UTF-8: the default text encoding is platform dependent and can
    # fail on non-ASCII vocabulary terms.
    with open(result_file.replace("_probs.csv", "_vis.txt"), "w", encoding="utf-8") as text_file:
        text_file.write(vis_html_string)