# Generate dictionary for self-refine method 

## Overview
1. Imports
    - Import libraries
    - Import NEWTS dataset
    - Import LDA model
    
2. Create dictionary

# 0. Hyperparameters

In [1]:
# Number of top-k words used to describe each topic
# (passed to precompute_top_k_words_for_all_topics further down)
top_k = 20

# Number of articles to include in the dictionary (1 - 2400);
# generate_summary_dicts emits two entries per article, one for each summary
article_num = 50

# 1. Imports
## Import libraries

In [2]:
import torch
import gensim
from gensim import corpora

## Import the NEWTS dataset

In [3]:
from NEWTS import read

# Load the NEWTS train and test splits through the project's `read` module.
# NOTE(review): the rest of this notebook treats newts_train as a pandas DataFrame
# (it calls .iterrows() and .iloc on it) — confirm against NEWTS.read.
# newts_test is not used anywhere in the visible part of this notebook.
newts_train = read.read_train()
newts_test = read.read_test()

In [26]:
from collections import defaultdict
import pandas as pd


def count_and_store_non_ascii_characters(newts_df):
    """
    Tally every non-ASCII character found in the text columns of a NEWTS DataFrame.

    The columns "article", "summary1" and "summary2" are scanned row by row;
    cells that do not hold a string are ignored.

    :param newts_df: DataFrame providing the three text columns listed above.
    :return: A defaultdict(int) mapping each non-ASCII character to its number
             of occurrences, in order of first appearance.
    """
    tally = defaultdict(int)

    for _, record in newts_df.iterrows():
        for field in ("article", "summary1", "summary2"):
            cell = record[field]
            # Skip anything that is not text (e.g. missing values)
            if not isinstance(cell, str):
                continue
            for symbol in cell:
                # Code points from 128 upwards lie outside the ASCII range
                if ord(symbol) >= 128:
                    tally[symbol] += 1

    return tally

In [27]:
# Use the function to count non-ASCII characters in the DataFrame
# (the training split's "article", "summary1" and "summary2" columns are inspected)
non_ascii_characters_counts = count_and_store_non_ascii_characters(newts_train)

# Print the non-ASCII characters and their counts;
# the code point is shown as uppercase hex, zero-padded to at least four digits
for character, count in non_ascii_characters_counts.items():
    print(
        f"Character '{character}' (Unicode: U+{ord(character):04X}) appears {count} times."
    )

Character '£' (Unicode: U+00A3) appears 2605 times.
Character '‚' (Unicode: U+201A) appears 4387 times.
Character 'ö' (Unicode: U+00F6) appears 3988 times.
Character 'Ñ' (Unicode: U+00D1) appears 3564 times.
Character '∫' (Unicode: U+222B) appears 219 times.
Character 'π' (Unicode: U+03C0) appears 215 times.
Character '≤' (Unicode: U+2264) appears 2818 times.
Character '®' (Unicode: U+00AE) appears 101 times.
Character '–' (Unicode: U+2013) appears 1019 times.
Character 'à' (Unicode: U+00E0) appears 404 times.
Character 'Æ' (Unicode: U+00C6) appears 194 times.
Character '¨' (Unicode: U+00A8) appears 371 times.
Character 'á' (Unicode: U+00E1) appears 109 times.
Character '∏' (Unicode: U+220F) appears 13 times.
Character 'â' (Unicode: U+00E2) appears 44 times.
Character '±' (Unicode: U+00B1) appears 69 times.
Character '¢' (Unicode: U+00A2) appears 108 times.
Character 'î' (Unicode: U+00EE) appears 2 times.
Character '∞' (Unicode: U+221E) appears 13 times.
Character '´' (Unicode: U+00B4)

## Import LDA model

In [4]:
def load_lda_model(model_address: str):
    """
    Load a saved LDA model together with its dictionary.

    The file names "lda.model" and "dictionary.dic" are appended directly to
    ``model_address``, so a directory must be given with its trailing separator.

    :param model_address: Path prefix under which both files are stored.
    :return: Tuple ``(lda, dictionary)``; ``(None, None)`` if anything goes
             wrong while loading (the error is printed, not raised).
    """
    try:
        model = gensim.models.ldamodel.LdaModel.load(
            model_address + "lda.model", mmap="r"
        )
        vocabulary = corpora.Dictionary.load(
            model_address + "dictionary.dic", mmap="r"
        )
    except Exception as err:
        # Best-effort loader: report the problem and hand back placeholders
        print(f"Error loading model or dictionary: {err}")
        return None, None
    else:
        return model, vocabulary

In [5]:
# Path prefix of the saved LDA files; the trailing slash is required because
# load_lda_model appends the file names to this string without a separator.
model_address = "LDA_250/"
# Both values are None if loading failed (load_lda_model prints the error instead of raising).
lda, dictionary = load_lda_model(model_address)
# Warning "WARNING:root:random_state not set so using default value" is inconsequential for inference



# 2. Utility functions

In [6]:
def get_top_topic_words(lda, topic_id, num_words):
    """
    Look up the highest-ranked words of a single topic in the LDA model.

    :param lda: The LDA model; its ``show_topic`` method is queried.
    :param topic_id: Identifier of the topic to inspect.
    :param num_words: How many words to request for the topic.
    :return: A list with the topic's words only (probabilities dropped), or an
             empty list if the lookup fails for any reason.
    """
    try:
        ranked_pairs = lda.show_topic(topic_id, num_words)

        # Keep the word of every (word, probability) pair
        words_only = []
        for term, _weight in ranked_pairs:
            words_only.append(term)
    except Exception as err:
        print(f"Error in getting top topic words: {err}")
        return []
    return words_only

In [7]:
def precompute_top_k_words_for_all_topics(lda, top_k=top_k):
    """
    Build a lookup table with the top-k words of every topic in the LDA model.

    :param lda: The LDA model; topics ``0 .. lda.num_topics - 1`` are visited.
    :param top_k: How many words to collect per topic. The default is the value
                  the module-level ``top_k`` had when this function was defined.
    :return: A dictionary mapping each topic ID to its list of top-k words.
    """
    return {
        topic_id: get_top_topic_words(lda, topic_id, num_words=top_k)
        for topic_id in range(lda.num_topics)
    }

In [8]:
# Precompute top-k words for all topics.
# The result is a dict keyed by topic id, so the per-article lookups below
# do not have to query the LDA model again.
top_k_words = precompute_top_k_words_for_all_topics(lda, top_k=top_k)

In [9]:
def generate_summary_dicts(newts_train, lda, dictionary, top_k_words, article_num):
    """
    Pair every article with each of its two summaries and the matching topic words.

    :param newts_train: The NEWTS training dataset; needs the columns "article",
                        "summary1", "summary2", "tid1" and "tid2".
    :param lda: The LDA model (not used in this function).
    :param dictionary: The dictionary of the LDA model (not used in this function).
    :param top_k_words: A dictionary mapping topic IDs to their precomputed top-k words.
    :param article_num: Number of leading articles of the dataset to process.
    :return: A list with two dictionaries per article, each holding the keys
             "document", "summary" and "words".
    """
    entries = []

    # Only the first article_num rows are considered
    selected_rows = newts_train[:article_num]

    for _, record in selected_rows.iterrows():
        text = record["article"]
        first_summary, second_summary = record["summary1"], record["summary2"]
        first_topic, second_topic = record["tid1"], record["tid2"]

        # One entry per (summary, topic) pair, first summary before second
        entries.extend(
            (
                {
                    "document": text,
                    "summary": first_summary,
                    "words": top_k_words[first_topic],
                },
                {
                    "document": text,
                    "summary": second_summary,
                    "words": top_k_words[second_topic],
                },
            )
        )

    return entries

In [10]:
# Build the entries for the first article_num training articles
# (lda and dictionary are passed along, but generate_summary_dicts does not use them)
summary_dicts = generate_summary_dicts(
    newts_train, lda, dictionary, top_k_words, article_num
)

# Validate the length to ensure it matches the expected number of entries:
# two entries are appended per article, so this should print 2 * article_num
# as long as the dataset holds at least article_num rows
print(f"Total entries created: {len(summary_dicts)}")

Total entries created: 100


In [11]:
# Inspect the first entry (keys: "document", "summary", "words") as a quick sanity check
print(summary_dicts[0])



In [19]:
from collections import defaultdict


def count_and_store_non_ascii_characters(dict_list):
    """
    Tally every non-ASCII character found in the string values of a list of dicts.

    Values that are not strings (for example lists of words) are ignored.

    :param dict_list: Iterable of dictionaries whose values are inspected.
    :return: A defaultdict(int) mapping each non-ASCII character to its number
             of occurrences, in order of first appearance.
    """
    tally = defaultdict(int)

    for entry in dict_list:
        for value in entry.values():
            # Only plain text is scanned
            if not isinstance(value, str):
                continue
            for symbol in value:
                # Code points from 128 upwards lie outside the ASCII range
                if ord(symbol) >= 128:
                    tally[symbol] += 1

    return tally

In [23]:
# Count non-ASCII characters in the dictionary and store them
# (only string values of each entry are scanned; non-string values are skipped)
non_ascii_characters = count_and_store_non_ascii_characters(summary_dicts)

# Output the characters and their counts;
# the code point is shown as uppercase hex, zero-padded to at least four digits
for character, count in non_ascii_characters.items():
    print(
        f"Character '{character}' (Unicode: U+{ord(character):04X}) appears {count} times."
    )

# If you want to know the total count of non-ASCII characters
# (sum over all per-character counts; 0 means every scanned string is pure ASCII)
total_non_ascii_characters = sum(non_ascii_characters.values())
print(f"Total count of non-ASCII characters: {total_non_ascii_characters}")

Total count of non-ASCII characters: 0


In [21]:
def remove_non_ascii_characters(list_of_dicts):
    """
    Strip all non-ASCII characters from the string values of every dictionary.

    The dictionaries are modified in place; non-string values are left untouched.

    :param list_of_dicts: List of dictionaries to clean.
    :return: The same list object that was passed in.
    """
    for entry in list_of_dicts:
        for field, content in entry.items():
            if not isinstance(content, str):
                continue
            # Re-assigning an existing key does not resize the dict, so this
            # is safe while iterating over its items
            entry[field] = "".join(filter(str.isascii, content))
    return list_of_dicts

In [22]:
# Strip non-ASCII characters from every string value; the entries are
# modified in place and the same list is returned.
summary_dicts = remove_non_ascii_characters(summary_dicts)

In [25]:
# Store the list of summary dictionaries in a JSON file
import json

# The `with` block closes the file when it exits, so no explicit close() is
# needed. The encoding is pinned so the result does not depend on the
# platform's default text encoding.
with open("summary_dicts.json", "w", encoding="utf-8") as f:
    json.dump(summary_dicts, f)

## Generate summaries for Balint

In [None]:
import json

from transformers import BartTokenizer, BartForConditionalGeneration

# Load the BART tokenizer and model.
# Both are created from the same "facebook/bart-large-cnn" checkpoint name.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")


def remove_non_ascii(text):
    """Return ``text`` without any character whose code point is 128 or higher."""
    ascii_only = []
    for symbol in text:
        if ord(symbol) <= 127:
            ascii_only.append(symbol)
    return "".join(ascii_only)


def generate_and_store_summaries_with_articles(
    newts_train,
    k,
    tokenizer,
    model,
    sampling_top_k=50,
    output_path="summaries_with_articles.json",
):
    """
    Generate a summary for the first ``k`` articles and store everything as JSON.

    For every processed article the output holds the ASCII-only article text,
    its two reference summaries and the generated summary.

    :param newts_train: The NEWTS training dataset; rows are read by position and
                        need the columns "article", "summary1" and "summary2".
    :param k: Number of leading articles to process. Values larger than the
              dataset are capped at the dataset size.
    :param tokenizer: Tokenizer used to encode articles and decode summaries.
    :param model: Model whose ``generate`` method produces the summaries.
    :param sampling_top_k: ``top_k`` value handed to ``model.generate`` for sampling.
                           This used to be tied to ``k`` (the article count), which
                           silently changed the sampling behaviour whenever a
                           different number of articles was requested. The default
                           of 50 keeps runs with ``k=50`` unchanged.
    :param output_path: Path of the JSON file to write.
    :return: None. The collected records are written to ``output_path``.
    """
    # List to store dictionaries containing the article, two original summaries, and the generated summary
    data_to_store = []

    # Never index past the end of the dataset
    num_articles = min(k, len(newts_train))

    for i in range(num_articles):
        # Fetch the row once instead of once per column
        row = newts_train.iloc[i]

        # Remove non-ASCII characters
        cleaned_article_text = remove_non_ascii(row["article"])
        cleaned_summary1 = remove_non_ascii(row["summary1"])
        cleaned_summary2 = remove_non_ascii(row["summary2"])

        # Encode article (inputs longer than 1024 tokens are truncated)
        input_ids = tokenizer(
            cleaned_article_text, return_tensors="pt", truncation=True, max_length=1024
        ).input_ids

        # Generate Summary Text Ids
        summary_text_ids = model.generate(
            input_ids=input_ids,
            bos_token_id=model.config.bos_token_id,
            eos_token_id=model.config.eos_token_id,
            length_penalty=2.0,
            max_length=142,
            min_length=56,
            num_beams=4,
            do_sample=True,
            top_k=sampling_top_k,
        )

        # Decode generated summary and remove non-ASCII characters
        generated_summary = remove_non_ascii(
            tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)
        )

        # Append to the list
        data_to_store.append(
            {
                "article": cleaned_article_text,
                "summary1": cleaned_summary1,
                "summary2": cleaned_summary2,
                "generated_summary": generated_summary,
            }
        )

    # Store in a json file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data_to_store, f, ensure_ascii=False)

In [None]:
# Assuming newts_train is your DataFrame; tokenizer and model are the BART objects
# loaded above. The previous names bart_base_tokenizer / bart_base_model are not
# defined anywhere in this notebook and would raise a NameError.
generate_and_store_summaries_with_articles(
    newts_train, k=50, tokenizer=tokenizer, model=model
)