# Create topic strings
For each topic, create multiple strings that encode the topic well.

In [9]:
import sys
sys.path.append('../..')

In [10]:
from src.utils.evaluation_utils import get_topic_words
from src.utils.read_and_load_utils import load_lda
from config import TOPICS_CONFIG

ModuleNotFoundError: No module named 'evaluate'

## Load the LDA model

In [None]:
# Load the LDA model through the project helper; it is used below to look up topic words
lda = load_lda()
# Warning "WARNING:root:random_state not set so using default value" is inconsequential for inference

## Load the NEWTS dataset

In [None]:
# Import via the `src` package, matching the other project imports in this notebook
# (only the repository root is appended to sys.path, so the bare `utils` package is not importable).
from src.utils.read_and_load_utils import read_dataset

# Load the dataset registered under the name "newts_train"
newts_train = read_dataset("newts_train")

In [None]:
# Preview the first three rows (assumes newts_train is a pandas DataFrame)
newts_train.head(3)

In [None]:
# Inspect the 'phrases1' value at index 0 to see what a topic-phrases entry looks like
newts_train['phrases1'][0]

In [None]:
# Inspect the 'sentences1' value at index 0 to see what a topic-description entry looks like
newts_train['sentences1'][0]

In [None]:
# Gather every topic id occurring in either tid column, converted to plain Python ints,
# and report how many distinct topics the dataset covers.
tids_first = {int(value) for value in newts_train['tid1'].unique()}
tids_second = {int(value) for value in newts_train['tid2'].unique()}
relevant_tids = tids_first | tids_second
print("Number of different topics in the dataset: ", len(relevant_tids))
print("Topic IDs: ", relevant_tids)

## Selecting topic strings
For each of the 50 topics that appear in the NEWTS train dataset, create topic strings.
As a data structure, we use a dictionary with the relevant_tids as keys.

In [10]:
# One empty inner dictionary per topic id; later cells fill in the individual topic strings
topic_strings = {tid: {} for tid in relevant_tids}

### Topic string: topic words

In [11]:
# Number of words to request per topic, read from the project configuration
num_topic_words = TOPICS_CONFIG['num_topic_words']

In [60]:
# Store, for every topic, its topic words joined into one space-separated string
for tid in relevant_tids:
    topic_strings[tid]['topic_words'] = " ".join(
        get_topic_words(lda=lda, tid=tid, num_topic_words=num_topic_words)
    )

In [None]:
# Spot-check one entry; 175 is a hard-coded topic id, so this raises KeyError if that id is absent
print("Example of topic words for topic 175: ", topic_strings[175]['topic_words'])

### Topic string: topic phrases

In [62]:
# Walk the dataset row by row and, for every topic id not handled yet, store the phrases
# of its first occurrence as topic_strings[tid]['topic_phrases']. Stop once all ids are covered.
pending_tids = set(relevant_tids)
for _, record in newts_train.iterrows():
    # tid1 pairs with phrases1 and tid2 with phrases2; tid1 is handled first
    for tid_column, phrases_column in (('tid1', 'phrases1'), ('tid2', 'phrases2')):
        topic_id = record[tid_column]
        if topic_id in pending_tids:
            topic_strings[topic_id]['topic_phrases'] = record[phrases_column]
            pending_tids.discard(topic_id)
    if not pending_tids:
        break

In [None]:
# Spot-check one entry; 175 is a hard-coded topic id, so this raises KeyError if that id is absent
print("Example of topic phrases for topic 175: ", topic_strings[175]['topic_phrases'])

### Topic string: topic description

In [64]:
# Walk the dataset row by row and, for every topic id not handled yet, store the sentences
# of its first occurrence as topic_strings[tid]['topic_description']. Stop once all ids are covered.
still_needed = set(relevant_tids)
for _, entry in newts_train.iterrows():
    # tid1 pairs with sentences1 and tid2 with sentences2; tid1 is handled first
    candidates = ((entry['tid1'], 'sentences1'), (entry['tid2'], 'sentences2'))
    for topic_id, sentences_column in candidates:
        if topic_id not in still_needed:
            continue
        topic_strings[topic_id]['topic_description'] = entry[sentences_column]
        still_needed.remove(topic_id)
    if not still_needed:
        break

In [None]:
# Spot-check one entry; 175 is a hard-coded topic id, so this raises KeyError if that id is absent
print("Example of topic description for topic 175: ", topic_strings[175]['topic_description'])

In [66]:
# Save the topic strings as JSON in the topic_vectors_data folder under the data folder
# (the data folder is two levels up from the current folder).
import os
import json

topic_strings_path = '../../data/topic_vectors_data/topic_strings.json'
# Create the target folder if it is missing so that open() does not fail with FileNotFoundError
os.makedirs(os.path.dirname(topic_strings_path), exist_ok=True)

# NOTE: JSON object keys are always strings, so the integer topic ids are written as strings
# and have to be converted back to int when the file is loaded again.
with open(topic_strings_path, 'w', encoding='utf-8') as f:
    json.dump(topic_strings, f, indent=4)

## Use topic-focused summaries to generate topic vectors
Instead of using direct descriptions of the topic, use the summaries that focus on the topic.


In [None]:
# Count, per relevant topic id, how many summaries focus on it.
# Every row contributes one count for its tid1 topic and one for its tid2 topic.
topic_counts = dict.fromkeys(relevant_tids, 0)

for _, record in newts_train.iterrows():
    for tid_column in ('tid1', 'tid2'):
        topic_counts[record[tid_column]] += 1

per_topic_totals = topic_counts.values()
print("Number of summaries that focus on each topic: ", topic_counts)
print("Min and max number of summaries that focus on a topic: ", min(per_topic_totals), max(per_topic_totals))
