# Create topic strings
For each topic, create multiple strings that encode the topic well.

In [2]:
import sys
# Add the directory two levels above the current working directory to the module search path.
# Presumably this is the project root that provides the `utils` and `config` modules imported below — confirm.
sys.path.append('../..')

In [3]:
from utils.evaluation_utils import get_topic_words
from utils.read_and_load_utils import load_lda
from config import TOPICS_CONFIG

## Load the LDA model

In [4]:
# Load the LDA model through the project helper; it is passed to get_topic_words further down.
lda = load_lda()
# Warning "WARNING:root:random_state not set so using default value" is inconsequential for inference

2024-05-16 11:36:06,594 - INFO - Data directory found at /Users/joschka/Documents/0_Studium/0_ML_Master/0_current/nlp_research_paper/topical-decoding/topical_decoding/data.
2024-05-16 11:36:06,594 - INFO - loading LdaModel object from /Users/joschka/Documents/0_Studium/0_ML_Master/0_current/nlp_research_paper/topical-decoding/topical_decoding/data/LDA_250/lda.model
2024-05-16 11:36:06,705 - INFO - loading id2word recursively from /Users/joschka/Documents/0_Studium/0_ML_Master/0_current/nlp_research_paper/topical-decoding/topical_decoding/data/LDA_250/lda.model.id2word.* with mmap=r
2024-05-16 11:36:06,705 - INFO - loading expElogbeta from /Users/joschka/Documents/0_Studium/0_ML_Master/0_current/nlp_research_paper/topical-decoding/topical_decoding/data/LDA_250/lda.model.expElogbeta.npy with mmap=r
2024-05-16 11:36:06,712 - INFO - setting ignored attribute state to None
2024-05-16 11:36:06,712 - INFO - setting ignored attribute dispatcher to None
2024-05-16 11:36:06,712 - INFO - LdaModel

## Load the NEWTS dataset

In [5]:
from utils.read_and_load_utils import read_dataset

# Load the NEWTS dataset registered under the name "newts_train".
# The cells below use it as a pandas-style table (.head, .iterrows, column access).
newts_train = read_dataset("newts_train")

2024-05-16 11:36:06,719 - INFO - Data directory found at /Users/joschka/Documents/0_Studium/0_ML_Master/0_current/nlp_research_paper/topical-decoding/topical_decoding/data.
2024-05-16 11:36:06,789 - INFO - Successfully loaded NEWTS_train_2400 dataset.


In [6]:
# Preview the first three rows to see which columns are available (tid1/tid2, phrases, sentences, summaries, ...).
newts_train.head(3)

Unnamed: 0,article_idx,AssignmentId,docId,article,tid1,tid2,words1,words2,phrases1,phrases2,sentences1,sentences2,summary1,summary2
0,0,3EG49X351WE8VLP4S0TIYZF3V476X2,094372190d52acbce61a73ec16b2217d1a60276f,The president of the World Bank on Saturday wa...,175,110,"house, committee, congress, senate, republican...","billion, figures, economy, global, growth, eco...","senate and congress, congressional pressure, y...","economic growth, global growth, billion dollar...","This topic is about the senate and congress, c...",This topic is about economic growth involving ...,The leader of the World Bank urged the US to t...,The US economy will be a driving factor in the...
1,1,3DOCMVPBTPGBQCHSPBSQ28AROFXNNI,bc733fb96fd73496e10fcff3c640ee11c4df3d7a,By . Nick Harris . Manchester City are the bes...,152,217,"united, manchester, liverpool, chelsea, league...","club, team, season, players, england, football...","Manchester United's manager, Premier League, t...","football league, the team's fans, football pla...",This topic is about Manchester United's manage...,This topic is about a football league having a...,Premier league is the most paying football lea...,Manchester city players earn the largest amoun...
2,2,3QHK8ZVMIOKJ13PAA872YL681IABLF,4ca225c38cc4a743e559efd586b99f162604ff16,Iran's military on Sunday claimed it shot down...,64,134,"group, forces, fighters, killed, fighting, mil...","air, plane, aircraft, flight, flying, pilot, f...","group composed of militants, fighters joining ...","flying the plane, the pilot of this aircraft, ...",This topic is about a group composed of milita...,"This topic is about flying the plane, the pilo...",Iran's military claimed it shot down a U.S. dr...,Iran's military claims shooting down an Americ...


In [7]:
# Show the full `phrases1` string for the row labelled 0 (the table preview truncates it).
newts_train['phrases1'][0]

"senate and congress, congressional pressure, you can call your representative's office, told a Senate committee, lawmakers setting the record straight, staffer to the Democratic senator, federal employee benefits"

In [8]:
# Show the full `sentences1` string for the row labelled 0 (the table preview truncates it).
newts_train['sentences1'][0]

"This topic is about the senate and congress, congressional pressure, calling one's representative's office, informing a Senate committee, lawmakers setting the record straight, the staffer to the Democratic senator, and federal employee benefits."

In [9]:
# number of different topics (tid1 or tid2) in the dataset
relevant_tids = set(map(int, newts_train['tid1'].unique())).union(set(map(int, newts_train['tid2'].unique())))
print("Number of different topics in the dataset: ", len(relevant_tids))
print("Topic IDs: ", relevant_tids)

Number of different topics in the dataset:  50
Topic IDs:  {12, 13, 32, 39, 46, 48, 55, 61, 62, 64, 72, 78, 83, 85, 89, 90, 97, 100, 101, 105, 110, 113, 115, 128, 129, 134, 144, 152, 153, 162, 163, 175, 180, 187, 194, 195, 196, 198, 199, 200, 205, 211, 217, 218, 227, 229, 236, 245, 247, 248}


## Selecting topic strings
For each of the 50 topics that appear in the NEWTS train dataset, create topic strings.
As the data structure, we use a dictionary keyed by the topic IDs in `relevant_tids`.

In [10]:
# One (initially empty) inner dictionary per relevant topic ID; later cells fill in
# the individual string variants for each topic.
topic_strings = {tid: {} for tid in relevant_tids}

### Topic string: topic words

In [11]:
# Number of words requested per topic from get_topic_words, read from the project configuration.
num_topic_words = TOPICS_CONFIG['num_topic_words']

In [60]:
# Build the 'topic_words' string of every relevant topic: the words returned by
# get_topic_words, separated by single spaces.
for tid in relevant_tids:
    topic_strings[tid]['topic_words'] = " ".join(
        get_topic_words(lda=lda, tid=tid, num_topic_words=num_topic_words)
    )

In [61]:
# Spot-check the generated topic-words string for one topic (ID 175).
print("Example of topic words for topic 175: ", topic_strings[175]['topic_words'])

Example of topic words for topic 175:  house committee congress senate republican republicans senator rep federal democrats sen reid chamber democratic capitol government congressional lawmakers gop democrat john vets chairman members reform


### Topic string: topic phrases

In [62]:
# Walk through the dataset in row order and, for every topic ID not seen so far, store the
# phrases of its first occurrence as topic_strings[tid]['topic_phrases'].
# Within a row the tid1/phrases1 pair is handled before the tid2/phrases2 pair.
# The scan stops as soon as every relevant topic ID has been assigned.
missing_tids = set(relevant_tids)
row_values = zip(
    newts_train['tid1'], newts_train['phrases1'],
    newts_train['tid2'], newts_train['phrases2'],
)
for first_tid, first_phrases, second_tid, second_phrases in row_values:
    for tid, phrases in ((first_tid, first_phrases), (second_tid, second_phrases)):
        if tid in missing_tids:
            topic_strings[tid]['topic_phrases'] = phrases
            missing_tids.remove(tid)
    if not missing_tids:
        break

In [63]:
# Spot-check the stored topic phrases for one topic (ID 175).
print("Example of topic phrases for topic 175: ", topic_strings[175]['topic_phrases'])

Example of topic phrases for topic 175:  senate and congress, congressional pressure, you can call your representative's office, told a Senate committee, lawmakers setting the record straight, staffer to the Democratic senator, federal employee benefits


### Topic string: topic description

In [64]:
# Walk through the dataset in row order and, for every topic ID not seen so far, store the
# sentence of its first occurrence as topic_strings[tid]['topic_description'].
# Within a row the tid1/sentences1 pair is handled before the tid2/sentences2 pair.
# The scan stops as soon as every relevant topic ID has been assigned.
missing_tids = set(relevant_tids)
row_values = zip(
    newts_train['tid1'], newts_train['sentences1'],
    newts_train['tid2'], newts_train['sentences2'],
)
for first_tid, first_sentence, second_tid, second_sentence in row_values:
    for tid, sentence in ((first_tid, first_sentence), (second_tid, second_sentence)):
        if tid in missing_tids:
            topic_strings[tid]['topic_description'] = sentence
            missing_tids.remove(tid)
    if not missing_tids:
        break

In [65]:
# Spot-check the stored topic description for one topic (ID 175).
print("Example of topic description for topic 175: ", topic_strings[175]['topic_description'])

Example of topic description for topic 175:  This topic is about the senate and congress, congressional pressure, calling one's representative's office, informing a Senate committee, lawmakers setting the record straight, the staffer to the Democratic senator, and federal employee benefits.


In [66]:
# Save the topic strings to topic_strings.json in the topic_vectors_data folder under the data folder.
# The data folder is two levels up from the current folder.
import os
import json

topic_strings_path = '../../data/topic_vectors_data/topic_strings.json'

# Create the output folder if it is missing: open(..., 'w') raises FileNotFoundError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(topic_strings_path), exist_ok=True)

# Note: JSON object keys are always strings, so the integer topic IDs used as keys
# are written as strings (e.g. "175") and come back as strings when the file is reloaded.
with open(topic_strings_path, 'w', encoding='utf-8') as f:
    json.dump(topic_strings, f, indent=4)

## Use topic-focused summaries to generate topic vectors
Instead of using direct descriptions of the topic, use the summaries that focus on the topic.


In [13]:
# Tally, per relevant topic ID, how many summaries focus on it.
# Every row contributes one count to its tid1 topic and one to its tid2 topic.
topic_counts = dict.fromkeys(relevant_tids, 0)

for first_tid, second_tid in zip(newts_train['tid1'], newts_train['tid2']):
    topic_counts[first_tid] += 1
    topic_counts[second_tid] += 1

counts_per_topic = topic_counts.values()
print("Number of summaries that focus on each topic: ", topic_counts)
print("Min and max number of summaries that focus on a topic: ", min(counts_per_topic), max(counts_per_topic))


Number of summaries that focus on each topic:  {12: 49, 13: 70, 32: 75, 39: 158, 46: 87, 48: 106, 55: 74, 61: 116, 62: 69, 64: 94, 72: 56, 78: 82, 83: 76, 85: 56, 89: 73, 90: 26, 97: 44, 100: 62, 101: 41, 105: 167, 110: 109, 113: 135, 115: 191, 128: 135, 129: 60, 134: 129, 144: 133, 152: 110, 153: 65, 162: 217, 163: 89, 175: 92, 180: 83, 187: 85, 194: 107, 195: 182, 196: 94, 198: 69, 199: 55, 200: 107, 205: 35, 211: 76, 217: 201, 218: 97, 227: 48, 229: 55, 236: 95, 245: 75, 247: 187, 248: 103}
Min and max number of summaries that focus on a topic:  26 217
