## Prepare Nodes File: Topic Probability

In [21]:
import csv
import glob
import re
import os
import pandas as pd 

In [13]:
df = pd.read_csv('/First_Step/Prepare_Corpus/combine_president_speeches.csv')

In [14]:
df.shape

(1037, 5)

### Creating a file formatting for topic analysis (index\t label\t text)

In [117]:
# Directory containing the transcript files
transcript_directory = 'corpus'

# Get a list of all transcript file paths
transcript_files = [file for file in os.listdir(transcript_directory) if file.endswith('.txt')]

# Sort the transcript files in ascending order
transcript_files.sort()

# Open the output file to write the concatenated transcripts
output_file_path = 'concatenated_transcripts.txt'
with open(output_file_path, 'w') as output_file:
    for index, transcript_file in enumerate(transcript_files, start=1):
        transcript_file_path = os.path.join(transcript_directory, transcript_file)
        
        # Read the contents of the current transcript file
        with open(transcript_file_path, 'r') as file:
            transcript = file.read().strip()
        
        # Format and write the transcript to the output file
        output_file.write(f'{index}\t{transcript_file[:-4]}\t{transcript}\n')

### Topic Modelling

In [56]:
%pip install tomotopy little_mallet_wrapper

Note: you may need to restart the kernel to use updated packages.


In [58]:
import tomotopy as tp
import little_mallet_wrapper
import seaborn
from IPython.display import Markdown, display

def make_md(string):
    display(Markdown(str(string)))

def get_top_docs(docs, labels, ids, topic_distributions, topic_index, n=5):    
    sorted_data = sorted([(distribution[topic_index], document, label, docid) 
                          for distribution, document, label, docid
                          in zip(topic_distributions, docs, labels, ids)], reverse=True)
    topic_words = topics[topic_index]
    make_md(f"### ✨Topic {topic_index}✨\n\n{topic_words}\n\n---")
    for probability, doc, label, docid in sorted_data[:n]:
        # Make topic words bolded
        for word in topic_words.split():
            if word in doc.lower():
                doc = re.sub(f"\\b{word}\\b", f"**{word}**", doc, re.IGNORECASE)
        if len(doc) > 1000:
            doc = doc[:1000] + ' [...]'
        make_md(f'✨  \n**Topic Probability**: {probability:.3f}  **Label**: {label}  **ID**: {docid}\n\n**Document**: {doc}\n\n')

In [121]:
datasetfile = 'concatenated_transcripts.txt'

# To use a different stop words list, enable the following lines
# and the line with "stop_words" below:
stopwordsfile = 'english.txt'
with open(stopwordsfile) as inp:
    stop_words = inp.read().splitlines()

with open(datasetfile) as inp:
    lines = inp.read().splitlines()

    training_data = []
original_texts = []
labels = []
ids = []
for line in lines:
    try:
        docid, label, text = line.strip().split('\t', 2)
    except ValueError:
        continue
    processed_text = little_mallet_wrapper.process_string(
            text, numbers='remove',
             stop_words=stop_words,
            stop_words_extra=[])
    if not processed_text.strip():  # skip empty documents
        continue
    training_data.append(processed_text)
    original_texts.append(text)
    labels.append(label)
    ids.append(docid)

little_mallet_wrapper.print_dataset_stats(training_data)
len(training_data), len(original_texts), len(labels)

Number of Documents: 45
Mean Number of Words per Document: 31002.2
Vocabulary Size: 36582


(45, 45, 45)

In [128]:
# Number of topics to return
num_topics = 7
# Number of training iterations
iterations = 500

# Intialize the model
model = tp.LDAModel(k=num_topics)

# Add each document to the model, after splitting it up into words
for text in training_data:
    model.add_doc(text.strip().split())
    
print("Topic Model Training...\n\n")
for i in range(0, iterations + 1, 50):
    model.train(iterations)
    print(f'Iteration: {i}\tLog-likelihood: {model.ll_per_word}')
model.save('topicmodel.bin')
print('done.')

Topic Model Training...


Iteration: 0	Log-likelihood: -8.789014255449597
Iteration: 50	Log-likelihood: -8.813848984721211
Iteration: 100	Log-likelihood: -8.827444014806794
Iteration: 150	Log-likelihood: -8.840465850804966
Iteration: 200	Log-likelihood: -8.849922585769074
Iteration: 250	Log-likelihood: -8.854753045187387
Iteration: 300	Log-likelihood: -8.854315346654339
Iteration: 350	Log-likelihood: -8.853892841606706
Iteration: 400	Log-likelihood: -8.85204391833923
Iteration: 450	Log-likelihood: -8.852327837005355
Iteration: 500	Log-likelihood: -8.851268125977999
done.


In [129]:
model = tp.LDAModel.load('topicmodel.bin')
topic_distributions = [list(doc.get_topic_dist()) for doc in model.docs]

In [143]:
# Numer of topic words to print out
num_topic_words = 45

print("Topic Model Results:\n")
# Print out top 10 words for each topic
topics = []
topic_individual_words = []
for topic_number in range(0, num_topics):
    topic_words = ' '.join(word for word, prob in model.get_topic_words(topic_id=topic_number, top_n=num_topic_words))
    topics.append(topic_words)
    topic_individual_words.append(topic_words.split())
    print(f"✨Topic {topic_number}✨  {topic_words}")

Topic Model Results:

✨Topic 0✨  president peace people united war nations nation american time freedom country hope congress day program vietnam forces south military defense policy history government america administration days strength americans progress life economic countries national seek continue security house action europe months land north secretary meet asia
✨Topic 1✨  united government public congress country citizens power war powers time subject treaty laws duties peace mexico treasury people duty nations force foreign character revenue commerce executive period session consideration nation rights governments debt measures policy territory proper bank attention received british national protection condition relations
✨Topic 2✨  constitution people president government congress union question power law authority time principle federal war territory slavery country constitutional military south political north civil rights liberty laws day united passed senate control calle

### Creating the nodes file with topic probability 

In [131]:
# Put the topic weights into a DataFrame
# Columns are topics, rows are documents, each value is a topic weight.
columns = ['t%d: %s' % (topic_number,
        ' '.join(word for word, _prob in model.get_topic_words(topic_id=topic_number, top_n=4)))
    for topic_number in range(model.k)]
df = pd.DataFrame([doc.get_topic_dist() for doc in model.docs],
             index=labels,
             columns=columns)
df.head()

Unnamed: 0,t0: president peace people united,t1: united government public congress,t2: constitution people president government,t3: people america american americans,t4: people president america country,t5: united government congress law,t6: government business american national
Abraham Lincoln,0.05202,0.262301,0.578762,0.009129,0.006417,0.063305,0.028067
Andrew Jackson,0.007062,0.695537,0.198855,0.002098,0.000319,0.072337,0.023794
Andrew Johnson,0.013178,0.388892,0.430575,1e-05,3.6e-05,0.146191,0.021118
Barack Obama,0.128714,0.011916,0.029428,0.461172,0.325168,0.003503,0.040099
Benjamin Harrison,0.024009,0.234892,0.074279,0.002578,0.000841,0.552868,0.110532


In [132]:
df = df.rename_axis('ID')

In [134]:
df.to_csv('topic_probability.csv', index=True)