Basic topic modelling for just the contexts and their topics


- Currently this runs without removing non-words (stop words are removed, though).

In BERTopic's `topics_over_time`, both `global_tuning` and `evolution_tuning` are set to True by default, but this can easily be changed. For example, pass `global_tuning=False` if you do not want the representations to be influenced by the global representation and merely want to see how they evolved over time.


to do if i care
- remove non-words somehow (probably not worth the computational effort; there is also no good enough word list, unless something like a TensorFlow model is used to detect which tokens are "likely" real words)
- maybe subset the topics for fact/fiction to be to 90s and after 90s (so it can plot them better)

FUNCTIONS ETC

In [1]:
import pandas as pd
from pathlib import Path
from bertopic import BERTopic
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from concurrent.futures import ThreadPoolExecutor, as_completed

# English stop words from NLTK's stopwords corpus, held in a set so that the
# per-token membership test in remove_stop_words is a hash lookup.
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """Return *text* with English stop words dropped.

    The text is split with NLTK's ``word_tokenize``; a token is discarded
    when its lower-cased form is in the module-level ``stop_words`` set.
    The surviving tokens keep their original casing and are re-joined with
    single spaces, so the original whitespace is not preserved.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def load_metadata(metadata_path):
    """Load the metadata CSV at *metadata_path* into a pandas DataFrame."""
    metadata_df = pd.read_csv(metadata_path)
    return metadata_df

def process_file(txt_file, metadata_df):
    """Read one text file and look up its publication year in the metadata.

    The metadata row is matched on the ``filename`` column, using the text
    file's name with its suffix swapped for ``.pdf``.

    Args:
        txt_file: Path-like object for the text file; it must expose
            ``.name`` and be accepted by ``open``.
        metadata_df: DataFrame with ``filename`` and ``date`` columns. The
            ``date`` value is parsed with the format ``%Y/%m/%d``.

    Returns:
        ``(text, period)`` where ``period`` is a yearly ``pd.Period``, or
        ``(None, None)`` when the file has no metadata row, its date cannot
        be parsed, or its contents are not valid UTF-8.
    """
    filename = txt_file.name
    # Swap only the final suffix; str.replace would rewrite every '.txt'
    # occurring anywhere in the name.
    pdf_name = Path(filename).with_suffix('.pdf').name
    file_date = metadata_df.loc[metadata_df['filename'] == pdf_name, 'date'].values
    if file_date.size == 0:
        print(f"Date not found for file: {filename}")
        return None, None

    # Keep the try body limited to date parsing so that unrelated errors
    # (e.g. decoding problems, which are also ValueError subclasses) are not
    # mistaken for a bad date.
    try:
        year = pd.to_datetime(file_date[0], format='%Y/%m/%d').strftime('%Y')
        period = pd.Period(year, freq='Y')
    except (pd.errors.OutOfBoundsDatetime, ValueError):
        # Invalid or out-of-range dates are skipped silently; for now this is
        # fine because our data has NO fact/fiction before 1667.
        # NOTE(review): pandas' Timestamp lower bound falls in 1677, so the
        # year above may be a typo -- confirm.
        return None, None

    try:
        with open(txt_file, 'r', encoding='utf-8') as f:
            text = f.read()
    except UnicodeDecodeError as err:
        # Previously swallowed without a trace by the date handler above;
        # still skip the file, but say so.
        print(f"Could not decode file as UTF-8: {filename} ({err})")
        return None, None

    return text, period

def load_texts_and_dates(folder_path, metadata_df):
    """Load every ``*.txt`` file in *folder_path* together with its period.

    Files are handed to ``process_file`` on a thread pool. Files for which
    ``process_file`` returns ``(None, None)`` are left out.

    Args:
        folder_path: ``pathlib.Path`` of the folder to scan (non-recursive).
        metadata_df: Metadata DataFrame forwarded to ``process_file``.

    Returns:
        ``(texts, periods)``: two lists of equal length, where ``periods[i]``
        belongs to ``texts[i]``. Entries follow the sorted order of the file
        paths, so repeated runs over the same folder give the same order.
    """
    # glob order is OS-dependent; sort so the result is reproducible.
    txt_files = sorted(folder_path.glob('*.txt'))

    texts = []
    periods = []

    with ThreadPoolExecutor() as executor:
        # Executor.map yields results in input order (unlike as_completed,
        # which yields in completion order).
        results = executor.map(
            lambda txt_file: process_file(txt_file, metadata_df),
            txt_files,
        )
        for text, period in results:
            if text is not None and period is not None:
                texts.append(text)
                periods.append(period)

    return texts, periods



In [2]:
def process_texts(folders):
    """Load, date and clean the texts of every configured folder.

    Args:
        folders: Mapping whose values are ``(folder_path, metadata_path)``
            pairs. The keys are not used.

    Returns:
        ``(processed_texts, period_strings)``: the texts with stop words
        removed, and the matching periods formatted with ``'%Y-%m-%d'``.
    """
    collected_texts, collected_periods = [], []

    for folder_path, metadata_path in folders.values():
        folder_texts, folder_periods = load_texts_and_dates(
            folder_path, load_metadata(metadata_path)
        )
        collected_texts += folder_texts
        collected_periods += folder_periods
        print(f"Processed {len(folder_texts)} files from {folder_path}")

    period_strings = [p.strftime('%Y-%m-%d') for p in collected_periods]
    processed_texts = [remove_stop_words(raw) for raw in collected_texts]

    return processed_texts, period_strings

# One mapping per Royal Society series: label -> (folder of .txt files,
# path of the metadata CSV). process_texts only reads the values; the keys
# are labels for the reader, so each one names its own series.
b_folders = {
    'rstb': (Path('D:/Fact_fiction_corpus/texts/royal society/txt_rstb'), 'D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstb.csv'),
}

l_folders = {
    'rstl': (Path('D:/Fact_fiction_corpus/texts/royal society/txt_rstl'), 'D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstl.csv'),
}

a_folders = {
    'rsta': (Path('D:/Fact_fiction_corpus/texts/royal society/txt_rsta'), 'D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rsta.csv'),
}

# Stop-word-free texts and their period strings, per series.
b_texts, b_periods = process_texts(b_folders)
l_texts, l_periods = process_texts(l_folders)
a_texts, a_periods = process_texts(a_folders)

FACT/FICTION CONTEXTS

In [3]:
import pandas as pd
from pathlib import Path
from bertopic import BERTopic

# Contexts only (kind of long though): one workbook of keyword hits per series.
rs_workbooks = [
    "../data_for_viewing/extracted_raw/ff_royal_society_words_found_full_RSTB.xlsx",
    "../data_for_viewing/extracted_raw/ff_royal_society_words_found_full_RSTL.xlsx",
    "../data_for_viewing/extracted_raw/ff_royal_society_words_found_full_RSTA.xlsx",
]
rs_data = pd.concat([pd.read_excel(workbook) for workbook in rs_workbooks], ignore_index=True)

# Unparseable dates become NaT and those rows are dropped.
rs_data['Date'] = pd.to_datetime(rs_data['Date'], errors='coerce')
rs_data = rs_data.dropna(subset=['Date'])

# Fact: rows whose Keyword contains 'act' (case-insensitive; missing -> no match).
is_fact = rs_data['Keyword'].str.contains('act', case=False, na=False)
fact_data = rs_data[is_fact]
date_fact = fact_data['Date'].tolist()
contexts_fact = fact_data['Context'].to_list()

# Fiction: rows whose Keyword contains 'ict' (case-insensitive; missing -> no match).
is_fiction = rs_data['Keyword'].str.contains('ict', case=False, na=False)
fiction_data = rs_data[is_fiction]
date_fict = fiction_data['Date'].tolist()
contexts_fict = fiction_data['Context'].to_list()

# Stop-word-free versions of the contexts, used for topic modelling.
fict_data_clean = [remove_stop_words(context) for context in contexts_fict]
fact_data_clean = [remove_stop_words(context) for context in contexts_fact]

In [7]:
# Fact / fiction contexts with stop words removed: fit one BERTopic model per
# keyword group, plot its topics over time, save the figure as HTML and show it.

# --- Fact contexts ---
fact_title = "Topics over time in Fact words found in Royal Society Transactions (RSTL, RSTB, RSTA)"
topic_model = BERTopic(verbose=False)
topics, probs = topic_model.fit_transform(fact_data_clean)
topics_over_time1 = topic_model.topics_over_time(
    fact_data_clean,
    date_fact,
    datetime_format="%Y-%m-%d",
    nr_bins=50,
)
fig1 = topic_model.visualize_topics_over_time(
    topics_over_time1,
    top_n_topics=30,
    title=fact_title,
)
fig1.write_html('fact_topics_over_time_50_30.html')

fig1.show()

# --- Fiction contexts (topic_model, topics and probs are rebound here) ---
fiction_title = "Topics over time in Fiction words found in Royal Society Transactions (RSTL, RSTB, RSTA)"
topic_model = BERTopic(verbose=False)
topics, probs = topic_model.fit_transform(fict_data_clean)
topics_over_time2 = topic_model.topics_over_time(
    fict_data_clean,
    date_fict,
    datetime_format="%Y-%m-%d",
    nr_bins=50,
)
fig2 = topic_model.visualize_topics_over_time(
    topics_over_time2,
    top_n_topics=30,
    title=fiction_title,
)
fig2.write_html('fiction_topics_over_time_50_30.html')

fig2.show()

THE TOPICS OF ALL THE FULL ROYAL SOCIETY ARTICLES?

In [33]:
# All Royal Society series combined: RSTB + RSTL + RSTA, in that order.
all_texts_combined = [*b_texts, *l_texts, *a_texts]
all_periods_combined = [*b_periods, *l_periods, *a_periods]
period_strings_combined = list(all_periods_combined)

# Fit a fresh model on the combined texts.
topic_model = BERTopic(verbose=False)
topics, probs = topic_model.fit_transform(all_texts_combined)

# Topic representations over time, then plot, save and show.
topics_over_time = topic_model.topics_over_time(
    all_texts_combined,
    period_strings_combined,
    datetime_format="%Y-%m-%d",
    nr_bins=50,
)
fig4 = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=30,
    title="Topics over time in Fact and Fiction texts (RSTL, RSTB, RSTA)",
)

fig4.write_html('full_rs_topics_over_time_periods_50_30.html')
fig4.show()

In [32]:
#periods rstl rstb
import pandas as pd
from pathlib import Path
from bertopic import BERTopic

# RSTB + RSTL only (RSTA is modelled separately).
texts_combined = b_texts + l_texts
periods_combined = b_periods + l_periods
# Build the timestamp list from this cell's own periods. It was previously
# copied from all_periods_combined, which belongs to the all-series cell
# (and includes RSTA), and was then never used.
period_strings_combined = list(periods_combined)

# Fit a fresh model on the RSTB + RSTL texts.
topic_model = BERTopic(verbose=False)
topics, probs = topic_model.fit_transform(texts_combined)

# Topic representations over time, then plot, save and show.
topics_over_time = topic_model.topics_over_time(texts_combined, period_strings_combined, datetime_format="%Y-%m-%d", nr_bins=50)
fig5 = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=30, title="Topics over time in Fact and Fiction texts (RSTL, RSTB)")
fig5.write_html('rstbl_topics_over_time_periods_50_30.html')

fig5.show()

In [31]:
#periods rsta
import pandas as pd
from pathlib import Path
from bertopic import BERTopic

# Fit a fresh model on the RSTA texts only.
topic_model = BERTopic(verbose=False)
topics, probs = topic_model.fit_transform(a_texts)

# Topic representations over time for RSTA, then plot, save and show.
topics_over_time = topic_model.topics_over_time(
    a_texts,
    a_periods,
    datetime_format="%Y-%m-%d",
    nr_bins=50,
)
fig6 = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=30,
    title="Topics over time in Fact and Fiction texts (RSTA)",
)
fig6.write_html('rsta_topics_over_time_periods_50_30.html')

fig6.show()