Basic topic modelling of just the keyword contexts, and how their topics change over time

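In BERTopic's `topics_over_time`, both `global_tuning` and `evolution_tuning` are set to True by default, and the cells below keep those defaults. They can easily be changed: perhaps you do not want the per-period representations to be influenced by the global representation and merely want to see how they evolved over time, in which case pass `global_tuning=False`.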

In [29]:
import pandas as pd
from pathlib import Path
from bertopic import BERTopic

# Merge the RSTB and RSTL keyword-hit spreadsheets into one frame, parse the
# Date column (values that cannot be parsed become NaT because of
# errors='coerce') and keep only the rows that ended up with a usable date.
rs_data = (
    pd.concat(
        [
            pd.read_excel("../data_for_viewing/extracted_raw/ff_royal_society_words_found_full_RSTB.xlsx"),
            pd.read_excel("../data_for_viewing/extracted_raw/ff_royal_society_words_found_full_RSTL.xlsx"),
        ],
        ignore_index=True,
    )
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    .dropna(subset=['Date'])
)

# Fact: rows whose Keyword contains "act" (case-insensitive, missing keywords excluded).
# NOTE(review): this is a substring match, so it would also hit e.g. "practice" --
# confirm the Keyword column only holds fact-family words.
fact_data = rs_data.loc[rs_data['Keyword'].str.contains('act', case=False, na=False)]
date_fact = fact_data['Date'].to_list()
contexts_fact = fact_data['Context'].tolist()

# Fiction: rows whose Keyword contains "ict" (case-insensitive, missing keywords excluded).
# NOTE(review): same substring caveat as above (e.g. "predict") -- confirm.
fiction_data = rs_data.loc[rs_data['Keyword'].str.contains('ict', case=False, na=False)]
date_fict = fiction_data['Date'].to_list()
contexts_fict = fiction_data['Context'].tolist()

In [30]:
# Fact: fit a fresh topic model on the fact contexts, then track the topics
# across 30 time bins and save the interactive figure next to the notebook.
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(contexts_fact)
topics_over_time = topic_model.topics_over_time(
    docs=contexts_fact,
    timestamps=date_fact,
    datetime_format="%Y-%m-%d",
    nr_bins=30,
)
fig1 = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=20,
    title="Topics over time in Fact words found in Royal Society Transactions",
)
fig1.write_html('fact_topics_over_time_30.html')

fig1.show()

# Optional: inspect the fitted topics with topic_model.get_topic_info()

2024-09-13 12:10:37,142 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/807 [00:00<?, ?it/s]

2024-09-13 12:11:25,873 - BERTopic - Embedding - Completed ✓
2024-09-13 12:11:25,874 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-13 12:11:35,138 - BERTopic - Dimensionality - Completed ✓
2024-09-13 12:11:35,141 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-13 12:11:39,577 - BERTopic - Cluster - Completed ✓
2024-09-13 12:11:39,585 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-13 12:11:44,217 - BERTopic - Representation - Completed ✓
29it [00:21,  1.32it/s]


In [31]:
# Fiction -- possibly meaningless because there are so few fiction contexts?
# Same pipeline as for the fact contexts: fit, bin the topics into 30 time
# slices, then save and display the figure.
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(contexts_fict)
topics_over_time = topic_model.topics_over_time(
    docs=contexts_fict,
    timestamps=date_fict,
    datetime_format="%Y-%m-%d",
    nr_bins=30,
)
fig2 = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=20,
    title="Topics over time in Fiction words found in Royal Society Transactions",
)
fig2.write_html('fiction_topics_over_time_30.html')

fig2.show()

2024-09-13 12:12:10,790 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2024-09-13 12:12:13,354 - BERTopic - Embedding - Completed ✓
2024-09-13 12:12:13,356 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-13 12:12:15,371 - BERTopic - Dimensionality - Completed ✓
2024-09-13 12:12:15,371 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-13 12:12:15,377 - BERTopic - Cluster - Completed ✓
2024-09-13 12:12:15,380 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-13 12:12:15,439 - BERTopic - Representation - Completed ✓
24it [00:00, 78.19it/s]


THE TOPICS OF ALL THE FULL ROYAL SOCIETY ARTICLES?

In [32]:
#datetime
#maybe switch to int for dates

import pandas as pd
from pathlib import Path
from bertopic import BERTopic

# Journal key -> (directory holding the article .txt files, path of the metadata CSV).
# NOTE(review): these are absolute paths on a local D: drive, so the cell only
# runs on a machine with the same layout.
folders = {
    'rstb': (
        Path('D:/Fact_fiction_corpus/texts/royal society/txt_rstb'),
        'D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstb.csv',
    ),
    'rstl': (
        Path('D:/Fact_fiction_corpus/texts/royal society/txt_rstl'),
        'D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstl.csv',
    ),
}

def load_metadata(metadata_path):
    """Read a metadata CSV into a DataFrame.

    Parameters
    ----------
    metadata_path : str or path-like or file-like
        Location of the CSV; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with pandas' default parsing options.
    """
    metadata_table = pd.read_csv(metadata_path)
    return metadata_table

def load_texts_and_dates(folder_path, metadata_df):
    """Read every ``*.txt`` file in a folder and pair it with its metadata date.

    A text file is matched to the metadata row whose ``filename`` equals the
    text file's name with the extension swapped to ``.pdf``. Files without a
    matching row are reported on stdout and skipped.

    Parameters
    ----------
    folder_path : pathlib.Path
        Directory searched (non-recursively) for ``*.txt`` files.
    metadata_df : pandas.DataFrame
        Table with at least a ``filename`` and a ``date`` column.

    Returns
    -------
    tuple[list, list]
        ``(texts, dates)``: two parallel lists holding the file contents and
        the raw (unparsed) ``date`` value of the matching metadata row.
        Entries follow the sorted order of the file paths.
    """
    # Build the filename -> date lookup once instead of scanning the whole
    # frame for every file. setdefault keeps the FIRST row for a duplicated
    # filename, which is what taking element 0 of the filtered values did.
    date_by_pdf = {}
    for pdf_name, pdf_date in zip(metadata_df['filename'], metadata_df['date']):
        date_by_pdf.setdefault(pdf_name, pdf_date)

    texts = []
    dates = []
    # glob() yields files in an OS-dependent order; sorting makes the document
    # order (and therefore the model input) stable between runs and machines.
    for txt_file in sorted(folder_path.glob('*.txt')):
        filename = txt_file.name
        # Swap only the final extension; str.replace would also rewrite a
        # '.txt' that happens to occur in the middle of the name.
        pdf_name = txt_file.with_suffix('.pdf').name
        if pdf_name in date_by_pdf:
            with open(txt_file, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            dates.append(date_by_pdf[pdf_name])
            print(f"Processed file: {filename}")
        else:
            print(f"Date not found for file: {filename}")
    return texts, dates

# Accumulate the documents and their raw metadata dates across every
# configured journal folder, reporting how many files each folder contributed.
all_texts, all_dates = [], []
for key, (folder_path, metadata_path) in folders.items():
    metadata_df = load_metadata(metadata_path)
    texts, dates = load_texts_and_dates(folder_path, metadata_df)
    all_texts += texts
    all_dates += dates
    print(f"Processed {len(texts)} files from {folder_path}")

def safe_to_datetime(date_list, format_str):
    """Parse each entry of ``date_list`` with ``format_str``, tolerating bad values.

    Parameters
    ----------
    date_list : iterable
        Raw date values, parsed one at a time with ``pandas.to_datetime``.
    format_str : str
        ``strftime``-style format handed to ``pandas.to_datetime``.

    Returns
    -------
    list
        One element per input, in input order: the parsed value, or
        ``pandas.NaT`` when parsing raised ``OutOfBoundsDatetime`` or
        ``ValueError`` (the offending value is printed in that case).
    """
    def _parse_or_nat(raw_date):
        # Convert a single value; report and fall back to NaT on failure.
        try:
            return pd.to_datetime(raw_date, format=format_str)
        except (pd.errors.OutOfBoundsDatetime, ValueError):
            print(f"Invalid date encountered: {raw_date}")
            return pd.NaT

    return [_parse_or_nat(raw_date) for raw_date in date_list]

# Parse the raw metadata dates (stored as '%Y/%m/%d'); values that cannot be
# parsed come back from safe_to_datetime as NaT.
all_dates = safe_to_datetime(all_dates, '%Y/%m/%d')
# Re-serialise as '%Y-%m-%d' strings. NaT entries get the placeholder
# '1680-01-01' because certain dates are incompatible with the datetime type
# (presumably its representable range -- NOTE(review): this silently re-dates
# those documents).
all_dates = [
    '1680-01-01' if pd.isna(date) else date.strftime('%Y-%m-%d')
    for date in all_dates
]

# Perform topic modeling
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(all_texts)
# nr_bins: 20 works ok; the dataset might have to be split into 2 and plotted
# separately (because of the big gap after 1950).
topics_over_time = topic_model.topics_over_time(
    docs=all_texts,
    timestamps=all_dates,
    datetime_format="%Y-%m-%d",
    nr_bins=30,
)
fig3 = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=20,
    title="Topics over time in Fact and Fiction texts",
)
fig3.write_html('full_rs_topics_over_time_dates_30.html')

fig3.show()

Processed file: rstb1990_355_1397_549.txt
Processed file: rstb_1887_0001.txt
Processed file: rstb_1887_0007.txt
Processed file: rstb_1887_0013.txt
Processed file: rstb_1887_0008.txt
Processed file: rstb_1887_0006.txt
Processed file: rstb_1887_0012.txt
Processed file: rstb_1887_0003.txt
Processed file: rstb_1887_0015.txt
Processed file: rstb_1887_0009.txt
Processed file: rstb_1887_0005.txt
Processed file: rstb_1887_0011.txt
Processed file: rstb_1887_0010.txt
Processed file: rstb_1887_0002.txt
Processed file: rstb_1887_0016.txt
Processed file: rstb_1888_0002.txt
Processed file: rstb_1887_0018.txt
Processed file: rstb_1888_0005.txt
Processed file: rstb_1887_0004.txt
Processed file: rstb_1888_0008.txt
Processed file: rstb_1888_0007.txt
Processed file: rstb_1888_0006.txt
Processed file: rstb_1888_0004.txt
Processed file: rstb_1887_0017.txt
Processed file: rstb_1888_0014.txt
Processed file: rstb_1888_0001.txt
Processed file: rstb_1888_0013.txt
Processed file: rstb_1888_0011.txt
Processed fil

2024-09-13 12:15:05,456 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/432 [00:00<?, ?it/s]

2024-09-13 12:18:32,077 - BERTopic - Embedding - Completed ✓
2024-09-13 12:18:32,077 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-13 12:18:40,101 - BERTopic - Dimensionality - Completed ✓
2024-09-13 12:18:40,104 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-13 12:18:41,040 - BERTopic - Cluster - Completed ✓
2024-09-13 12:18:41,055 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-13 12:20:04,434 - BERTopic - Representation - Completed ✓
30it [13:49, 27.66s/it]


In [33]:
import pandas as pd
from pathlib import Path
from bertopic import BERTopic

# Maps each journal key to a pair: the directory with the article .txt files
# and the path of that journal's metadata CSV.
# NOTE(review): absolute paths on a local D: drive -- only valid on a machine
# with the same directory layout.
folders = {
    'rstb': (
        Path('D:/Fact_fiction_corpus/texts/royal society/txt_rstb'),
        'D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstb.csv',
    ),
    'rstl': (
        Path('D:/Fact_fiction_corpus/texts/royal society/txt_rstl'),
        'D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstl.csv',
    ),
}

def load_metadata(metadata_path):
    """Load the metadata CSV at ``metadata_path`` and return it as a DataFrame.

    The path (or file-like object) is handed unchanged to ``pandas.read_csv``
    with default options.
    """
    metadata_frame = pd.read_csv(metadata_path)
    return metadata_frame

def load_texts_and_dates(folder_path, metadata_df):
    """Read every ``*.txt`` file in a folder and pair it with its publication year.

    A text file is matched to the metadata row whose ``filename`` equals the
    text file's name with the extension swapped to ``.pdf``. The row's
    ``date`` (expected as ``%Y/%m/%d``) is reduced to a yearly
    ``pandas.Period``. Files with no metadata row, or with a date that
    cannot be parsed, are reported on stdout and skipped.

    Parameters
    ----------
    folder_path : pathlib.Path
        Directory searched (non-recursively) for ``*.txt`` files.
    metadata_df : pandas.DataFrame
        Table with at least a ``filename`` and a ``date`` column.

    Returns
    -------
    tuple[list, list]
        ``(texts, periods)``: two parallel lists holding the file contents
        and the yearly ``pandas.Period`` of each file, in sorted path order.
    """
    # Build the filename -> date lookup once instead of scanning the whole
    # frame for every file. setdefault keeps the FIRST row for a duplicated
    # filename, which is what taking element 0 of the filtered values did.
    date_by_pdf = {}
    for pdf_name, pdf_date in zip(metadata_df['filename'], metadata_df['date']):
        date_by_pdf.setdefault(pdf_name, pdf_date)

    texts = []
    periods = []
    # glob() yields files in an OS-dependent order; sorting makes the document
    # order (and therefore the model input) stable between runs and machines.
    for txt_file in sorted(folder_path.glob('*.txt')):
        filename = txt_file.name
        # Swap only the final extension; str.replace would also rewrite a
        # '.txt' that happens to occur in the middle of the name.
        pdf_name = txt_file.with_suffix('.pdf').name
        if pdf_name not in date_by_pdf:
            print(f"Date not found for file: {filename}")
            continue

        # Only the date conversion is guarded: a decoding error while reading
        # the file is also a ValueError and must not be reported as a bad date.
        try:
            period = pd.Period(pd.to_datetime(date_by_pdf[pdf_name], format='%Y/%m/%d').strftime('%Y'), freq='Y')
        except (pd.errors.OutOfBoundsDatetime, ValueError):
            # Dates that pandas cannot represent as a timestamp end up here,
            # so those files are left out of the corpus.
            print(f"Invalid date for file: {filename}")
            continue

        with open(txt_file, 'r', encoding='utf-8') as f:
            texts.append(f.read())
        periods.append(period)
        print(f"Processed file: {filename}")
    return texts, periods

# Accumulate the documents and their yearly periods across every configured
# journal folder, reporting how many files each folder contributed.
all_texts = []
all_periods = []
for key, (folder_path, metadata_path) in folders.items():
    metadata_df = load_metadata(metadata_path)
    texts, periods = load_texts_and_dates(folder_path, metadata_df)
    all_texts.extend(texts)
    all_periods.extend(periods)
    print(f"Processed {len(texts)} files from {folder_path}")

topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(all_texts)

# topics_over_time is called with datetime_format="%Y-%m-%d", so each yearly
# Period is rendered as a full date string in that format. (An earlier
# year-only rendering was overwritten before use and has been removed.)
period_strings = [period.strftime('%Y-%m-%d') for period in all_periods]

# Perform topic modeling over time
topics_over_time = topic_model.topics_over_time(all_texts, period_strings, datetime_format="%Y-%m-%d", nr_bins=30)
fig4 = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20, title="Topics over time in Fact and Fiction texts")
fig4.write_html('full_rs_topics_over_time_periods_30.html')

fig4.show()

Processed file: rstb1990_355_1397_549.txt
Processed file: rstb_1887_0001.txt
Processed file: rstb_1887_0007.txt
Processed file: rstb_1887_0013.txt
Processed file: rstb_1887_0008.txt
Processed file: rstb_1887_0006.txt
Processed file: rstb_1887_0012.txt
Processed file: rstb_1887_0003.txt
Processed file: rstb_1887_0015.txt
Processed file: rstb_1887_0009.txt
Processed file: rstb_1887_0005.txt
Processed file: rstb_1887_0011.txt
Processed file: rstb_1887_0010.txt
Processed file: rstb_1887_0002.txt
Processed file: rstb_1887_0016.txt
Processed file: rstb_1888_0002.txt
Processed file: rstb_1887_0018.txt
Processed file: rstb_1888_0005.txt
Processed file: rstb_1887_0004.txt
Processed file: rstb_1888_0008.txt
Processed file: rstb_1888_0007.txt
Processed file: rstb_1888_0006.txt
Processed file: rstb_1888_0004.txt
Processed file: rstb_1887_0017.txt
Processed file: rstb_1888_0014.txt
Processed file: rstb_1888_0001.txt
Processed file: rstb_1888_0013.txt
Processed file: rstb_1888_0011.txt
Processed fil

2024-09-13 12:35:11,503 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/405 [00:00<?, ?it/s]

2024-09-13 12:39:49,777 - BERTopic - Embedding - Completed ✓
2024-09-13 12:39:49,780 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-13 12:39:57,244 - BERTopic - Dimensionality - Completed ✓
2024-09-13 12:39:57,248 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-09-13 12:39:57,855 - BERTopic - Cluster - Completed ✓
2024-09-13 12:39:57,868 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-09-13 12:41:23,534 - BERTopic - Representation - Completed ✓
30it [15:55, 31.86s/it]
