2. Fine-tuning BERTopic

In [70]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP


In [58]:
# Load the pre-processed congressional hearing transcripts.
# The file location can be overridden with the CONGRESS_DATA_PATH environment
# variable so the notebook is not tied to a single machine; the original
# local path is kept as the fallback.
data_path = os.environ.get(
    "CONGRESS_DATA_PATH",
    "/Users/gresasmolica/Downloads/processed_data_congress.csv",
)
df = pd.read_csv(data_path)

# Keep the text column together with the 'congress' and 'period' columns;
# 'congress' is used further down to split the corpus by chamber.
df = df[['processed_text_v2', 'congress', 'period']]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   processed_text_v2  52 non-null     object
 1   congress           52 non-null     object
 2   period             52 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


In [59]:
# UMAP configuration handed to BERTopic for reducing the document embeddings.
# random_state is pinned so that re-running the cell is repeatable.
umap_model = UMAP(
    n_neighbors=10,
    n_components=8,
    metric='cosine',
    min_dist=0.0,
    random_state=42
)

# Topic model for the full corpus: uni- and bi-gram terms, 10 words per topic.
topic_model = BERTopic(
    language="english",
    umap_model=umap_model,
    calculate_probabilities=True,
    min_topic_size=10,
    nr_topics=5,
    top_n_words=10,
    n_gram_range=(1, 2)
)

# BERTopic documents its input as a list of strings, so pass a plain list
# instead of the pandas Series.
documents = df['processed_text_v2'].tolist()
topics, probabilities = topic_model.fit_transform(documents)

In [60]:
# Summary table of the fitted topics: one row per topic with its id, document
# count, generated name, representation terms and representative documents.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,11,0_mr_thank_think_know,"[mr, thank, think, know, nato, would, russia, ...","[['subcommittee', 'met', 'pursuant', 'notice',..."
1,1,13,1_mr_ukraine_thank_chairman,"[mr, ukraine, thank, chairman, would, think, k...","[['committee', 'met', 'pursuant', 'notice', '1..."
2,2,28,2_state_ukraine_russia_senator,"[state, ukraine, russia, senator, would, count...","[['opening', 'statement', 'hon', 'robert', 'me..."


In [61]:
# Bar charts for the fitted model's topics, limited to at most 5 topics.
topic_model.visualize_barchart(top_n_topics=5)

In [62]:
# BERTopic's term-rank visualization for the model fitted on the full corpus.
topic_model.visualize_term_rank()

In [63]:
# BERTopic's topic-hierarchy visualization for the model fitted on the full corpus.
topic_model.visualize_hierarchy()

In [64]:
# Partition the hearings by the value of the 'congress' column (house, senate,
# joint) so that topics can be searched for in each subset separately.

df_house = df.loc[df['congress'].eq('house')]
df_senate = df.loc[df['congress'].eq('senate')]
df_joint = df.loc[df['congress'].eq('joint')]

In [68]:
# Define a function for topic modeling
def perform_topic_modeling(data, title, min_topic_size=10, nr_topics=5):
    """Fit a BERTopic model on one subset of the corpus and print its topics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'processed_text_v2' column with the documents.
    title : str
        Label used in the printed header (e.g. "Senate").
    min_topic_size : int, default 10
        Forwarded to BERTopic. Exposed so it can be lowered for small
        subsets; the default matches the previously hard-coded value.
    nr_topics : int, default 5
        Forwarded to BERTopic; the default matches the previously
        hard-coded value.

    Returns
    -------
    BERTopic
        The fitted model, so the caller can inspect or visualize it after
        the summary has been printed.

    Raises
    ------
    ValueError
        If ``data`` contains no rows.
    """
    if len(data) == 0:
        raise ValueError(f"No documents to model for {title} sessions.")

    # Same UMAP settings as the full-corpus model; the seed is pinned so
    # repeated calls on the same subset are repeatable.
    umap_model = UMAP(
        n_neighbors=10,
        n_components=8,
        metric='cosine',
        min_dist=0.0,
        random_state=42
    )

    topic_model = BERTopic(
        language="english",
        umap_model=umap_model,
        calculate_probabilities=True,
        min_topic_size=min_topic_size,
        nr_topics=nr_topics,
        top_n_words=10,
        n_gram_range=(1, 2)
    )

    # BERTopic documents its input as a list of strings. Converting here also
    # drops the non-contiguous index that a filtered DataFrame carries.
    documents = data['processed_text_v2'].tolist()
    topic_model.fit_transform(documents)

    # Print or visualize results as needed
    print(f"Topics for {title} sessions:")
    print(topic_model.get_topic_info())

    return topic_model

# Perform topic modeling for each session type.
# The fitted models are bound to names so they stay available for later
# inspection and the cell does not echo a model repr as its output.
senate_topic_model = perform_topic_modeling(df_senate, "Senate")
house_topic_model = perform_topic_modeling(df_house, "House")
# perform_topic_modeling(df_joint, "Joint")

Topics for Senate sessions:
   Topic  Count                             Name  \
0     -1     19  -1_state_russia_ukraine_senator   

                                      Representation  \
0  [state, russia, ukraine, senator, country, que...   

                                 Representative_Docs  
0  [['opening', 'statement', 'hon', 'robert', 'me...  
Topics for House sessions:
   Topic  Count                       Name  \
0     -1     26  -1_mr_thank_think_ukraine   

                                      Representation  \
0  [mr, thank, think, ukraine, know, state, would...   

                                 Representative_Docs  
0  [['committee', 'met', 'pursuant', 'notice', '1...  
