<a href="https://colab.research.google.com/github/MariaFernandaOrtega/Topic-Modelling-Tutorial/blob/main/Bertopic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install bertopic
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m92.2/154.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/

In [2]:
# Try to import BERTopic
from bertopic import BERTopic

In [4]:
# Install older version of joblib
!pip install --upgrade joblib==1.1.0



In [20]:
# Data processing
import pandas as pd
import numpy as np
import ast

# Text preprocessing
import nltk
# Download the NLTK data packages 'stopwords', 'omw-1.4' and 'wordnet'
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
# WordNet lemmatizer instance; it is not referenced again in the cells visible here
wn = nltk.WordNetLemmatizer()

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap import UMAP

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [121]:
# Read the preprocessed congress CSV and keep three columns:
# the text ('processed_text_v2') plus the 'congress' and 'period' labels.
df = (
    pd.read_csv("processed_data_congress.csv")
    [['processed_text_v2', 'congress', 'period']]
)


In [122]:
# Words that will be stripped from the token lists in 'processed_text_v2'
words_to_remove = {
    'mr',
    'think',
    'thank',
    'would',
    'know',
    'senator',
    'chairman',
    'also',
    'question',
    'country',
    'state',
}

# Helper that turns one stringified word list into a filtered, space-joined string
def process_text_list(text_list_str, words_to_remove):
    """Parse a stringified list of words and return the kept words as one string.

    Args:
        text_list_str: String containing a Python list literal of words,
            e.g. "['a', 'b']"; it is parsed with ``ast.literal_eval``.
        words_to_remove: Collection of words to drop (membership is tested
            with ``in``).

    Returns:
        The words that are not in ``words_to_remove``, in their original
        order, joined by single spaces.
    """
    kept_words = []
    for candidate in ast.literal_eval(text_list_str):
        if candidate in words_to_remove:
            continue
        kept_words.append(candidate)
    return ' '.join(kept_words)

# Filter every entry of 'processed_text_v2' and write the result back to the same column.
# Note: the column is overwritten with plain space-joined strings, so running this
# cell a second time would hand process_text_list text that is no longer a list literal.
df['processed_text_v2'] = df['processed_text_v2'].apply(
    process_text_list, args=(words_to_remove,)
)

# Preview the first rows of the filtered column
df['processed_text_v2'].head()

0    opening statement hon joe wilson u house south...
1    opening statement joe wilson u house south car...
2    opening statement joe wilson u house south car...
3    opening statement hon joe wilson u house south...
4    opening statement hon steve cohen ranking memb...
Name: processed_text_v2, dtype: object

In [123]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re

# Download the NLTK 'punkt' data package
# (presumably the tokenizer data that word_tokenize relies on below - confirm)
nltk.download('punkt')



# Normalise a few word variants and drop purely numeric tokens
def lemmatize_and_remove_numbers(text):
    """Tokenize ``text``, canonicalise a few words and discard all-digit tokens.

    Tokens whose lowercase form is 'europe' or 'european' are replaced by
    'Europe'; those whose lowercase form is 'russia' or 'russian' are replaced
    by 'Russia'. Every other token is kept unchanged unless it consists
    entirely of digits, in which case it is dropped.

    Args:
        text: The string to process; it is split with ``word_tokenize``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Lowercased variant -> canonical spelling
    canonical_forms = {
        'europe': 'Europe',
        'european': 'Europe',
        'russia': 'Russia',
        'russian': 'Russia',
    }

    kept_tokens = []
    for raw_token in word_tokenize(text):
        canonical = canonical_forms.get(raw_token.lower())
        if canonical is not None:
            kept_tokens.append(canonical)
        elif re.fullmatch(r'\d+', raw_token) is None:
            # Not an all-digit token, so keep it as is
            kept_tokens.append(raw_token)

    return ' '.join(kept_tokens)

# Run lemmatize_and_remove_numbers over every entry of 'processed_text_v2'
# and store the result back in the same column
df['processed_text_v2'] = (
    df['processed_text_v2'].apply(lemmatize_and_remove_numbers)
)

# Display the updated dataframe (nothing is written to disk in this cell)
df


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,processed_text_v2,congress,period
0,opening statement hon joe wilson u house south...,joint,2023-2024
1,opening statement joe wilson u house south car...,joint,2023-2024
2,opening statement joe wilson u house south car...,joint,2023-2024
3,opening statement hon joe wilson u house south...,joint,2023-2024
4,opening statement hon steve cohen ranking memb...,joint,2023-2024
5,opening statement hon jeanne shaheen u new ham...,senate,2023-2024
6,opening statement hon robert menendez u new je...,senate,2023-2024
7,opening statement hon benjamin l cardin u mary...,senate,2023-2024
8,opening statement hon robert menendez u new je...,senate,2023-2024
9,opening statement hon robert menendez u new je...,senate,2023-2024


Senate (2023-2024)

In [124]:
# Select every row whose 'congress' value is 'senate'.
# No period filter is applied: the extra condition
# `& (df['period'] == '2023-2024')` was commented out, so all periods are kept.
senate_subset = df.loc[df['congress'].eq('senate')]

senate_subset

Unnamed: 0,processed_text_v2,congress,period
5,opening statement hon jeanne shaheen u new ham...,senate,2023-2024
6,opening statement hon robert menendez u new je...,senate,2023-2024
7,opening statement hon benjamin l cardin u mary...,senate,2023-2024
8,opening statement hon robert menendez u new je...,senate,2023-2024
9,opening statement hon robert menendez u new je...,senate,2023-2024
10,opening statement hon robert menendez u new je...,senate,2023-2024
11,opening statement hon robert menendez u new je...,senate,2023-2024
12,opening statement hon robert menendez u new je...,senate,2023-2024
13,opening statement hon robert menendez u new je...,senate,2023-2024
14,opening statement hon robert menendez u new je...,senate,2023-2024


In [125]:
# UMAP instance handed to BERTopic for the Senate subset
umap_model = UMAP(
    n_neighbors=5,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=100,  # fixed seed
)

# BERTopic model that uses the UMAP instance above
topic_model = BERTopic(
    umap_model=umap_model,
    language="english",
    calculate_probabilities=True,
)

# Fit on the Senate texts and unpack the two values fit_transform returns
topics, probabilities = topic_model.fit_transform(
    senate_subset['processed_text_v2']
)

In [126]:
# Get the list of topics found by the Senate model (`topic_model`)
# NOTE(review): the recorded output has a single row, Topic -1 with Count 19,
# i.e. no topic other than -1 was reported for this run. Presumably the subset
# is too small for the model's default settings - confirm.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19,-1_russia_ukraine_support_united,"[russia, ukraine, support, united, europe, sec...",[opening statement hon robert menendez u new j...


House 2023-2024

In [127]:
# Select every row whose 'congress' value is 'house'.
# No period filter is applied: the extra condition
# `& (df['period'] == '2023-2024')` was commented out, so all periods are kept.
house_subset = df.loc[df['congress'].eq('house')]

house_subset

Unnamed: 0,processed_text_v2,congress,period
15,committee met pursuant notice room house visit...,house,2023-2024
16,committee met pursuant notice room house visit...,house,2023-2024
17,subcommittee met pursuant notice pm room house...,house,2023-2024
18,committee met pursuant notice room house visit...,house,2023-2024
19,subcommittee met pursuant notice pm room house...,house,2023-2024
20,subcommittee met pursuant notice pm room house...,house,2023-2024
21,opening statement hon joe wilson u house south...,house,2023-2024
22,committee met pursuant notice room house visit...,house,2023-2024
23,opening statement hon ben cardin cochairman u ...,house,2023-2024
24,committee met pursuant notice room house visit...,house,2023-2024


In [128]:
# UMAP instance handed to BERTopic for the House subset
# (n_neighbors is 15 here, unlike the Senate cell)
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=100,  # fixed seed
)

# BERTopic model that uses the UMAP instance above
topic_model_2 = BERTopic(
    umap_model=umap_model,
    language="english",
    calculate_probabilities=True,
)

# Fit on the House texts and unpack the two values fit_transform returns
topics, probabilities = topic_model_2.fit_transform(
    house_subset['processed_text_v2']
)

In [129]:
# Get the list of topics found by the House model (`topic_model_2`)
# NOTE(review): the recorded output has a single row, Topic -1 with Count 26,
# i.e. no topic other than -1 was reported for this run. Presumably the subset
# is too small for the model's default settings - confirm.
topic_model_2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,26,-1_russia_ukraine_people_war,"[russia, ukraine, people, war, right, one, eur...",[subcommittee met pursuant notice pm via webex...


Joint 2023-2024

In [100]:
# Select every row whose 'congress' value is 'joint'.
# No period filter is applied: the extra condition
# `& (df['period'] == '2023-2024')` was commented out, so all periods are kept.
joint_subset = df.loc[df['congress'].eq('joint')]

joint_subset

Unnamed: 0,processed_text_v2,congress,period
0,opening statement hon joe wilson u house south...,joint,2023-2024
1,opening statement joe wilson u house south car...,joint,2023-2024
2,opening statement joe wilson u house south car...,joint,2023-2024
3,opening statement hon joe wilson u house south...,joint,2023-2024
4,opening statement hon steve cohen ranking memb...,joint,2023-2024
25,opening statement hon ben cardin u senate mary...,joint,2021-2022
26,opening statement hon ben cardin u maryland ca...,joint,2021-2022


In [107]:
# Initiate UMAP for the joint subset (n_neighbors is 2 in this cell)
umap_model = UMAP(n_neighbors=2,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

# Initiate BERTopic with the UMAP instance above
topic_model_3 = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)

# Run BERTopic model on the joint texts
# NOTE(review): the recorded output of this cell is `ValueError: ignored`, with
# no message shown. The joint subset displayed earlier has only 7 rows, so the
# failure is presumably caused by the very small number of documents - confirm
# by re-running and reading the full traceback.
topics, probabilities = topic_model_3.fit_transform(joint_subset['processed_text_v2'])

ValueError: ignored

In [57]:
# Visualize top topic keywords as a bar chart for `topic_model`, requesting up to 12 topics
# NOTE(review): this cell's execution count (57) is lower than that of the cell
# that fits `topic_model` (125), so it was last run against an earlier state of
# the kernel. The recorded topic info for `topic_model` lists only Topic -1, so
# confirm that this chart has anything to show when the notebook is run in order.
topic_model.visualize_barchart(top_n_topics=12)