In [4]:
# author: fabio

In [3]:
from pathlib import Path
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import plotly.io as pio
# Make plotly's "browser" renderer the default for every figure shown below
# (per the plotly docs, figures then open in the system web browser rather
# than being rendered inline in the notebook).
pio.renderers.default = "browser"

# Load and wrangle the data

In [8]:
# Location of the lemmatized tweet corpus.
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_lemma.csv.gzip"

# The file ends in ".gzip", which is not a suffix pandas infers a codec from,
# so the compression is stated explicitly. Tweet ids are read as strings
# rather than parsed as numbers. Rows without a lemmatized text are dropped
# straight away because the topic model is fitted on that column.
tweets = pd.read_csv(
    Path(src) / fname,
    compression="gzip",
    dtype={"id": str},
).dropna(subset=["lemmatized"])

# Preview the first three rows.
tweets.head(3)

Unnamed: 0,id,author_id,lemmatized,party,avg_belief_score,avg_truth_score,classes_quant
0,1608863852950880256,211530910,two of my provision be include in the # ndaa ....,Republican,0.261409,0.327746,rn
1,1608516681701244929,211530910,# 2022RECAP : do we want the world fuel by Ame...,Republican,0.709661,0.649962,rb
2,1608499794997116928,211530910,I vote for legislation that will spur innovati...,Republican,0.683822,0.67294,rt


# Fit the model

In [None]:
# Documents to cluster: one lemmatized tweet per entry. The list is bound to
# `docs` because the per-class and topics-over-time cells further down refer
# to the corpus by that name.
docs = tweets["lemmatized"].to_list()

# Increase `n_neighbors` and/or `min_topic_size` if too many topics are found.
# NOTE(review): no `random_state` is passed to UMAP, so repeated fits are not
# guaranteed to yield identical topics -- confirm whether a fixed seed is wanted.
umap_model = UMAP(
    n_neighbors=150,
    n_components=5,
    metric="cosine",
    low_memory=False,
)

# Ignore vocabulary terms that appear in fewer than 50 documents.
vectorizer_model = CountVectorizer(min_df=50)

# Each topic is described by its 10 top words (`top_n_words=10`); cells that
# read words back with `get_topic` cannot obtain more than that per topic.
topic_model = BERTopic(
    verbose=True,
    nr_topics="auto",
    min_topic_size=200,
    umap_model=umap_model,
    top_n_words=10,
    vectorizer_model=vectorizer_model,
    language="english",
)

# `topics` holds one topic id per document, in the same order as `docs`.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Persist the fitted model next to the tweet data so that the inspection
# cells can reload it without refitting.
dst = "../../data/tweets"
fname = "BERTopic_model"
model_path = Path(dst) / fname
topic_model.save(model_path)

# Export topic data

In [None]:
# Create a table with one row per document: tweet id, party and the topic the
# model assigned to it.
df_all_docs = pd.DataFrame({
    "id": tweets["id"],
    "party": tweets["party"],
    "topic": topics,
})

# Save for plotting belief-speaking similarity and truth-seeking
# similarity in supplementary figure 5.
dst = "../../data/tweets"
fname = "topics_all_docs.csv.gzip"
df_all_docs.to_csv(
    Path(dst, fname),
    compression="gzip",
    index=False,
)

# Rebuild the document list from the same column the model was fitted on, so
# this cell does not depend on a `docs` variable left over from another cell.
docs = tweets["lemmatized"].to_list()

# Topic representations per class; the class labels come from the
# `classes_quant` column and are passed as a plain list aligned with `docs`.
topics_per_class = topic_model.topics_per_class(
    docs=docs,
    topics=topics,
    classes=tweets["classes_quant"].to_list(),
    global_tuning=True,
)
fname = "topics_per_class_ddr.csv"
topics_per_class.to_csv(Path(dst, fname), index=False)

In [None]:
# Export the model's topic overview table (one row per topic found) so the
# number of topics can be checked outside the notebook.
dst = "../../data/tweets"
fname = "topics_info.csv"
topic_info = pd.DataFrame(topic_model.get_topic_info())
info_path = Path(dst) / fname
topic_info.to_csv(info_path, index=False)

# Inspect the model

In [None]:
# Reload the fitted model from the path the "save the model" cell wrote to.
# NOTE(review): BERTopic's default save format is pickle-based as far as the
# docs describe it -- only load model files from a trusted source.
src = "../../data/tweets"
fname = "BERTopic_model"
model_path = Path(src) / fname
topic_model = BERTopic.load(model_path)

In [None]:
# Visualize the topics. The call returns a figure object `x`; call
# `x.write_html(path)` on it to save the graph to disk.
topic_model.visualize_topics()

In [None]:
# Heatmap of the similarity between topics. Per the BERTopic docs,
# `n_clusters=20` groups the topics into 20 clusters and orders the matrix by
# those clusters.
topic_model.visualize_heatmap(n_clusters=20)

In [None]:
# Dynamic Topic Modelling (topics over time).
# Use the same documents the model was fitted on, taken directly from
# `tweets`, so this cell does not depend on a `docs` variable from elsewhere.
docs = tweets["lemmatized"].to_list()

# One timestamp per document is required, aligned with `docs`.
# NOTE(review): the preview of `tweets` earlier in this notebook shows no
# `created_at` column -- confirm where the tweet timestamps come from. The
# check runs before `transform` so a missing column fails fast instead of
# after the expensive re-assignment of every document.
if "created_at" not in tweets.columns:
    raise KeyError(
        "tweets has no 'created_at' column; add the tweet timestamps to "
        "`tweets` before computing topics over time"
    )
timestamps = tweets["created_at"].to_list()

# Assign every document to a topic with the loaded model, then bin the
# timestamps into 20 intervals and compute the topic representation per bin.
topics, probs = topic_model.transform(docs)
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps, nr_bins=20)

In [None]:
# Plot the per-class topic table for up to 100 topics (`top_n_topics=100`).
# `topics_per_class` is created in the "Export topic data" cell and is not
# recreated when the model is reloaded from disk -- run that cell first.
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=100)

# Prepare data to create the scattertext plot

In [None]:
# Ids of the topics to export; fill this list in by hand before running the
# cells below.
topics_ids: list = []  # insert topic id(s)

# Accumulators filled by the next cell: one inner list per entry of
# `topics_ids`.
topics_words, topic_embeddings = [], []

In [None]:
# Maximum number of words to export per topic. A topic never holds more words
# than the `top_n_words` the model was built with (10 in the fit cell of this
# notebook), so the entries are sliced instead of indexed: asking for more
# words than exist yields the available ones rather than an IndexError.
n_words = 20

# NOTE: this cell appends to `topics_words` / `topic_embeddings`; re-run the
# cell that initialises those lists before executing this one a second time.
for topic_id in topics_ids:
    # `get_topic` yields (word, value) pairs for the topic; fetch them once.
    # NOTE(review): per the BERTopic docs the value is the word's c-TF-IDF
    # score, not an embedding, despite the name of the target list -- confirm.
    top_terms = topic_model.get_topic(topic_id)[:n_words]
    topics_words.append([term for term, _ in top_terms])
    topic_embeddings.append([value for _, value in top_terms])


In [None]:
# One row per selected topic: its id, the list of its top words and the list
# of the values collected alongside those words.
wordcloud_df = pd.DataFrame(
    {
        "topic_ids": topics_ids,
        "topic_words": topics_words,
        "topic_embeddings": topic_embeddings,
    }
)

In [None]:
dst = "../../data/tweets"
fname = "key_topics.csv"

# Go from one row per topic to one row per (topic, word): with the topic id
# as index, every list-valued column is exploded element-wise, then the topic
# id is turned back into a regular column.
wordcloud_df = (
    wordcloud_df
    .set_index(["topic_ids"])
    .apply(pd.Series.explode)
    .reset_index()
)

# `to_csv` is called with its defaults, so the row index is written as the
# first, unnamed column of the file.
wordcloud_df.to_csv(Path(dst) / fname)