In [None]:
# author: Fabio Carrella

Setting the environment:

In [3]:
from bertopic import BERTopic
import pandas as pd
import re
import numpy
from os.path import join
from umap import UMAP
from collections import Counter
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "browser"
from sklearn.feature_extraction.text import CountVectorizer

Importing the dataset:

In [6]:
# Folder and file name of the gzip-compressed CSV holding the lemmatized tweets.
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_lemmatized_text.csv.gzip"

# Read the whole file first ...
data = pd.read_csv(
    join(src, fname),
    encoding="utf-8",
    compression="gzip",
)
# ... then discard every row that has a missing value in any column.
data = data.dropna()

# Quick look at the first two rows.
data.head(2)

Unnamed: 0,id,text,tweet_id,author_id,conversation_id,created_at,retweet_count,reply_count,like_count,quote_count,...,understanding_terms,understanding_count.x,word_count,max_label,belief_label,truth_label,understanding_label,party,component,classes
0,1,# IXPE be up and away ! I can not wait to see ...,"""1468826484668305411""",20597460,"""1468826484668305411""",2021-12-09T06:14:58.000Z,24.0,14.0,216.0,1.0,...,['uncover'],1,27,understanding,0,0,1,Democrat,other,do
1,2,.@NASA mourn the loss of a beloved member of o...,"""1468326655789776906""",20597460,"""1468326655789776906""",2021-12-07T21:08:49.000Z,129.0,27.0,900.0,20.0,...,['realized'],1,40,understanding,0,0,1,Democrat,other,do


Cleaning the data and creating a list of docs for the model:

In [7]:
# Strip URLs and lower-case every tweet. The vectorized `.str` methods do the
# same per-value work as a row-wise `DataFrame.apply`, but without building a
# Series object for every row, which matters on a dataset of this size.
data["text"] = (
    data["text"]
    .astype(str)                               # same effect as str(row.text)
    .str.replace(r"http\S+", "", regex=True)   # drop anything that looks like a URL
    .str.lower()
)

# Plain Python list of cleaned tweets, in the row order of `data`; this is the
# input the topic model is fitted on.
docs = data["text"].to_list()

Fitting the model (increase `n_neighbors` and/or `min_topic_size` if too many topics are found):

In [None]:
# Fixed seed for the dimensionality reduction so that repeated runs of this
# cell see the same UMAP projection. Without it UMAP is stochastic and the
# topics can change from run to run.
# NOTE(review): a fixed random_state can make UMAP slower (less parallelism);
# drop it again if fitting time becomes prohibitive and reproducibility is
# not needed.
RANDOM_STATE = 42

# Custom UMAP step handed to BERTopic below. Increase n_neighbors if the model
# finds too many topics.
umap_model = UMAP(
    n_neighbors=100,
    n_components=5,
    metric='cosine',
    low_memory=False,
    random_state=RANDOM_STATE,
)

# Vectorizer used for the topic word representations; min_df=50 makes
# CountVectorizer ignore terms whose document frequency is below 50.
vectorizer_model = CountVectorizer(min_df=50)

# top_n_words=20 is the upper bound for the number of words per topic that the
# wordcloud cells further down can export. Increase min_topic_size if the
# model finds too many topics.
topic_model = BERTopic(
    verbose=True,
    nr_topics="auto",
    min_topic_size=200,
    umap_model=umap_model,
    top_n_words=20,
    vectorizer_model=vectorizer_model,
)

# Fit on the cleaned tweets; `topics` and `probs` are the two values returned
# by fit_transform for the documents in `docs`.
topics, probs = topic_model.fit_transform(docs)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Batches:   0%|          | 0/48206 [00:00<?, ?it/s]

2022-06-21 14:00:57,862 - BERTopic - Transformed documents to Embeddings


Saving and loading the model:

In [None]:
# Write the fitted model to disk under this name so it can be reloaded later
# (see the loading cell) without fitting again.
topic_model.save("twitter_lemmatized")

In [None]:
# Reload a previously saved model; replace the string with the actual path to
# the saved model if it is stored somewhere else.
# NOTE(review): depending on the BERTopic version the saved file may be
# pickle-based — only load model files from a trusted source.
topic_model = BERTopic.load("twitter_lemmatized") # insert path

Checking how many topics the model found:

In [6]:
# Overview table with one row per topic (columns: Topic, Count, Name).
# NOTE(review): topic -1 is presumably BERTopic's outlier bucket — confirm,
# since it holds by far the most documents in the output below.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,867530,-1_to_and_in_be
1,0,22275,0_obamacare_aca_care_condition
2,1,18886,1_voting_vote_ballot_election
3,2,14776,2_abortion_woman_right_life
4,3,12707,3_gun_violence_background_shooting
...,...,...,...
373,372,206,372_heart_disease_february_awareness
374,373,206,373_hand_produce_donate_covid19
375,374,205,374_rush_radio_conservative_icon
376,375,204,375_conflict_condemn_civilian_violation


Visualising the topics (call `.write_html(path)` on the returned figure to save a graph):

In [7]:
# Plot the topics. The plotly default renderer is set to "browser" in the
# import cell, so the figure should open in a browser tab rather than inline.
topic_model.visualize_topics()

In [None]:
# Heatmap view of the topics.
# NOTE(review): n_clusters=20 presumably groups similar topics into 20
# clusters to order the axes — confirm against the BERTopic documentation.
topic_model.visualize_heatmap(n_clusters=20)

Dynamic Topic Modelling (topics over time):

In [None]:
# Run the (re)loaded model over all documents so that `topics` and `probs`
# exist even in a session where the fitting cell was skipped.
# NOTE(review): this presumably re-embeds every document in `docs` and may
# take a long time on the full dataset — confirm before re-running.
topics, probs = topic_model.transform(docs)

In [None]:
# Creation timestamp of every tweet, taken from the same frame (and therefore
# in the same row order) as `docs`.
timestamps = data["created_at"].to_list()

# Pass everything by keyword: the positional order of `topics` and
# `timestamps` is not the same in every BERTopic release, so a positional call
# can silently swap the two arguments after an upgrade.
# nr_bins=20 groups the timestamps into 20 bins.
topics_over_time = topic_model.topics_over_time(
    docs=docs,
    topics=topics,
    timestamps=timestamps,
    nr_bins=20,
)

Creating a topics-per-class dataframe to check the topic distribution over party/components:

In [None]:
# Topic representation per value of the "classes" column of `data`, using the
# topic assignments in `topics`.
topics_per_class = topic_model.topics_per_class(
    docs,
    topics=topics,
    classes=data["classes"],
    global_tuning=True,
)

In [9]:
# Plot the per-class topic table computed above; top_n_topics=100 limits how
# many topics are included in the figure.
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=100)

In [10]:
# Destination folder and file name for the per-class topic table.
dst, fname = "../../data/tweets", "topics_per_class.csv"

# Write the table as CSV, leaving out the dataframe index.
topics_per_class_path = join(dst, fname)
topics_per_class.to_csv(topics_per_class_path, index=False)

Preparing data for wordclouds (use with specific topics):

In [89]:
# Ids of the topics to export for the wordclouds — fill this list in before
# running the following cells.
topics_ids = []

# Filled by the next cell: one list of words and one list of the matching
# values per selected topic.
topics_words, topic_embeddings = [], []

In [90]:
# How many words to export per topic; must not exceed the `top_n_words` value
# the model was computed with (20 in this notebook).
n_words = 20

# Start from empty lists so that re-running this cell does not append the
# same topics a second time (which would break the dataframe built next).
topics_words = []
topic_embeddings = []

for topic_id in topics_ids:
    # One lookup per topic instead of two per word. Each entry is indexed as
    # (word, value) below.
    # NOTE(review): per the BERTopic docs the value is the word's c-TF-IDF
    # score rather than an embedding; the `topic_embeddings` name is kept
    # because later cells and the exported CSV use it.
    representation = topic_model.get_topic(topic_id)
    if not representation:
        raise ValueError(f"No topic representation found for topic id {topic_id!r}")

    # Slicing (rather than indexing 0..19) tolerates topics that have fewer
    # than `n_words` words.
    top_terms = representation[:n_words]
    topics_words.append([pair[0] for pair in top_terms])
    topic_embeddings.append([pair[1] for pair in top_terms])

In [91]:
# One row per selected topic: its id, its list of words and the list of
# values that belong to those words. Columns are added in this order.
wordcloud_df = pd.DataFrame().assign(
    topic_ids=topics_ids,
    topic_words=topics_words,
    topic_embeddings=topic_embeddings,
)

In [92]:
# Destination folder and file name for the wordcloud input.
dst, fname = "../../data/tweets", "key_topics.csv"

# Turn the per-topic lists into long format: with the topic id as index, each
# list column is exploded so that every word/value pair gets its own row, then
# the topic id is restored as a regular column.
key_topics_long = (
    wordcloud_df
    .set_index(["topic_ids"])
    .apply(pd.Series.explode)
    .reset_index()
)
key_topics_long.to_csv(join(dst, fname), index=False)