# BERTopic
## Topic models
- one with nr_topics = "auto" --> as advised by the developer of BERTopic
- one without specifying the number of topics; only min_topic_size is set to 30
- one with nr_topics = 30 --> gives a compact overview of the topics

- When running the notebook again, change the name of the model before saving it; otherwise the previously saved model gets overwritten.

Saving models:
- models are saved without outlier reduction
- outlier reduction is applied after saving the model, to keep the original structure -- see BERTopic_evaluation_analysis.ipynb

In [1]:
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def load_dataset(csv: str):
    """
    Load the mommit or daddit data and drop the posts from June and October 2020.

    The file is read with ';' as separator and its first column is discarded.
    Rows whose 'date_time' lies inside June 2020 or October 2020 are removed
    (it was decided later to disregard these time points).

    Parameters
    ----------
    csv : str
        Path of the CSV file. It has to provide the columns 'date_time' and
        'whole_text'.

    Returns
    -------
    tuple
        ``(data_corr_times, text)``: the filtered DataFrame with a fresh
        index (the previous index is kept as column 'index') and the list of
        its 'whole_text' values.
    """
    # load the dataset; the first column of the file is not needed
    data = pd.read_csv(csv, sep=';').iloc[:, 1:]

    # keep only rows outside June 2020 and outside October 2020
    # NOTE(review): 'date_time' is compared against strings, which presumably
    # relies on 'YYYY-MM-DD HH:MM:SS' formatted values -- confirm
    date_time = data['date_time']
    before_june = date_time < '2020-06-01 00:00:00'
    july_to_september = (date_time > '2020-06-30 23:59:59') & (date_time < '2020-10-01 00:00:00')
    after_october = date_time > '2020-10-31 23:59:59'
    mask = before_june | july_to_september | after_october

    # reset the index after filtering (the old index becomes the column 'index')
    data_corr_times = data.loc[mask].reset_index()

    # the documents as a plain list of texts
    text = data_corr_times["whole_text"].to_list()

    return data_corr_times, text

In [3]:
# read the cleaned r/Mommit and r/daddit files (load_dataset drops June and October 2020)
mommit, mommit_text = load_dataset(csv="mommit_clean.csv")
daddit, daddit_text = load_dataset(csv="daddit_clean.csv")

# number of documents per subreddit
print(len(mommit_text))  # recorded output: 77599
print(len(daddit_text))  # recorded output: 78963

77599
78963


## r/daddit

### no specification of topic nr
- only set min_topic_size to 30 
- use ctfidf set to reduce frequent words

In [4]:
# c-TF-IDF that reduces frequent words (alternative to stopwords with CountVectorizer)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# BERTopic instance for r/daddit without a fixed number of topics;
# only min_topic_size is set to 30
model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    ctfidf_model=ctfidf_model,
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    min_topic_size=30,
)

# fit the model: all documents are embedded and clustered into topics
topics, probs = model.fit_transform(daddit_text)

In [5]:
# results of the r/daddit model: topic info, topic frequencies and document info
topic_info = model.get_topic_info()
df_topic_freq = model.get_topic_freq()
document_info = model.get_document_info(daddit_text)

# number of rows in the topic info table
print(len(topic_info))

217


In [None]:
# build the topic visualisation of the r/daddit model and display it
fig_topics = model.visualize_topics()
fig_topics.show()

In [6]:
# save the r/daddit model (217 rows in topic_info for the recorded run)
model.save(
    "topic_models/Ansatz_march_2021+2022/daddit/BERTopic_daddit_dec19_march22"
)

### nr_topics = "auto"

In [18]:
# c-TF-IDF that reduces frequent words (alternative to stopwords with CountVectorizer)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# BERTopic instance for r/daddit with nr_topics="auto"
model_auto_daddit = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    ctfidf_model=ctfidf_model,
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    nr_topics="auto",
)

# fit the model on the r/daddit documents
topics, probs = model_auto_daddit.fit_transform(daddit_text)

In [19]:
# results of the r/daddit "auto" model: topic info, topic frequencies and document info
topic_info_auto = model_auto_daddit.get_topic_info()
df_topic_freq_auto = model_auto_daddit.get_topic_freq()
document_info_auto = model_auto_daddit.get_document_info(daddit_text)

# number of rows in the topic info table
print(len(topic_info_auto))

156


In [21]:
# bare expression on the topic info table; the recorded cell output
# only shows the print below
topic_info_auto
# marker line printed as the cell output
print("stop")

stop


In [20]:
# build the topic visualisation of the r/daddit "auto" model and display it
fig_topics = model_auto_daddit.visualize_topics()
fig_topics.show()

In [22]:
# save the r/daddit "auto" model (156 rows in topic_info_auto for the recorded run)
model_auto_daddit.save(
    "topic_models/Ansatz_march_2021+2022/daddit/BERTopic_daddit_dec19_march22_auto"
)

### nr_topics = 30

In [4]:
# c-TF-IDF that reduces frequent words (alternative to stopwords with CountVectorizer)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# BERTopic instance for r/daddit with nr_topics=30
model_daddit_30 = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    ctfidf_model=ctfidf_model,
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    nr_topics=30,
)

# fit the model on the r/daddit documents
topics, probs = model_daddit_30.fit_transform(daddit_text)

In [5]:
# results of the r/daddit 30-topic model: topic info, topic frequencies and document info
topic_info_30 = model_daddit_30.get_topic_info()
df_topic_freq_30 = model_daddit_30.get_topic_freq()
document_info_30 = model_daddit_30.get_document_info(daddit_text)

# number of rows in the topic info table (31 in the recorded run)
print(len(topic_info_30))

31


In [6]:
# save the r/daddit model that was built with nr_topics=30
model_daddit_30.save(
    "topic_models/Ansatz_march_2021+2022/daddit/BERTopic_daddit_dec19_march22_30topics"
)

## r/Mommit

### no specification of topic nr
- only set min_topic_size to 30 
- use ctfidf set to reduce frequent words

In [13]:
# c-TF-IDF that reduces frequent words (alternative to stopwords with CountVectorizer)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# BERTopic instance for r/Mommit without a fixed number of topics;
# min_topic_size=30 --> at least 30 documents per topic
model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    ctfidf_model=ctfidf_model,
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    min_topic_size=30,
)

# fit the model on the r/Mommit documents
topics, probs = model.fit_transform(mommit_text)

In [14]:
# results of the r/Mommit model: topic info, topic frequencies and document info
topic_info = model.get_topic_info()
df_topic_freq = model.get_topic_freq()
document_info = model.get_document_info(mommit_text)

# number of rows in the topic info table
print(len(topic_info))

228


In [15]:
# build the topic visualisation of the r/Mommit model and display it
fig_topics = model.visualize_topics()
fig_topics.show()

In [17]:
# save the r/Mommit model with at least 30 documents per topic
# (228 rows in topic_info for the recorded run)
model.save(
    "topic_models/Ansatz_march_2021+2022/mommit/BERTopic_mommit_dec19_march22"
)

### nr_topics = "auto"

In [25]:
 # reduce frequent words (alternative to stopwords with CountVectorizer)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# initialize BERTopic object with auto topics
model_auto_mommit = BERTopic(embedding_model="all-MiniLM-L6-v2", ctfidf_model=ctfidf_model, n_gram_range=(1,2), calculate_probabilities=True, nr_topics="auto")

# fit model
topics, probs = model_auto_mommit.fit_transform(mommit_text)

In [27]:
# results of the r/Mommit "auto" model: topic info, topic frequencies and document info
topic_info_auto = model_auto_mommit.get_topic_info()
df_topic_freq_auto = model_auto_mommit.get_topic_freq()
document_info_auto = model_auto_mommit.get_document_info(mommit_text)

# number of rows in the topic info table
print(len(topic_info_auto))

490


In [23]:
# build the topic visualisation of the r/Mommit "auto" model and display it
fig_topics = model_auto_mommit.visualize_topics()
fig_topics.show()

In [28]:
# save the r/Mommit "auto" model (490 rows in topic_info_auto for the recorded run)
model_auto_mommit.save(
    "topic_models/Ansatz_march_2021+2022/mommit/BERTopic_mommit_dec19_march22_auto"
)

### nr_topics = 30

In [30]:
# c-TF-IDF that reduces frequent words (alternative to stopwords with CountVectorizer)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# BERTopic instance for r/Mommit with nr_topics=30
model_mommit_30 = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    ctfidf_model=ctfidf_model,
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    nr_topics=30,
)

# fit the model on the r/Mommit documents
topics, probs = model_mommit_30.fit_transform(mommit_text)

In [31]:
# results of the r/Mommit 30-topic model: topic info, topic frequencies and document info
topic_info_30 = model_mommit_30.get_topic_info()
df_topic_freq_30 = model_mommit_30.get_topic_freq()
document_info_30 = model_mommit_30.get_document_info(mommit_text)

# number of rows in the topic info table (31 in the recorded run)
print(len(topic_info_30))

31


In [32]:
# save the r/Mommit model that was built with nr_topics=30
model_mommit_30.save(
    "topic_models/Ansatz_march_2021+2022/mommit/BERTopic_mommit_dec19_march22_30topics"
)