# Clustering categories of Reddit questions

In [1]:
# libraries

# nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources this notebook relies on: the stopword lists read by
# preprocess() and the Punkt data used by word_tokenize().
nltk.download("stopwords")
nltk.download("punkt")
# Recent NLTK releases (3.9 and later) load the tokenizer from "punkt_tab"
# rather than "punkt"; fetching both keeps a fresh Restart & Run All working
# on old and new NLTK versions alike.
nltk.download("punkt_tab")

# other
from bertopic import BERTopic
import pandas as pd
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# functions

# Lookup tables shared by every preprocess() call.
_PUNCTUATION = frozenset(string.punctuation)
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()


def _english_stopwords():
  '''Return the English stopword set, building it on first use only.'''
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
  return _ENGLISH_STOPWORDS


# Define a function to preprocess a single title
def preprocess(title):
  '''Clean and tokenize a single title.

  The title is lower-cased, split with word_tokenize, and stripped of English
  stopwords and of tokens made up entirely of punctuation characters.

  Args:
    title: The raw title text. Any non-string value (for example the NaN that
      pandas uses for a missing cell) is treated as an empty title.

  Returns:
    The surviving tokens joined by single spaces; "" when nothing survives.
  '''
  # Missing cells come through as float NaN, which has no .lower(); return an
  # empty document so the output list stays aligned with the input rows.
  if not isinstance(title, str):
    return ""

  stop_words = _english_stopwords()  # hoisted: previously rebuilt per token
  tokens = word_tokenize(title.lower())
  kept = [
    word for word in tokens
    # Test punctuation character by character: `word in string.punctuation`
    # is a substring test, so tokens such as "..." or "''" slipped through.
    if word not in stop_words and not all(ch in _PUNCTUATION for ch in word)
  ]
  return " ".join(kept)

In [3]:
# Load raw-dataset-reddit.csv and pull its "title" column out as a plain list.
df = pd.read_csv("raw-dataset-reddit.csv")
titles = df["title"].tolist()

# Run every title through preprocess(), keeping the original row order.
preprocessed_titles = list(map(preprocess, titles))

In [4]:
# Fit a BERTopic model (nr_topics=40) on the cleaned titles. Only the per-title
# topic ids are kept; the second value returned by fit_transform is discarded.
model = BERTopic(nr_topics=40)
topics, _ = model.fit_transform(preprocessed_titles)

# Build an id -> label mapping for every distinct topic id that was assigned.
# Id -1 is labelled "random"; any other id is labelled with the word stored in
# the first tuple that model.get_topic() returns for it.
topic_names = {}
for topic_id in set(topics):
  is_unassigned = topic_id == -1
  topic_names[topic_id] = "random" if is_unassigned else model.get_topic(topic_id)[0][0]

{0: 'life', 1: 'men', 2: 'lie', 3: 'kids', 4: 'friend', 5: 'food', 6: 'song', 7: 'money', 8: 'would', 9: 'job', 10: 'subreddit', 11: 'city', 12: 'anxiety', 13: 'movie', 14: 'celebrity', 15: 'old', 16: 'sleep', 17: 'phone', 18: 'smell', 19: 'say', 20: 'fitness', 21: 'habit', 22: 'house', 23: 'pet', 24: 'medical', 25: 'color', 26: 'religion', 27: 'death', 28: 'law', 29: 'gift', 30: 'learn', 31: 'book', 32: 'karma', 33: 'india', 34: 'opinion', 35: 'name', 36: 'tip', 37: 'shave', 38: 'society', -1: 'random'}


In [None]:
# Show the topic id -> label mapping built in the previous cell.
print(topic_names)