In [None]:
# Install the runtime dependencies of this notebook.
# NOTE(review): the three git+https installs follow each project's default branch (-U), so the
# versions quoted in the trailing comments are only annotations — nothing here pins them.
# NOTE(review): bertopic, umap, hdbscan, sentence_transformers, nltk and emoji are imported in the
# next cell but are not installed here; presumably they already exist in the environment — confirm.
!pip install git+https://github.com/huggingface/transformers.git -q -U # transformers version:  4.37.0
!pip install git+https://github.com/huggingface/accelerate.git -q -U # accelerate version:  0.27.0
!pip install bitsandbytes # bitsandbytes version:  0.42.0
!pip install git+https://github.com/huggingface/peft.git -q -U # peft version: 0.7.2
!pip install einops
!pip install xformers
!pip install torchvision
!pip install urllib3==1.26.15
!pip install scipy==1.12

In [None]:
# import necessary libraries and modules for data processing and analysis
import re
import gc
import os
import nltk
import torch
import emoji
import random
import numpy as np
import pandas as pd
import transformers
from umap import UMAP
from torch import cuda
from torch import bfloat16
from hdbscan import HDBSCAN
from bertopic import BERTopic
from huggingface_hub import login
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from sklearn.metrics import silhouette_score
from nltk.tokenize import WordPunctTokenizer
from sklearn.model_selection import ParameterGrid
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import OnlineCountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# fetch the NLTK data used below: WordNet (lemmatizer), stop words, and the POS tagger model
for nltk_resource in ('wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# seed Python, NumPy and PyTorch (CPU, plus every CUDA device when present) with the same value
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

# use the current CUDA device when there is one, otherwise the CPU
device = 'cpu' if not cuda.is_available() else f'cuda:{cuda.current_device()}'

#### Text Preprocessing

In [None]:
# define Text Preprocessor class
class TextPreprocessor:
    """Turn raw post/comment text into a whitespace-joined string of lemmas.

    Pipeline (see ``preprocess_text``): lowercase -> remove artist names and
    expand a few abbreviations -> demojize -> strip punctuation and digits ->
    tokenize -> drop stop words and non-alphanumeric tokens -> lemmatize.

    Artist names and abbreviations are matched as whole words/phrases only, so
    short entries such as "v", "ed" or "ari" no longer eat letters out of
    unrelated words ("love", "wanted", "various"). Inflected forms are not
    matched implicitly and must be listed explicitly in ``singer_names``.
    """

    # shorthand rewritten before tokenization; matched as whole words only so that
    # e.g. "merchandise" is left alone instead of becoming "merchandiseandise"
    _ABBREVIATIONS = {'austin': 'album', 'wts': 'want to sell', 'merch': 'merchandise'}
    _ABBREVIATION_RE = re.compile(r'(?<!\w)(?:austin|wts|merch)(?!\w)')

    def __init__(self):
        """Load the NLTK helpers and the (lower-cased) list of artist names to strip."""
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = WordPunctTokenizer()
        # artist names / nicknames removed from the text (matching is case-insensitive
        # because both the names and the text are lower-cased)
        self.singer_names = [
            "Bad Bunny", "El Conejo Malo", "The Weeknd", "Abel", "Abel Tesfaye",
            "Morgan Wallen", "Ed Sheeran", "Ginger Jesus", "Ed", "Drake",
            "Drizzy", "Champagne Papi", "Aubrey", "Harry Styles", "Feid",
            "Imagine Dragons", "Dan Reynolds", "Ben McKee", "Daniel Wayne Sermon",
            "Daniel Platz Platzman", "Post Malone", "Posty", "BTS", "Bangtan",
            "Bangtan Sonyeondan", "Tannies", "RM", "Jin", "Suga", "J-Hope",
            "Jimin", "V", "Jungkook", "harry", "justin bieber", "bieber", "justin", "namjoon",
            "Taylor Swift", "T-Swift", "TayTay", "Taylor", "Miss Americana", "SZA", "Solana Imani Row",
            "Miley Cyrus", "Miley", "Hannah Montana", "New Jeans", "Minji", "Hanni", "Danielle New Jeans", "Haerin", "Hyein",
            "Dua Lipa", "Dua", "Dula Peep", "Olivia Rodrigo", "Liv", "Ariana Grande", "Ari", "Ariana", "Ms Grande",
            "Billie Eilish", "Billie", "Rihanna", "RiRi", "Badgalriri", "Adele", "swiftie", "Oliia", "newjean",
            "rodrigo", "rodrigos",
            # added with the switch to whole-word matching: these forms used to be caught only
            # because "swiftie"/"newjean" matched as substrings; "Oliia" above looks like a typo
            # for "Olivia" (NOTE(review): confirm)
            "swifties", "newjeans", "Olivia",
        ]
        # lower-case once so matching against lower-cased text is case-insensitive
        self.singer_names = [name.lower() for name in self.singer_names]
        # memo of token -> lemma; tagging is done per token, so results never depend on context
        self._lemma_cache = {}

    @staticmethod
    def _whole_word_pattern(phrases):
        """Compile a regex matching any of ``phrases`` when not glued to other word characters.

        Longer phrases are tried first so "abel tesfaye" wins over "abel".
        An empty collection yields a pattern that never matches.
        """
        ordered = sorted({phrase for phrase in phrases if phrase}, key=lambda p: (-len(p), p))
        if not ordered:
            return re.compile(r'(?!)')
        alternation = '|'.join(re.escape(phrase) for phrase in ordered)
        return re.compile(rf'(?<!\w)(?:{alternation})(?!\w)')

    def _name_pattern(self):
        """Return the compiled name regex, rebuilding it if ``singer_names`` has changed."""
        names = tuple(self.singer_names)
        cached = getattr(self, '_name_pattern_cache', None)
        if cached is None or cached[0] != names:
            cached = (names, self._whole_word_pattern(names))
            self._name_pattern_cache = cached
        return cached[1]

    def preprocess_text(self, text):
        """Return the cleaned, lemmatized form of ``text`` as one space-separated string.

        Non-string input (e.g. NaN/None from an empty CSV cell) yields "".
        """
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = self.remove_singer_names(text)
        # emojis become ":name:" text; the colons are stripped with the rest of the punctuation
        text = emoji.demojize(text)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)  # Remove numbers
        tokens = self.tokenizer.tokenize(text)
        tokens = [word for word in tokens if word not in self.stop_words and word.isalnum()]
        tokens = [self.lemmatize_token(word) for word in tokens]
        return " ".join(tokens)

    def remove_singer_names(self, text):
        """Delete artist names from lower-cased ``text`` and expand known abbreviations.

        Both steps match whole words/phrases only. Names are replaced by the empty
        string, so surrounding whitespace is left as it was.
        """
        text = self._name_pattern().sub('', text)
        # expand shorthand ("wts" -> "want to sell", ...) in a single pass
        text = self._ABBREVIATION_RE.sub(lambda match: self._ABBREVIATIONS[match.group(0)], text)
        return text

    def lemmatize_token(self, token):
        """Lemmatize one token using its (context-free) POS tag; unknown tags return the token as is.

        Results are memoised per token because ``nltk.pos_tag`` is invoked for a
        single-token list each time, which is costly when repeated for every occurrence.
        """
        cache = self.__dict__.setdefault('_lemma_cache', {})
        if token not in cache:
            tag = self.get_wordnet_pos(nltk.pos_tag([token])[0][1])
            cache[token] = self.lemmatizer.lemmatize(token, pos=tag) if tag else token
        return cache[token]

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        """Map a Penn Treebank tag to the matching WordNet POS constant, or None if there is none."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return None

In [None]:
# define a function to remove hyperlinks from text data for better topic modeling results
def remove_hyperlinks(text):
    """Strip URLs from ``text``.

    Removes every "http://" / "https://" URL and every "www." address that starts
    a token (so stretched words such as "awwww" are left untouched), up to the
    next whitespace. Matching is case-insensitive.

    Args:
        text: the raw text; anything that is not a ``str`` (e.g. NaN from an
            empty CSV cell) is treated as empty.

    Returns:
        The text with URLs removed, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ''
    return re.sub(r'https?://\S+|(?<!\w)www\.\S+', '', text, flags=re.IGNORECASE)

In [None]:
# define a function to preprocess text data for topic modeling tasks using the TextPreprocessor class and methods
def preprocess(df, data_type, text_preprocessor=None):
    """Return a copy of ``df`` with a 'preprocessed_txt' column placed right after the text column.

    Args:
        df: frame holding the raw text.
        data_type: 'comments' (text is read from 'body') or 'submissions'
            (text is read from 'combined_text').
        text_preprocessor: object exposing ``preprocess_text(str) -> str``.
            Defaults to the module-level ``preprocessor`` instance, which must
            exist by the time this function is called.

    Returns:
        A new DataFrame; the input frame is left untouched, so the call can be
        repeated safely and works on slices of other frames. An existing
        'preprocessed_txt' column is recomputed rather than causing an error.

    Raises:
        ValueError: if ``data_type`` is not one of the two supported values.
    """
    text_columns = {'comments': 'body', 'submissions': 'combined_text'}
    if data_type not in text_columns:
        raise ValueError(
            f"data_type must be one of {sorted(text_columns)}, got {data_type!r}"
        )
    text = text_columns[data_type]

    if text_preprocessor is None:
        # fall back to the notebook-level instance (created in a later cell)
        text_preprocessor = preprocessor

    result = df.copy()
    preprocessed_text = result[text].apply(text_preprocessor.preprocess_text)
    # DataFrame.insert refuses duplicate column names, so drop a stale column first
    if 'preprocessed_txt' in result.columns:
        result = result.drop(columns='preprocessed_txt')
    result.insert(result.columns.get_loc(text) + 1, 'preprocessed_txt', preprocessed_text)
    return result

In [None]:
# load the male and female submission datasets
male_submissions = pd.read_csv("/home/haters/Downloads/Toxicity_Detection/output_perspective/output_score/male_submissions_outcome_final.csv")
female_submissions = pd.read_csv("/home/haters/Downloads/Toxicity_Detection/output_perspective/output_score/female_submissions_outcome_final.csv")

# build 'combined_text' = title + " " + selftext with hyperlinks removed.
# an empty CSV cell is read as NaN, and str + NaN is NaN, so missing values are replaced by ''
# first; otherwise such posts would hand a float to remove_hyperlinks and lose their title.
for submissions_frame in (male_submissions, female_submissions):
    submissions_frame['combined_text'] = (
        submissions_frame['title'].fillna('').astype(str)
        + " "
        + submissions_frame['selftext'].fillna('').astype(str)
    ).apply(remove_hyperlinks)

In [None]:
# one shared TextPreprocessor; preprocess() reads this module-level name when it is called
preprocessor = TextPreprocessor()

# add the 'preprocessed_txt' column to both submission frames (male first, then female)
male_submissions, female_submissions = (
    preprocess(frame, 'submissions') for frame in (male_submissions, female_submissions)
)

#### Load model and tokenizer from Hugging Face model hub for text generation tasks

In [None]:
# log in to the Hugging Face Hub with a token read from the HF_TOKEN environment variable.
# SECURITY: an access token used to be hard-coded on this line; anything committed with it must be
# considered leaked, so that token should be revoked and a new one supplied via the environment.
# when HF_TOKEN is unset, token=None is passed and login() is expected to prompt for one interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# quantisation settings passed to from_pretrained(quantization_config=...) in the next cell
bnb_config = transformers.BitsAndBytesConfig(**{
    'load_in_4bit': True,                      # load the weights in 4-bit
    'bnb_4bit_quant_type': 'nf4',              # "normalized float 4" quantisation type
    'bnb_4bit_use_double_quant': True,         # quantise a second time after the first pass
    'bnb_4bit_compute_dtype': torch.bfloat16,  # dtype used for computation
})

In [None]:
# load the Llama 2 7B chat tokenizer and 4-bit quantised model from the Hugging Face Hub
MODEL_ID = "daryl149/llama-2-7b-chat-hf"

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_ID)
# - force_download is no longer set, so the local cache is reused instead of fetching the
#   multi-gigabyte weights again on every run
# - trust_remote_code is no longer set: it lets code shipped in the model repository run locally,
#   and should not be needed for a Llama checkpoint (NOTE(review): re-enable only if loading
#   explicitly asks for it, and only after reviewing that repository)
model = transformers.AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map='auto',
)
# inference only: switch to evaluation mode
model.eval()

In [None]:
# text-generation pipeline around the quantised model; BERTopic's TextGeneration wrapper calls it to label topics
generator = transformers.pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,       # upper bound on generated tokens per label
    temperature=0.1,
    repetition_penalty=1.1,
)

In [None]:
# system prompt: opens the first [INST] turn and carries the <<SYS>> block shared by every request
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

# one-shot example (chain-of-thought style instructions) showing the expected output.
# the user part is closed with [/INST] *before* the example answer, so the answer reads as the
# assistant's reply; without it the example answer sat inside the user turn and the main prompt
# opened a second, nested [INST].
example_prompt = """
I have a topic that contains the following documents:
- I've been checking Ticketmaster every day for Coldplay tickets but they're sold out everywhere. Does anyone have any suggestions on where else I might find them?
- Hello all! If anyone is looking to buy Arctic Monkeys tickets for the show in Paris on September 10th, I have two extra tickets available. We can arrange for a safe and secure transaction!
- I have general admission tickets for the upcoming Foo Fighters concert and I'm wondering how early I should get to the venue to get a good spot. Any tips from past attendees?
- Just scored VIP passes for Lollapalooza this year! I'm super excited to see Kendrick Lamar, Billie Eilish, and The Weeknd. Who else is going? Any advice on the best spots to watch the performances from?
- Does anyone know if there's a way to get refunds for canceled concerts? I had tickets for the Red Hot Chili Peppers show next month, but it got canceled and now I'm trying to figure out my options.

The topic is described by the following keywords: 'concert, tour, ticket, attend, stage, stadium, wembley, arena, floor, meet, ticketmaster, ticket, stubhub, presale, sale, sell, buy, fee, concert, paypal'.

To create a short label for this topic, follow these steps:

1. **Identify the main themes**: Analyze the documents and keywords to identify the main themes discussed.
2. **Summarize the core message**: Summarize the core message of the topic based on the identified themes.
3. **Craft a concise label**: Create a short, descriptive label that encapsulates the core message.

Think through these steps carefully and return only the final label.

[/INST]
[START OF LABEL]
Concert and ticket
[END OF LABEL]
"""

# main prompt: BERTopic substitutes the [DOCUMENTS] and [KEYWORDS] tags for each topic
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

To create a short label for this topic, follow these steps:

1. **Identify the main themes**: Analyze the documents and keywords to identify the main themes discussed.
2. **Summarize the core message**: Summarize the core message of the topic based on the identified themes.
3. **Craft a concise label**: Create a short, descriptive label that encapsulates the core message.

Think through these steps carefully and return only the final label.

[/INST]
"""

# full prompt = system block + worked example + the actual request
prompt = system_prompt + example_prompt + main_prompt

In [None]:
# runtime knobs: turn OFF tokenizers parallelism, cap OpenMP at 4 threads and 2 active nesting levels
# NOTE(review): these are set after torch/numpy/transformers were imported above; the OpenMP
# settings may only take effect when set before those imports — confirm
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    'OMP_NUM_THREADS': "4",
    'OMP_MAX_ACTIVE_LEVELS': "2",
})

#### Define Evaluation Model Function for Topic Modeling Tasks with Hyperparameter Tuning

In [None]:
# score one UMAP + HDBSCAN parameter combination (used by the hyperparameter search below)
def evaluate_model(umap_params, hdbscan_params, selftext, embeddings):
    """Cluster ``embeddings`` with the given settings and return the silhouette score.

    The embeddings are reduced with UMAP, clustered with HDBSCAN, and the
    resulting labels are scored with ``silhouette_score`` against the original
    (unreduced) embeddings. As before, the outlier label -1 is scored like any
    other label (NOTE(review): excluding outliers from the score may be
    preferable — decide before comparing runs).

    Compared with fitting a full BERTopic model per combination, this fits UMAP
    and HDBSCAN once instead of twice and skips topic representation (including
    LLM label generation). BERTopic topics are the clusters of the supplied
    clustering model, and the silhouette score does not depend on how clusters
    are named, so the measured quantity is unchanged.

    Args:
        umap_params: keyword arguments for ``UMAP``. ``random_state=42`` is
            used unless the dict provides its own, so repeated runs of the
            search are comparable. The dict itself is not modified.
        hdbscan_params: keyword arguments for ``HDBSCAN``.
        selftext: the documents; unused, kept so existing calls stay valid.
        embeddings: document embeddings, one row per document.

    Returns:
        The silhouette score, or -1.0 (the worst possible value) when the
        clustering yields fewer than 2 labels or one label per sample, where
        the score is undefined and ``silhouette_score`` would raise.
    """
    # defaults first, caller's values second: an explicit random_state in umap_params wins
    umap_model = UMAP(**{'random_state': 42, **umap_params})
    hdbscan_model = HDBSCAN(**hdbscan_params)

    # reduce, then cluster in the reduced space
    umap_embeddings = umap_model.fit_transform(embeddings)
    hdbscan_labels = np.asarray(hdbscan_model.fit_predict(umap_embeddings))

    distinct_labels = set(hdbscan_labels.tolist())
    num_topics = len(distinct_labels - {-1})  # exclude noise points labeled as -1
    print(f"Number of clusters identified by HDBSCAN: {num_topics}")

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
    if 2 <= len(distinct_labels) <= len(hdbscan_labels) - 1:
        silhouette_avg = silhouette_score(embeddings, hdbscan_labels)
    else:
        silhouette_avg = -1.0

    # drop the fitted models and intermediates before the next combination is evaluated
    del umap_model, hdbscan_model, umap_embeddings, hdbscan_labels
    gc.collect()
    return silhouette_avg

# candidate UMAP settings for the search, one row per candidate:
# (n_neighbors, n_components, min_dist, metric)
umap_param_grid = [
    {'n_neighbors': neighbours, 'n_components': dims, 'min_dist': spacing, 'metric': distance}
    for neighbours, dims, spacing, distance in (
        (15, 2, 0.0, 'cosine'),
        (25, 5, 0.1, 'euclidean'),
        (10, 3, 0.2, 'manhattan'),
        (20, 4, 0.3, 'manhattan'),
        (30, 2, 0.4, 'manhattan'),
        (35, 5, 0.5, 'cosine'),
        (40, 3, 0.6, 'euclidean'),
        (50, 4, 0.7, 'manhattan'),
        (60, 2, 0.8, 'euclidean'),
        (70, 5, 0.9, 'cosine'),
        (80, 3, 0.1, 'cosine'),
        (90, 4, 0.2, 'euclidean'),
        (100, 5, 0.3, 'manhattan'),
    )
]

# candidate HDBSCAN settings, one row per candidate:
# (min_cluster_size, metric, cluster_selection_method)
hdbscan_param_grid = [
    {'min_cluster_size': smallest, 'metric': distance, 'cluster_selection_method': selection}
    for smallest, distance, selection in (
        (50, 'euclidean', 'eom'),
        (100, 'manhattan', 'leaf'),
        (30, 'manhattan', 'eom'),
        (80, 'euclidean', 'leaf'),
        (90, 'manhattan', 'eom'),
        (110, 'manhattan', 'leaf'),
        (140, 'euclidean', 'eom'),
        (150, 'manhattan', 'leaf'),
    )
]

# pair every UMAP candidate with every HDBSCAN candidate; the search loops read each
# entry's 'umap' and 'hdbscan' keys
param_grid = [
    combination
    for combination in ParameterGrid({'umap': umap_param_grid, 'hdbscan': hdbscan_param_grid})
]

#### Topic Modeling for male dataset

In [None]:
# documents for the male topic model, as a plain list of strings: the embedding and
# topic-model calls below then never depend on the DataFrame's index labels
male_submissions_preprocessed_txt = male_submissions["preprocessed_txt"].tolist()

# sentence-embedding model (MiniLM)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# one embedding per preprocessed submission
embeddings = embedding_model.encode(male_submissions_preprocessed_txt, show_progress_bar = True)

In [None]:
# three alternative topic representations, computed side by side by BERTopic
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity = 0.3)
# Llama 2 labels: the text-generation pipeline driven by the labelling prompt defined above
llama2 = TextGeneration(generator, prompt = prompt)

# name -> representation model
representation_model = dict(KeyBERT=keybert, Llama2=llama2, MMR=mmr)

In [None]:
# bag-of-words vectorizer (English stop words removed) and the c-TF-IDF weighting used by BERTopic
vectorizer_model = OnlineCountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

In [None]:
# grid search over the UMAP x HDBSCAN combinations for the male submissions;
# every result is appended to male_parameter_performance.txt as soon as it is known
best_score = -np.inf
best_params = None

with open('male_parameter_performance.txt', 'w') as log_file:
    for params in param_grid:
        umap_params = params['umap']
        hdbscan_params = params['hdbscan']
        try:
            score = evaluate_model(umap_params, hdbscan_params, male_submissions_preprocessed_txt, embeddings)
        except ValueError as error:
            # e.g. the silhouette score is undefined for a degenerate clustering: record the
            # failure and keep going instead of losing the rest of a very long search
            log_file.write(f"UMAP Params: {umap_params}, HDBSCAN Params: {hdbscan_params}, Score: failed ({error})\n")
            log_file.flush()
            continue

        # record the performance of each parameter combination
        log_file.write(f"UMAP Params: {umap_params}, HDBSCAN Params: {hdbscan_params}, Score: {score}\n")
        log_file.flush()  # ensure the data is written to the file immediately

        if score > best_score:
            best_score = score
            best_params = params

print(f"Best Score: {best_score}")
print(f"Best Parameters: {best_params}")

In [None]:
# Optional: build the UMAP and HDBSCAN models directly from the best parameters found by the hyperparameter search above.
# Uncomment the lines below only after the search has finished; the search can take a few days even on a powerful GPU (not the lab GPU).
# Note that the following cells read `umap_model` and `hdbscan_model`, so rename these variables (or adapt those cells) when using this path.

#best_umap_params = best_params['umap']
#best_hdbscan_params = best_params['hdbscan']
#best_umap_model = UMAP(**best_umap_params)
#best_hdbscan_model = HDBSCAN(**best_hdbscan_params)

In [None]:
# UMAP and HDBSCAN models for the male topic model, with manually entered hyperparameters
# (see male_parameter_performance.txt for the search results they were taken from).
# comment this cell out if the models are built from best_params in the cell above instead.
umap_model = UMAP(n_neighbors=80, n_components=3, min_dist=0.1, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='manhattan', cluster_selection_method='leaf', prediction_data=True)
# separate projection used only for plotting: visualize_documents expects 2-D coordinates, so this
# one is reduced to 2 components (the clustering model above keeps its 3 components)
reduced_embeddings = UMAP(n_neighbors=80, n_components=2, min_dist=0.1, metric='cosine', random_state=42).fit_transform(embeddings)

In [None]:
# assemble BERTopic from the sub-models configured in the cells above
topic_model = BERTopic(
    top_n_words=10,
    verbose=True,
    # sub-models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# fit on the male submissions, reusing the precomputed sentence embeddings
topics, probs = topic_model.fit_transform(male_submissions_preprocessed_txt, embeddings)

In [None]:
# per-topic summary table produced by the fitted model
topic_info = topic_model.get_topic_info()

# topic id -> topic name, read from the 'Topic' and 'Name' columns of that table
topic_labels = dict(zip(topic_info['Topic'], topic_info['Name']))

# attach each submission's topic id and the corresponding name
male_submissions['topic_id'] = topics
male_submissions['topic_label'] = male_submissions['topic_id'].map(topic_labels)

In [None]:
# read the male comments
male_comments = pd.read_csv("/home/haters/Downloads/loaded_data/Combined_data_29Apr/combined_male_comments.csv")

# left-join every comment to its submission's topic (comment 'link_id' == submission 'name');
# comments without a matching submission keep NaN in 'topic_id' / 'topic_label'
merge_male_comments_submissions = male_comments.merge(
    male_submissions[['name', 'topic_id', 'topic_label']],
    how='left',
    left_on='link_id',
    right_on='name',
)

# the submission-side join key is not needed afterwards
merge_male_comments_submissions = merge_male_comments_submissions.drop(columns=['name'])

In [None]:
# comments that received no topic from the merge (their submission was not found)
missing_mask = (
    merge_male_comments_submissions['topic_id'].isna()
    | merge_male_comments_submissions['topic_label'].isna()
)
# keep the original row index: it is the only link back to the merged frame. Resetting it and
# then calling DataFrame.update (which aligns on the index) wrote the predictions — and every
# other shared column — into the first rows of the merged frame instead of the missing ones.
missing_topic = merge_male_comments_submissions.loc[missing_mask].copy()

if not missing_topic.empty:
    # an empty comment body is read from CSV as NaN; the text preprocessing works on strings
    missing_topic['body'] = missing_topic['body'].fillna('').astype(str)
    # NOTE(review): submissions had hyperlinks stripped before preprocessing, comments do not —
    # confirm whether comments should go through remove_hyperlinks as well
    missing_topic = preprocess(missing_topic, 'comments')

    # predict a topic for each of these comments with the fitted topic model
    comment_topics, comment_probs = topic_model.transform(missing_topic['preprocessed_txt'].tolist())
    missing_topic['topic_id'] = comment_topics
    missing_topic['topic_prob'] = comment_probs

    # write the predictions back to exactly the rows they were computed for
    merge_male_comments_submissions.loc[missing_topic.index, 'topic_id'] = missing_topic['topic_id']

# anything still without a topic gets the sentinel -99; only 'topic_id' is filled so that
# missing values in unrelated columns are no longer overwritten with -99
merge_male_comments_submissions['topic_id'] = merge_male_comments_submissions['topic_id'].fillna(-99)

In [None]:
# raw topic id -> merged topic id, written here as merged id -> the raw ids folded into it
# (-1 is an id produced by the topic model, -99 is the "no topic" sentinel set earlier)
merged_topics = {
    raw_id: merged_id
    for merged_id, raw_ids in {
        1: (-1, 5, 7),
        2: (0,),
        3: (1,),
        4: (2, 8),
        5: (3,),
        6: (4,),
        7: (6,),
        8: (9, -99),
    }.items()
    for raw_id in raw_ids
}

# human-readable label for each merged topic id
topic_labels = {
    1: 'General discussions about music, including song recommendations and rap music.',
    2: 'Discussions around buying and selling photocards.',
    3: 'Personal stories and relationships.',
    4: 'Discussions about buying and selling event tickets, and concert experiences.',
    5: 'Polls and opinions on various topics.',
    6: 'Discussions about merchandise like shirts and hoodies.',
    7: 'Tips and discussions about avoiding scams.',
    8: 'Others undefineable topics'
}

# rewrite 'topic_id' in place and derive 'topic_label' from it, for submissions and comments alike.
# NOTE: merged ids reuse raw id values (e.g. raw 1 -> 3), so this cell must run exactly once per
# fit; running it again would remap ids that were already merged.
for labelled_frame in (male_submissions, merge_male_comments_submissions):
    labelled_frame['topic_id'] = labelled_frame['topic_id'].replace(merged_topics)
    labelled_frame['topic_label'] = labelled_frame['topic_id'].map(topic_labels)

In [None]:
# export the male comments with their topic ids and labels.
# the file used to be named "merge_female_comments_submissions.csv", which mislabels the male
# results and collides with the name the female export would naturally use.
merge_male_comments_submissions.to_csv("merge_male_comments_submissions.csv", index = False)

# plot the documents in the reduced space; `docs` must be the documents the model was fitted on
# (one string per embedding row), not the whole DataFrame
topic_model.visualize_documents(male_submissions["preprocessed_txt"].tolist(), reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)

#### Topic Modeling for female dataset

In [None]:
# documents for the female topic model, as a plain list of strings: the embedding and
# topic-model calls below then never depend on the DataFrame's index labels
female_submissions_preprocessed_txt = female_submissions["preprocessed_txt"].tolist()

# one embedding per preprocessed submission (this replaces the male `embeddings` from above)
embeddings = embedding_model.encode(female_submissions_preprocessed_txt, show_progress_bar = True)

In [None]:
# fresh representation models for the female run (same configuration as the male run)
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity = 0.3)
# Llama 2 labels: the text-generation pipeline driven by the labelling prompt defined above
llama2 = TextGeneration(generator, prompt = prompt)

# name -> representation model
representation_model = dict(KeyBERT=keybert, Llama2=llama2, MMR=mmr)

In [None]:
# fresh vectorizer (English stop words removed) and c-TF-IDF weighting for the female run
vectorizer_model = OnlineCountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

In [None]:
# grid search over the UMAP x HDBSCAN combinations for the female submissions;
# every result is appended to female_parameter_performance.txt as soon as it is known
best_score = -np.inf
best_params = None

with open('female_parameter_performance.txt', 'w') as log_file:
    for params in param_grid:
        umap_params = params['umap']
        hdbscan_params = params['hdbscan']
        try:
            score = evaluate_model(umap_params, hdbscan_params, female_submissions_preprocessed_txt, embeddings)
        except ValueError as error:
            # e.g. the silhouette score is undefined for a degenerate clustering: record the
            # failure and keep going instead of losing the rest of a very long search
            log_file.write(f"UMAP Params: {umap_params}, HDBSCAN Params: {hdbscan_params}, Score: failed ({error})\n")
            log_file.flush()
            continue

        # record the performance of each parameter combination
        log_file.write(f"UMAP Params: {umap_params}, HDBSCAN Params: {hdbscan_params}, Score: {score}\n")
        log_file.flush()  # ensure the data is written to the file immediately

        if score > best_score:
            best_score = score
            best_params = params

print(f"Best Score: {best_score}")
print(f"Best Parameters: {best_params}")

In [None]:
# Optional: build the UMAP and HDBSCAN models directly from the best parameters found by the hyperparameter search above.
# Uncomment the lines below only after the search has finished; the search can take a few days even on a powerful GPU (not the lab GPU).
# Note that the following cells read `umap_model` and `hdbscan_model`, so rename these variables (or adapt those cells) when using this path.

#best_umap_params = best_params['umap']
#best_hdbscan_params = best_params['hdbscan']
#best_umap_model = UMAP(**best_umap_params)
#best_hdbscan_model = HDBSCAN(**best_hdbscan_params)

In [None]:
# manually fixed hyperparameters for dimensionality reduction and clustering
# (chosen from the tuning run; the per-combination scores are in the parameter-performance text file)
# if the tuned-parameter cell above is used instead, comment out the two model definitions below
umap_settings = dict(n_neighbors=5, n_components=2, min_dist=0.0, metric='euclidean', random_state=42)
hdbscan_settings = dict(min_cluster_size=120, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

umap_model = UMAP(**umap_settings)
hdbscan_model = HDBSCAN(**hdbscan_settings)

# a second UMAP instance with the same settings is fitted here to get the 2-D coordinates used for plotting;
# `umap_model` itself is handed to BERTopic unfitted
reduced_embeddings = UMAP(**umap_settings).fit_transform(embeddings)

# assemble BERTopic from the sub-models defined above
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    top_n_words=10,
    verbose=True,
)

# fit on the preprocessed submission texts, reusing the precomputed embeddings
topics, probs = topic_model.fit_transform(female_submissions_preprocessed_txt, embeddings)

In [None]:
# summary table of the fitted topics
topic_info = topic_model.get_topic_info()

# lookup table: value of the 'Topic' column -> value of the 'Name' column
topic_labels = dict(zip(topic_info['Topic'], topic_info['Name']))

# attach the assigned topic id to every submission, then translate it to its name
# (assumes `topics` is in the same row order as `female_submissions` - TODO confirm)
female_submissions['topic_id'] = topics
female_submissions['topic_label'] = female_submissions['topic_id'].map(topic_labels)

In [None]:
# read the combined female comments export
comments_csv = "/home/haters/Downloads/loaded_data/Combined_data_29Apr/combined_female_comments.csv"
female_comments = pd.read_csv(comments_csv)

# only the join key and the topic columns are taken from the submissions frame
submission_topics = female_submissions[['name', 'topic_id', 'topic_label']]

# left join on comment 'link_id' == submission 'name': every comment is kept, and comments without a
# matching submission get NaN in the topic columns; the join key 'name' is dropped afterwards
merge_female_comments_submissions = (
    female_comments
    .merge(submission_topics, left_on='link_id', right_on='name', how='left')
    .drop(columns=['name'])
)

In [None]:
# comments that the left merge could not match to a submission, i.e. rows still lacking a topic id or label
missing_mask = (
    merge_female_comments_submissions['topic_id'].isna()
    | merge_female_comments_submissions['topic_label'].isna()
)

# Work on an independent copy and remember, in an explicit column, which row of the merged frame each
# record came from. The index is reset before preprocessing (as before), so without this column the link
# back to the merged frame is lost: the previous `update()` call aligned on the reset 0..k-1 index and
# therefore overwrote the FIRST k rows of the merged frame instead of the rows that were missing a topic.
merged_row_col = '_merged_row'
missing_topic = merge_female_comments_submissions.loc[missing_mask].copy()
missing_topic[merged_row_col] = missing_topic.index
missing_topic = missing_topic.reset_index(drop=True)

# clean the comment text with the notebook's preprocess helper (defined earlier in the notebook)
# NOTE(review): this relies on preprocess() keeping the existing columns - confirm against its definition
missing_topic = preprocess(missing_topic, 'comments')
if merged_row_col not in missing_topic.columns:
    raise KeyError(
        f"preprocess() did not keep the '{merged_row_col}' column; "
        "predicted topics cannot be mapped back to the merged rows"
    )

# predict a topic (and its probability) for every unmatched comment with the fitted BERTopic model
comment_topics, comment_probs = topic_model.transform(missing_topic['preprocessed_txt'])
missing_topic['topic_id'] = comment_topics
missing_topic['topic_prob'] = comment_probs

# write the predicted ids back to exactly the rows they belong to; rows removed by preprocess() (if any)
# are simply not written and keep their NaN
# 'topic_prob' stays on `missing_topic` only: the merged frame has no such column, and the previous
# `update()` call never copied it either, so the exported columns are unchanged
merge_female_comments_submissions.loc[
    missing_topic[merged_row_col].to_numpy(), 'topic_id'
] = missing_topic['topic_id'].to_numpy()

# the helper column has served its purpose
missing_topic = missing_topic.drop(columns=[merged_row_col])

# replace every remaining NaN (in all columns, including 'topic_label' of the rows handled above) with the
# sentinel -99, which the topic-merging cell maps to the "others" group
merge_female_comments_submissions.fillna(-99, inplace=True)

In [None]:
# manual consolidation of the BERTopic topics into 8 merged topics
# layout: merged topic id -> original topic ids that are folded into it
merged_topic_groups = {
    1: (            # Concert and Ticket Issues
        -1,         #   Concert and Ticket Issues
        2,          #   Concert and Ticket Issues (Resale)
        3,          #   Concert and Ticket Issues (General)
        5,          #   Concert and Ticket Issues (Shipping)
        12,         #   Concert and Ticket Issues (Discussion)
        14,         #   Concert and Ticket Discussion (General)
        15,         #   Concert and Tour Discussion (Tickets)
        20,         #   Concert and Ticket (General)
    ),
    2: (            # Concert and Fashion
        7,          #   Concert and Fashion
        8,          #   Concert and Fashion (Celebration and Conversation)
        9,          #   Concert Posters and Wallpapers (Fashion-Related)
        10,         #   Concert and Fashion (Outfits)
        18,         #   Concert and Fashion (Crewnecks)
    ),
    3: (4,),        # Merchandise Wanted
    4: (11,),       # Concert and Friendship Bracelets
    5: (19,),       # Covers and Remixes
    6: (            # Art and Creativity
        1,          #   Art and Creativity (Vinyl, CD, Art)
        13,         #   Art and Creativity (Tattoo and Art)
        17,         #   Art and Creativity (General)
    ),
    7: (21,),       # Leak and Music
    8: (            # Others undefineable topics
        0,          #   Others undefineable topics (random)
        6,          #   Others undefineable topics
        16,         #   Others undefineable topics (Bias Poll)
        -99,        #   Others undefineable topics (sentinel for rows without any topic)
    ),
}

# flatten to the lookup actually used for remapping: original topic id -> merged topic id
merged_topics = {
    original_id: merged_id
    for merged_id, original_ids in merged_topic_groups.items()
    for original_id in original_ids
}

# human-readable description of every merged topic
topic_labels = {
    1: 'Concert and Ticket Issues: Discussions about buying, selling, and handling concert tickets, including resale, shipping, and general availability.',
    2: 'Concert and Fashion: Discussions about fashion choices for concerts, including outfits, jackets, and general concert-related fashion.',
    3: 'Merchandise Wanted: Conversations around buying and selling artist-related merchandise like hoodies, shirts, and other items.',
    4: 'Concert and Friendship Bracelets: Topics discussing the creation and exchange of friendship bracelets related to concerts.',
    5: 'Covers and Remixes: Discussions about cover versions of songs and remixes in the music community.',
    6: 'Art and Creativity: Conversations focused on tattoos, artwork, and other creative expressions related to music and concerts.',
    7: 'Leak and Music: Discussions and speculations around leaked music content, including upcoming releases and snippets.',
    8: 'Others undefineable topics'
}

# remap ids and refresh labels, first on the submissions and then on the merged comments frame
# NOTE: 'topic_id' is overwritten in place and the merged ids (1-8) are themselves keys of `merged_topics`,
# so running this cell a second time remaps the already-merged ids again - run it once per kernel session
for _frame in (female_submissions, merge_female_comments_submissions):
    _frame['topic_id'] = _frame['topic_id'].replace(merged_topics)
    _frame['topic_label'] = _frame['topic_id'].map(topic_labels)

In [None]:
# save the merged comments frame (including its topic id and label columns) as CSV, without the index column
output_csv = "merge_female_comments_submissions.csv"
merge_female_comments_submissions.to_csv(output_csv, index=False)

In [None]:
# plot every fitted document at its precomputed 2-D UMAP position, coloured by topic
# `docs` has to be the sequence of document strings the model was fitted on: BERTopic looks documents up by
# integer position (docs[i]) to build the hover text, so passing the `female_submissions` DataFrame here
# would be treated as a column lookup and fail
topic_model.visualize_documents(
    list(female_submissions_preprocessed_txt),
    reduced_embeddings=reduced_embeddings,
    hide_annotations=True,
    hide_document_hover=False,
    custom_labels=True,
)