# Coherence score

In [None]:
# import needed libraries and modules for text preprocessing and topic modeling by using LDA algorithm (Latent Dirichlet Allocation)
import re
import nltk
import torch
import emoji
import random
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from gensim.models import CoherenceModel
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# fetch the nltk resources used below: the POS tagger, WordNet and the stop word list
for nltk_resource in ('averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# seed every random number generator in use so repeated runs give the same results
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
# ask cuDNN for deterministic kernels and switch off its auto-tuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

#### Text Preprocessing

In [None]:
# define Text Preprocessor class
class TextPreprocessor:
    """Turn raw post text into a whitespace-joined string of lemmatized tokens.

    The pipeline lowercases the text, removes singer names and aliases,
    converts emojis to their textual names, drops punctuation and digits,
    filters English stop words and non-alphanumeric tokens, and lemmatizes
    what is left.
    """

    # Whole-word rewrites applied (in this order) after name removal.
    # Word boundaries keep 'merch' from rewriting the inside of 'merchandise'
    # and 'wts' from rewriting the inside of words such as 'newts'.
    # NOTE(review): 'austin' -> 'album' is carried over unchanged from the
    # original mapping; presumably it refers to an album title — confirm.
    _WORD_REWRITES = (
        (re.compile(r'\baustin\b'), 'album'),
        (re.compile(r'\bwts\b'), 'want to sell'),
        (re.compile(r'\bmerch\b'), 'merchandise'),
    )

    # Names at least this long also match with a trailing "s", so that entries
    # such as 'swiftie' / 'newjean' still remove 'swifties' / 'newjeans'.
    # Shorter aliases ('v', 'ed', 'ari', ...) must match exactly, otherwise
    # ordinary words like 'vs' would be deleted.
    _PLURAL_MIN_LENGTH = 4

    def __init__(self):
        """Create the lemmatizer, stop word set, tokenizer and name pattern."""
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = WordPunctTokenizer()
        # singer names and aliases to strip from the text so they do not dominate the topics
        self.singer_names = [
            "Bad Bunny", "El Conejo Malo", "The Weeknd", "Abel", "Abel Tesfaye",
            "Morgan Wallen", "Ed Sheeran", "Ginger Jesus", "Ed", "Drake",
            "Drizzy", "Champagne Papi", "Aubrey", "Harry Styles", "Feid",
            "Imagine Dragons", "Dan Reynolds", "Ben McKee", "Daniel Wayne Sermon",
            "Daniel Platz Platzman", "Post Malone", "Posty", "BTS", "Bangtan",
            "Bangtan Sonyeondan", "Tannies", "RM", "Jin", "Suga", "J-Hope",
            "Jimin", "V", "Jungkook", "harry", "justin bieber", "bieber", "justin", "namjoon",
            "Taylor Swift", "T-Swift", "TayTay", "Taylor", "Miss Americana", "SZA", "Solana Imani Row",
            "Miley Cyrus", "Miley", "Hannah Montana", "New Jeans", "Minji", "Hanni", "Danielle New Jeans", "Haerin", "Hyein",
            "Dua Lipa", "Dua", "Dula Peep", "Olivia Rodrigo", "Liv", "Ariana Grande", "Ari", "Ariana", "Ms Grande",
            "Billie Eilish", "Billie", "Rihanna", "RiRi", "Badgalriri", "Adele", "swiftie", "Oliia", "newjean",
            "rodrigo", "rodrigos"
        ]
        # names are lowercased because preprocess_text lowercases the text before matching
        self.singer_names = [name.lower() for name in self.singer_names]
        # compiled once here so every document reuses the same pattern
        self._name_pattern = self._compile_name_pattern(self.singer_names)

    @staticmethod
    def _compile_name_pattern(names):
        """Build one regex that matches any of ``names`` as a whole word.

        Alternatives are ordered longest first so a full name such as
        'abel tesfaye' is consumed before its shorter alias 'abel'.
        """
        ordered = sorted({name for name in names if name}, key=len, reverse=True)
        if not ordered:
            # a pattern that can never match, so substitution is a no-op
            return re.compile(r'(?!)')
        min_len = TextPreprocessor._PLURAL_MIN_LENGTH
        alternatives = [
            re.escape(name) + ('s?' if len(name) >= min_len else '')
            for name in ordered
        ]
        return re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b')

    def preprocess_text(self, text):
        """Return the cleaned, lemmatized form of ``text`` as a single string.

        Non-string input (for example NaN from a missing CSV cell) yields an
        empty string instead of raising.
        """
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = self.remove_singer_names(text)
        # emojis become their textual names, e.g. ':fire:'
        text = emoji.demojize(text)
        text = re.sub(r'[^\w\s]', '', text)  # drop punctuation
        text = re.sub(r'\d+', '', text)  # drop numbers
        tokens = self.tokenizer.tokenize(text)
        # isalnum() also discards tokens containing underscores
        tokens = [word for word in tokens if word not in self.stop_words and word.isalnum()]
        tokens = [self.lemmatize_token(word) for word in tokens]
        return " ".join(tokens)

    def remove_singer_names(self, text):
        """Delete singer names from ``text`` and expand a few abbreviations.

        Names are matched as whole words only; a name is replaced by the
        empty string, so surrounding whitespace is left untouched. Matching
        is case-sensitive against the lowercased name list, so callers are
        expected to pass lowercased text (as preprocess_text does).
        """
        text = self._name_pattern.sub('', text)
        for pattern, replacement in self._WORD_REWRITES:
            text = pattern.sub(replacement, text)
        return text

    def lemmatize_token(self, token):
        """Lemmatize one token using the POS tag nltk assigns to it in isolation."""
        tag = self.get_wordnet_pos(nltk.pos_tag([token])[0][1])
        # tokens whose tag has no WordNet equivalent are returned unchanged
        return self.lemmatizer.lemmatize(token, pos=tag) if tag else token

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        """Map a Penn Treebank tag to the matching WordNet POS constant, or None."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('N'):
            return wordnet.NOUN
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        return None

In [None]:
# define a function to remove hyperlinks from text data for better topic modeling results
def remove_hyperlinks(text):
    """Return ``text`` with hyperlinks removed.

    Every run of non-whitespace characters that starts at 'http' or 'www'
    is replaced by the empty string; surrounding whitespace is kept.
    """
    link_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    return link_pattern.sub('', text)

In [None]:
# define a function to preprocess text data for topic modeling tasks using the TextPreprocessor class and methods
def preprocess(df, data_type):
    """Add a 'preprocessed_txt' column to ``df`` right after its text column.

    Args:
        df: DataFrame holding the raw text; it is modified in place.
        data_type: 'comments' (text is read from 'body') or 'submissions'
            (text is read from 'combined_text').

    Returns:
        The same DataFrame, now containing 'preprocessed_txt'.

    Raises:
        ValueError: if ``data_type`` is not one of the supported values.

    Relies on the module-level ``preprocessor`` (a TextPreprocessor instance)
    having been created before this function is called.
    """
    text_columns = {'comments': 'body', 'submissions': 'combined_text'}
    if data_type not in text_columns:
        raise ValueError(
            f"data_type must be one of {sorted(text_columns)}, got {data_type!r}"
        )
    text = text_columns[data_type]
    preprocessed_text = df[text].apply(preprocessor.preprocess_text)
    if 'preprocessed_txt' in df.columns:
        # re-running on an already processed frame: overwrite rather than let
        # DataFrame.insert fail on the duplicate column name
        df['preprocessed_txt'] = preprocessed_text
    else:
        col_index = df.columns.get_loc(text) + 1
        df.insert(col_index, 'preprocessed_txt', preprocessed_text)
    return df

In [None]:
# import female and male submission dataset files for text analysis and topic modeling tasks
male_submissions = pd.read_csv("/home/haters/Downloads/Toxicity_Detection/output_perspective/output_score/male_submissions_outcome_final.csv")
female_submissions = pd.read_csv("/home/haters/Downloads/Toxicity_Detection/output_perspective/output_score/female_submissions_outcome_final.csv")

# combine title and selftext columns into a single column for text analysis and topic modeling tasks in the dataset
# missing values are replaced with empty strings first: pandas loads empty CSV cells as NaN, and
# str + NaN is NaN, which would discard the title and make remove_hyperlinks fail on a non-string value
male_submissions['combined_text'] = male_submissions['title'].fillna('').astype(str) + " " + male_submissions['selftext'].fillna('').astype(str)
female_submissions['combined_text'] = female_submissions['title'].fillna('').astype(str) + " " + female_submissions['selftext'].fillna('').astype(str)

# remove hyperlinks from text data in the combined_text column for better topic modeling results
male_submissions["combined_text"] = male_submissions["combined_text"].apply(remove_hyperlinks)
female_submissions["combined_text"] = female_submissions["combined_text"].apply(remove_hyperlinks)

In [None]:
# create the shared TextPreprocessor instance that preprocess() looks up by name
preprocessor = TextPreprocessor()

# add the 'preprocessed_txt' column to the male and then the female submission dataset
male_submissions, female_submissions = (
    preprocess(submission_frame, 'submissions')
    for submission_frame in (male_submissions, female_submissions)
)

#### Define LDA Topic Modeling Function and Compute Coherence Scores for Different Numbers of Topics in the Dataset (Visualization and Interpretation too)

In [None]:
def lda_topic_modeling(texts, num_topics, stop_words='english', max_features=1000):
    """Fit a bag-of-words LDA model on ``texts``.

    Args:
        texts: iterable of documents, one string each.
        num_topics: number of LDA components to fit.
        stop_words: forwarded to CountVectorizer.
        max_features: vocabulary size cap, forwarded to CountVectorizer.

    Returns:
        A tuple ``(lda_model, lda_output, vectorizer)``: the fitted LDA
        model, the result of its ``fit_transform`` on the document-term
        matrix, and the fitted CountVectorizer.
    """
    vectorizer = CountVectorizer(
        max_df=0.95,
        min_df=2,
        stop_words=stop_words,
        max_features=max_features,
    )
    lda_model = LatentDirichletAllocation(
        n_components=num_topics,
        max_iter=10,
        learning_method='online',
        random_state=100,
    )
    # documents -> term counts -> per-document topic weights
    doc_term_matrix = vectorizer.fit_transform(texts)
    doc_topic_weights = lda_model.fit_transform(doc_term_matrix)
    return lda_model, doc_topic_weights, vectorizer

def compute_coherence_values(texts, vectorizer, start=2, limit=15, step=1):
    """Fit one LDA model per topic count and score each with c_v coherence.

    Args:
        texts: list of preprocessed documents (whitespace-separated tokens).
        vectorizer: unused; kept only so existing callers keep working. Each
            model is fitted with the CountVectorizer that lda_topic_modeling
            creates internally.
        start: first number of topics to try (inclusive).
        limit: upper bound on the number of topics (exclusive).
        step: increment between successive topic counts.

    Returns:
        A tuple ``(model_list, coherence_values)`` with one fitted model and
        one coherence score per value in ``range(start, limit, step)``.
    """
    # the tokenized corpus and its dictionary do not depend on the number of
    # topics, so build them once instead of on every iteration
    tokenized_texts = [text.split() for text in texts]
    dictionary = gensim.corpora.Dictionary(tokenized_texts)

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        lda_model, _, fitted_vectorizer = lda_topic_modeling(texts, num_topics)
        model_list.append(lda_model)
        # describe every topic by its 10 highest-weighted terms, the form gensim expects
        feature_names = fitted_vectorizer.get_feature_names_out()
        topics = [[feature_names[i] for i in topic.argsort()[:-11:-1]] for topic in lda_model.components_]
        # score the topics with gensim's 'c_v' coherence measure
        coherence_model_lda = CoherenceModel(topics=topics, texts=tokenized_texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model_lda.get_coherence())
    return model_list, coherence_values

In [None]:
# make sure every preprocessed_txt entry is a string: missing values become '' and the column is cast to str
for submissions_frame in (male_submissions, female_submissions):
    filled_column = submissions_frame['preprocessed_txt'].fillna('')
    submissions_frame['preprocessed_txt'] = filled_column.astype(str)

In [None]:
# score LDA models over the default topic range, first for male then for female submissions
male_texts = male_submissions['preprocessed_txt'].tolist()
female_texts = female_submissions['preprocessed_txt'].tolist()
model_list_male, coherence_values_male = compute_coherence_values(
    male_texts, CountVectorizer(stop_words='english')
)
model_list_female, coherence_values_female = compute_coherence_values(
    female_texts, CountVectorizer(stop_words='english')
)

In [None]:
# draw coherence against the number of topics for both datasets on one chart
x = range(2, 15, 1)
fig, ax = plt.subplots(figsize=(10, 5))
# one line per dataset so the best topic counts can be compared directly
for scores, series_label in (
    (coherence_values_male, "Male Submissions"),
    (coherence_values_female, "Female Submissions"),
):
    ax.plot(x, scores, label=series_label, marker='o')
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence Score")
ax.legend()
ax.set_title("Coherence Scores for Different Numbers of Topics")
plt.show()

In [None]:
# locate the position of the highest coherence score in each list
best_position_male = coherence_values_male.index(max(coherence_values_male))
best_position_female = coherence_values_female.index(max(coherence_values_female))

# translate each position back into the number of topics it was computed for
optimal_num_topics_male = x[best_position_male]
optimal_num_topics_female = x[best_position_female]

# report the chosen number of topics together with its coherence score
print(f"Optimal number of topics for male submissions: {optimal_num_topics_male, max(coherence_values_male)}")
print(f"Optimal number of topics for female submissions: {optimal_num_topics_female, max(coherence_values_female) }")

In [None]:
# define a function to plot the top words for each topic in the LDA model for topic modeling tasks in the dataset
def plot_top_words(lda_model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its top-weighted words.

    Args:
        lda_model: fitted model exposing ``n_components`` and ``components_``.
        feature_names: vocabulary indexed like the columns of ``components_``.
        n_top_words: number of words to show for each topic.
        title: figure title.
    """
    num_topics = lda_model.n_components
    # squeeze=False always yields a 2-D array of axes; without it a
    # single-topic model returns a bare Axes object and flatten() fails
    fig, axes = plt.subplots(1, num_topics, figsize=(30, 15), sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(lda_model.components_):
        # indices of the n_top_words largest weights, in descending order
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]
        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx + 1}', fontdict={'fontsize': 30})
        # put the highest-weighted word at the top of the chart
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [None]:
# fit the final LDA model for each dataset using its best-scoring number of topics
male_texts = male_submissions['preprocessed_txt'].tolist()
female_texts = female_submissions['preprocessed_txt'].tolist()
lda_model_male, lda_output_male, vectorizer_male = lda_topic_modeling(
    male_texts, optimal_num_topics_male
)
lda_model_female, lda_output_female, vectorizer_female = lda_topic_modeling(
    female_texts, optimal_num_topics_female
)

# display topics
def display_topics(model, feature_names, no_top_words):
    """Print a 'Topic N:' header and the top words of every topic in ``model``.

    Words are listed from highest to lowest weight, separated by spaces.
    """
    for topic_number, word_weights in enumerate(model.components_, start=1):
        strongest_first = word_weights.argsort()[:-no_top_words - 1:-1]
        top_words = [feature_names[position] for position in strongest_first]
        print(f"Topic {topic_number}:")
        print(" ".join(top_words))

# print the 10 highest-weighted words of every topic, male submissions first
no_top_words = 10
for heading, fitted_model, fitted_vectorizer in (
    ("\nTopics in Male Submissions:", lda_model_male, vectorizer_male),
    ("\nTopics in Female Submissions:", lda_model_female, vectorizer_female),
):
    print(heading)
    display_topics(fitted_model, fitted_vectorizer.get_feature_names_out(), no_top_words)

In [None]:
# chart the 10 highest-weighted words of every topic, male submissions first
n_top_words = 10
for fitted_model, fitted_vectorizer, figure_title in (
    (lda_model_male, vectorizer_male, f'Topics in Male Submissions ({optimal_num_topics_male} Topics)'),
    (lda_model_female, vectorizer_female, f'Topics in Female Submissions ({optimal_num_topics_female} Topics)'),
):
    plot_top_words(fitted_model, fitted_vectorizer.get_feature_names_out(), n_top_words, figure_title)