In [2]:
# data_loader.py

import pandas as pd
# Location of the CSV file that is later passed to load_data().
# NOTE(review): this is an absolute path inside one user's home directory, so
# the notebook only runs on that machine as written — consider a path relative
# to a configurable data directory.
DATA_PATH = "/home/kw215/Documents/research_codes/Topic-modeling-evaluations/notebooks/mod_actions.csv"
import string

def preprocess_text(text):
    """
    Preprocesses the given text.

    Steps, applied in this order:
    1. lowercase the text,
    2. remove URLs (any run of non-whitespace characters starting with "http"),
    3. remove ASCII punctuation (the characters in ``string.punctuation``),
    4. strip leading and trailing whitespace.

    Non-ASCII characters (e.g. the typographic apostrophe "’"), digits and
    interior whitespace (including newlines) are left untouched.

    Parameters:
    - text: Text to be preprocessed (a ``str``).

    Returns:
    - Preprocessed text.
    """
    # Local import: keeps this function self-contained without touching the
    # notebook's import cell.
    import re

    # Lowercase
    text = text.lower()
    # Remove URLs. This has to be a regex substitution (str.replace would look
    # for the literal characters 'http\S+'), and it has to run before the
    # punctuation step, which would otherwise turn 'http://a.b' into 'httpab'
    # and leave the remains of the URL in the text.
    text = re.sub(r"http\S+", "", text)
    # Remove punctuation (ASCII only)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove leading/trailing whitespace; done last so that whitespace exposed
    # by a removed URL or punctuation mark at either end is trimmed as well.
    text = text.strip()
    return text

def load_data(path):
    """
    Loads the dataset from the given path and preprocesses it.

    Rows are kept only when 'action' is 'removelink' or 'approvelink', 'mod'
    is not 'AutoModerator' and 'target_title' is not '[ Removed by Reddit ]'.
    A 'text' column is then built as title + newline + body, passed through
    preprocess_text, and rows with duplicate 'text' are dropped (the last
    occurrence is kept). The head of the 'text' column is printed as a quick
    sanity check.

    Parameters:
    - path: Path to the dataset file (CSV with at least the columns 'action',
      'mod', 'target_title' and 'target_body').

    Returns:
    - DataFrame containing the processed data.
    """
    df = pd.read_csv(path)

    keep_row = (
        df['action'].isin(['removelink', 'approvelink'])
        & (df['mod'] != 'AutoModerator')
        & (df['target_title'] != '[ Removed by Reddit ]')
    )
    # Take an explicit copy: adding a column to a boolean-indexed slice raises
    # SettingWithCopyWarning and may not behave as intended.
    df = df[keep_row].copy()

    # A missing title or body is read as NaN; concatenating it would make the
    # whole text NaN and preprocess_text would fail on it, so treat missing
    # parts as empty strings instead.
    title = df['target_title'].fillna('')
    body = df['target_body'].fillna('')
    df['text'] = (title + '\n' + body).apply(preprocess_text)

    # Remove duplicates
    df = df.drop_duplicates(subset=['text'], keep='last')
    print(df['text'].head())
    return df

def output_results(topic_model, sub_topic_models):
    """
    Print a plain-text summary of a topic model and its sub-topic models.

    Parameters:
    - topic_model: The main topic model; must provide get_topic_info().
    - sub_topic_models: Dictionary mapping a topic id to the sub-topic model
      fitted for that topic; each model must provide get_topic_info().

    Nothing is returned. This is example output code, intended to be adapted
    to whatever result format is eventually required.
    """
    # Top-level topics first.
    print("Main Topics:")
    main_info = topic_model.get_topic_info()
    print(main_info)

    # Then one header + table per sub-topic model, in dictionary order.
    for entry in sub_topic_models.items():
        parent_id, sub_model = entry
        print(f"Sub-Topics for Topic {parent_id}:")
        sub_info = sub_model.get_topic_info()
        print(sub_info)



In [3]:

# Read, filter and preprocess the CSV at DATA_PATH; load_data also prints the
# first few preprocessed texts.
data = load_data(path=DATA_PATH)


1     good benadryl dosage for beginners\nwhat’s a g...
2     psilocybin mushrooms and possible false memory...
11    am i shadowbanned or hacked  no i’m not on dru...
12    in a hypothetical experiment if someone was gi...
13    mg retarded fucking brother may be in danger w...
Name: text, dtype: object


In [4]:
def format_data_for_topic_modeling(dataset, vocab_path='vocabulary.txt', corpus_path='corpus.tsv'):
    """
    Write a tokenised corpus and its vocabulary to two text files.

    Parameters:
    - dataset: Sequence of documents, each document being a sequence of word
      strings. It is iterated twice, so it must be re-iterable (e.g. a list,
      not a generator).
    - vocab_path: Output file for the vocabulary; one unique word per line,
      sorted. Defaults to 'vocabulary.txt' in the current working directory.
    - corpus_path: Output file for the corpus; one document per line, words
      joined by a single space. Defaults to 'corpus.tsv' in the current
      working directory.

    Returns:
    - None. Both files are overwritten if they already exist.
    """
    # Flatten the dataset to find the unique words and sort them
    unique_words = sorted({word for doc in dataset for word in doc})

    # Write explicitly as UTF-8: the texts can contain non-ASCII characters,
    # and the platform default encoding is not guaranteed to represent them.
    with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
        vocab_file.writelines(f"{word}\n" for word in unique_words)

    # Write the corpus using the actual words, one document per line
    with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
        corpus_file.writelines(' '.join(doc) + '\n' for doc in dataset)


In [5]:
# Tokenise every preprocessed text on whitespace, then write the vocabulary
# and corpus files from the resulting lists of words.
documents = [doc_text.split() for doc_text in data['text'].tolist()]
format_data_for_topic_modeling(dataset=documents)
