# **NLP Research Project** - Training the BERTopic Model
## Description
- This program reads the CSV file generated by pushShift.py and trains a BERTopic topic model on the text of the posts it contains.

Author: Joseph A. Tomasello

In [None]:
# Install packages for this notebook. The leading "!" is an IPython shell
# escape, so this line is only valid inside a notebook cell, not in a plain
# Python script.
!pip install pandas bertopic nltk

In [None]:
# GPU selection for the Kean University GPU server. These variables are set
# here, ahead of the cell that imports torch, so they are already in place
# when CUDA is first initialised.
import os

os.environ.update({
    # Enumerate devices by PCI bus id.
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    # Expose only physical GPUs 1, 2 and 3 to this process.
    "CUDA_VISIBLE_DEVICES": "1,2,3",
})

In [None]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import numpy as np

def load_documents(csv_path, start, end):
    """Return the post bodies dated within ``[start, end)`` as a list of strings.

    Only the ``datetime`` and ``selftext`` columns are read. Rows whose body is
    missing, or is exactly ``[deleted]`` or ``[removed]``, are discarded before
    the date window is applied.

    Args:
        csv_path: Path of the CSV file to read.
        start: Inclusive lower bound of the window; any value that can be
            compared with a pandas datetime column (e.g. a timestamp string).
        end: Exclusive upper bound of the window, same form as ``start``.

    Returns:
        The remaining ``selftext`` values in file order, each coerced to
        ``str``. The list is empty when no row falls inside the window.
    """
    posts = pd.read_csv(csv_path, usecols=['datetime', 'selftext'])
    posts = posts.dropna(subset=['selftext'])
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the frame returned by read_csv.
    posts = posts[~posts['selftext'].isin(['[deleted]', '[removed]'])].copy()
    posts['datetime'] = pd.to_datetime(posts['datetime'])

    in_window = (posts['datetime'] >= start) & (posts['datetime'] < end)
    return posts.loc[in_window, 'selftext'].astype(str).tolist()


def describe_documents(docs):
    """Print the document count and length statistics of ``docs``.

    The figures describe exactly the documents that will be modelled, not the
    full dataset they were selected from.

    Args:
        docs: Non-empty list of document strings.
    """
    lengths = pd.Series([len(doc) for doc in docs])
    print("Number of documents:", len(docs))
    print("Max length of a document:", lengths.max())
    print("Document lengths:")
    print(lengths.describe())


def build_topic_model():
    """Create the unfitted BERTopic model used by :func:`main`.

    Returns:
        A ``BERTopic`` instance configured with an ``all-MiniLM-L6-v2``
        sentence embedder, a 1-3 gram English-stop-word ``CountVectorizer``
        and per-topic probability calculation enabled.
    """
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")

    # The embedder is handed to BERTopic as a plain SentenceTransformer.
    # Wrapping it in torch.nn.DataParallel hides the `encode` method that
    # BERTopic's sentence-transformers backend calls, so the wrapped object
    # cannot serve as an embedding model. If multi-GPU encoding is needed,
    # embeddings can be computed separately and passed to `fit_transform`
    # through its `embeddings` argument.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

    return BERTopic(
        embedding_model=sentence_model,
        vectorizer_model=vectorizer_model,
        language='english',
        calculate_probabilities=True,
    )


def main(
    csv_path='dataset_all_posts_4years_sorted.csv',
    start="2021-03-12 00:00:00",
    end="2022-03-12 00:00:00",
    model_path='bertopic_model_2021.pkl',
):
    """Train a BERTopic model on one date window of posts and pickle it.

    Args:
        csv_path: CSV file containing ``datetime`` and ``selftext`` columns.
        start: Inclusive start of the date window to model.
        end: Exclusive end of the date window to model.
        model_path: File the fitted model is pickled to.

    Raises:
        ValueError: If no usable document falls inside the date window.
    """
    docs = load_documents(csv_path, start, end)
    if not docs:
        # Fail early with a clear message instead of letting the fit step
        # error out on an empty corpus.
        raise ValueError(
            f"No documents found in {csv_path!r} between {start} and {end}."
        )
    describe_documents(docs)

    model = build_topic_model()
    # The returned topic assignments and probabilities are not needed here;
    # the fitted model itself is what gets saved and reported.
    model.fit_transform(docs)

    # Save the model
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    # Print topic information
    print(model.get_topic_info())

# Run the training pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()