# **NLP Research Project** - BERTopic Total Visualizations
## Description
- This program loads the CSV file generated by pushShift.py together with a previously trained (pickled) BERTopic model, then prints and visualizes the major topics of the target dataset.

Author: Joseph A. Tomasello

In [None]:
!pip install pandas bertopic nltk

In [None]:
# GPU selection for the Kean University GPU server: number CUDA devices by
# PCI bus ID and expose only device 3 to this process.
import os

os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

In [None]:
import os
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.plotting._hierarchical_documents import visualize_hierarchical_documents
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import pickle

def _load_posts(csv_path):
    """Read the posts CSV and return only the rows with usable text.

    Rows whose ``selftext`` is missing or equals one of the placeholders
    ``[deleted]`` / ``[removed]`` are dropped, and the ``datetime`` column is
    parsed with ``pd.to_datetime``.
    """
    posts = pd.read_csv(csv_path, usecols=['datetime', 'selftext'])
    usable = posts['selftext'].notna() & ~posts['selftext'].isin(['[deleted]', '[removed]'])
    # Take an explicit copy so the column assignment below writes to an
    # independent frame instead of a slice of the raw one.
    posts = posts.loc[usable].copy()
    posts['datetime'] = pd.to_datetime(posts['datetime'])
    return posts


def _report_text_stats(df):
    """Print sanity-check statistics for the ``selftext`` column of *df*."""
    print("Number of missing values:", df['selftext'].isnull().sum())
    print("Max length of a document:", df['selftext'].str.len().max())

    # NOTE: no truncation is performed in this script; the label is kept
    # unchanged so the printed output stays the same.
    print("Document lengths after truncation:")
    print(df['selftext'].apply(len).describe())

    print("Number of NaN values:", df['selftext'].isna().sum())

    try:
        # np.isinf is meant for numeric data; on a text column it is expected
        # to raise TypeError, which is reported instead of aborting the run.
        print("Number of infinite values:", np.isinf(df['selftext']).sum())
    except TypeError:
        print("Unable to check for infinite values due to data type mismatch.")


def _load_model(model_path):
    """Unpickle and return the trained topic model stored at *model_path*."""
    # SECURITY: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)


def main(csv_path='dataset_all_posts_4years_sorted.csv',
         model_path="bertopic_model_2020.pkl",
         start="2020-03-12 00:00:00",
         end="2021-03-12 00:00:00"):
    """Print topic information and show the visualizations of a trained model.

    Args:
        csv_path: CSV file with at least the ``datetime`` and ``selftext``
            columns.
        model_path: Pickle file containing the trained BERTopic model.
        start: Inclusive lower bound of the ``datetime`` window whose posts
            are passed to the model as documents.
        end: Exclusive upper bound of that window.

    The text statistics are computed over the whole cleaned dataset, while
    the documents handed to the model are only those inside ``[start, end)``.
    """
    df = _load_posts(csv_path)
    print(df['datetime'])

    in_window = (df['datetime'] >= start) & (df['datetime'] < end)
    subset_df = df[in_window]

    # Force every document to str so the model receives uniform input.
    # NOTE: if per-document timestamps are needed later (e.g. for topics over
    # time), derive them from subset_df['datetime'] WITHOUT sorting so they
    # stay aligned with ``docs``.
    docs = [str(text) for text in subset_df['selftext']]
    print("Number of documents:", len(docs))

    _report_text_stats(df)

    model = _load_model(model_path)

    # Print topic information: the full table, then its first ten rows.
    topic_info = model.get_topic_info()
    print(topic_info)
    print(topic_info.head(10))

    # Intertopic distance
    model.visualize_topics().show()

    # Visualize barchart
    model.visualize_barchart().show()

    # Visualize heatmap
    model.visualize_heatmap().show()

    # Topic hierarchy.
    # NOTE(review): presumably the model was fitted on this same window of
    # posts; confirm that ``docs`` matches the training documents.
    hierarchical_topics = model.hierarchical_topics(docs)
    model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()