# **NLP Research Project** - BERTopic Time Series Visualizations
## Description
- This program uses the CSV file generated by pushShift.py, together with a trained BERTopic model, to extract the major topics at monthly intervals from a 1-year subset of the dataset. The results are then visualized over that 1-year span as a line graph.

Author: Joseph A. Tomasello

In [None]:
# Notebook shell escape: install the third-party packages imported by the cells below.
!pip install pandas bertopic nltk

In [None]:
# GPU selection for the Kean University GPU server.
import os

os.environ.update(
    {
        # Enumerate CUDA devices in PCI bus order.
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        # Expose only devices 1, 2 and 3 to CUDA.
        "CUDA_VISIBLE_DEVICES": "1,2,3",
    }
)

In [None]:
import os
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.plotting._hierarchical_documents import visualize_hierarchical_documents
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import pickle

def main():
    """Plot BERTopic topics over time for a one-year slice of the post dataset.

    Reads the ``datetime`` and ``selftext`` columns from
    ``dataset_all_posts_4years_sorted.csv``, discards posts whose body is
    missing, ``[deleted]`` or ``[removed]``, keeps posts dated from
    2020-03-12 (inclusive) to 2021-03-12 (exclusive), labels each remaining
    post with its calendar month, loads the pickled model from
    ``bertopic_model_2020.pkl`` and shows the topics-over-time figure.

    Returns:
        None. The only outputs are a printed document count and the figure.
    """
    # Load only the two columns this analysis needs.
    df = pd.read_csv('dataset_all_posts_4years_sorted.csv', usecols=['datetime', 'selftext'])

    # Drop placeholder and missing post bodies with a single mask, then copy
    # so the column assignment below does not write into a view of the
    # original frame.
    has_text = df['selftext'].notna() & ~df['selftext'].isin(['[deleted]', '[removed]'])
    df = df[has_text].copy()

    df['datetime'] = pd.to_datetime(df['datetime'])

    # One-year window: start inclusive, end exclusive.
    in_window = (df['datetime'] >= "2020-03-12 00:00:00") & (df['datetime'] < "2021-03-12 00:00:00")
    subset_df = df[in_window]

    docs = [str(body) for body in subset_df['selftext']]

    # One "YYYY-MM" label per document ('M' is the documented monthly period
    # alias). The labels are deliberately NOT sorted on their own: sorting
    # only this list would detach each label from its document whenever the
    # CSV rows are not already in chronological order.
    timestamps = [str(month) for month in subset_df['datetime'].dt.to_period('M')]
    print("# of Timestamps:", len(timestamps))

    # NOTE: unpickling can execute arbitrary code, so only load a model file
    # from a trusted source. The context manager closes the file handle.
    filename = "bertopic_model_2020.pkl"
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)

    # Visualize topics over time
    topics_over_time = model.topics_over_time(docs, timestamps, nr_bins=20)
    model.visualize_topics_over_time(topics_over_time).show()

# Run the analysis only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()