In [1]:
import json
from gensim import corpora, models
from pprint import pprint
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Specify the path to your watch history file
watch_history_file = '/home/lrozinskas/CS128/Data_folder/history/watch-history.json'

# Read the JSON data
with open(watch_history_file, 'r') as f:
    data = json.load(f)

# Extract titles from the data
titles = [entry['title'] for entry in data]

# Tokenize and preprocess the data, removing stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('watched')
tokenized_titles = [word_tokenize(title.lower()) for title in titles]
filtered_tokenized_titles = [[word for word in tokens if word.isalnum() and word not in stop_words] for tokens in tokenized_titles]

# Create a dictionary and a corpus
dictionary = corpora.Dictionary(filtered_tokenized_titles)
corpus = [dictionary.doc2bow(tokens) for tokens in filtered_tokenized_titles]

# Build the LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print the top words for each topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(f"Topic {topic[0] + 1}: {topic[1]}")

# Visualize the topics using pyLDAvis
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lrozinskas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/lrozinskas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic 1: 0.023*"https" + 0.022*"2021" + 0.019*"grand" + 0.019*"prix" + 0.018*"shorts"
Topic 2: 0.033*"mlb" + 0.028*"f1" + 0.025*"show" + 0.024*"21" + 0.019*"team"
Topic 3: 0.025*"diamond" + 0.018*"dynasty" + 0.015*"perfect" + 0.013*"dude" + 0.011*"battle"
Topic 4: 0.023*"nba" + 0.021*"undisputed" + 0.014*"lebron" + 0.013*"shorts" + 0.013*"shannon"
Topic 5: 0.022*"game" + 0.019*"nba" + 0.014*"test" + 0.014*"taste" + 0.014*"shorts"
