In [2]:
import json
from gensim import corpora, models
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Specify the path to your watch history file
watch_history_file = '/home/lrozinskas/CS128/Data_folder/history/watch-history.json'

# Read the JSON data
with open(watch_history_file, 'r') as f:
    data = json.load(f)

# Extract titles from the data
titles = [entry['title'] for entry in data]

# Tokenize and preprocess the data, removing stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('watched')
tokenized_titles = [word_tokenize(title.lower()) for title in titles]
filtered_tokenized_titles = [[word for word in tokens if word.isalnum() and word not in stop_words] for tokens in tokenized_titles]

# Create a dictionary and a corpus
dictionary = corpora.Dictionary(filtered_tokenized_titles)
corpus = [dictionary.doc2bow(tokens) for tokens in filtered_tokenized_titles]

# Build the LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print the top words for each topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(f"Topic {topic[0] + 1}: {topic[1]}")

# Visualize the topics using pyLDAvis
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lrozinskas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/lrozinskas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic 1: 0.039*"nba" + 0.025*"game" + 0.021*"lakers" + 0.019*"undisputed" + 0.016*"first"
Topic 2: 0.033*"f1" + 0.023*"golf" + 0.015*"shorts" + 0.014*"10" + 0.013*"good"
Topic 3: 0.031*"test" + 0.030*"taste" + 0.014*"food" + 0.014*"shorts" + 0.014*"stephen"
Topic 4: 0.021*"diamond" + 0.020*"2021" + 0.019*"highlights" + 0.017*"grand" + 0.016*"prix"
Topic 5: 0.035*"mlb" + 0.029*"show" + 0.027*"21" + 0.019*"team" + 0.018*"https"
