### Inside the heads of Putnuts
#### An analysis of the Only Murders in the Building TV show subreddit

Milindi Kodikara

#### Step 0 : : Set up

In [None]:
from client import client
import helper
import visualiser

import string

import nltk 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
nltk.download('vader_lexicon')

from datetime import datetime
import calendar
import pandas as pd
import pyLDAvis.lda_model

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Subreddit to analyse; handed to reddit_client.subreddit(...) further down
subreddit_name = "OnlyMurdersHulu"

In [None]:
# Create the API client through the project-local `client` helper
# (presumably an authenticated PRAW Reddit instance — confirm in client.py)
reddit_client = client()

In [None]:
# sanity check: print the account the client reports as the current user
print('Username :: ', reddit_client.user.me())

In [None]:
# Handle for the subreddit under analysis; the post listing below is fetched through it
subreddit = reddit_client.subreddit(subreddit_name)

print('Subreddit :: ', subreddit)

In [None]:
# Pull the subreddit's "new" listing into memory.
# NOTE(review): Reddit listings are presumably capped (around 1000 items), so the oldest
# posts of the window may be missing when this is run long after the premiere — confirm.
all_new_posts = list(subreddit.new(limit=None))

# Analysis window: from the new-season announcement to the end of premiere day.
# calendar.timegm interprets these naive datetimes as UTC.
new_season_announcement_date = datetime(2023, 10, 4)
season_premier = datetime(2024, 8, 27, 23, 59, 59)

timestamp_new_season_announcement_date = calendar.timegm(
    new_season_announcement_date.utctimetuple()
)
timestamp_season_premier = calendar.timegm(
    season_premier.utctimetuple()
)

# Keep only the posts created inside the window (both bounds inclusive)
posts = [
    post
    for post in all_new_posts
    if post.created_utc >= timestamp_new_season_announcement_date
    and post.created_utc <= timestamp_season_premier
]

print('New season announcement date: ', new_season_announcement_date.strftime("%d/%m/%Y"))
print('New season announcement date timestamp: ', timestamp_new_season_announcement_date)

print('Season premier: ', season_premier.strftime("%d/%m/%Y"))
print('Season premier timestamp: ', timestamp_season_premier)

print(f'New posts: {len(all_new_posts)}\nNew posts from the new season announcement leading up to season premier: {len(posts)}')

In [None]:
# Chart colours (hex strings), named after the show's three leads;
# passed to the visualiser/helper plotting calls below
charles = "#c59103"
mabel = "#af221d"
oliver = "#275c4d"

In [None]:
# TweetTokenizer is used rather than word_tokenize because it targets casual social-media text:
# it keeps emoticons, hashtags and @-handles together as single tokens.
tokeniser = TweetTokenizer()
stemmer = nltk.stem.PorterStemmer()

# stopwords.words() raises LookupError when the corpus is not installed, and only the
# vader_lexicon is downloaded during set-up — fetch it here (a no-op when already present)
nltk.download('stopwords')

# add punctuation to stopwords list
stop_words = stopwords.words('english') + list(string.punctuation) + ['rt', 'via', '...', '…', '"', "'", '`', '-']

In [None]:
from datetime import timezone  # cell-local import: only this cell needs an explicit UTC tzinfo

# One inner token list per text (every post and every collected comment)
unprocessed_token_lists = []
processed_token_lists = []

omitb_columns = ['Post', 'Num_comments', 'Author', 'UTC_Date', 'Date', 'Upvote_ratio', 'Unprocessed_tokens', 'Processed_tokens']
# Rows are gathered in a plain list and turned into a DataFrame once, after the loop,
# instead of growing the frame one .loc assignment at a time
omitb_rows = []

for post_position, submission in enumerate(posts):
    # Flag forwarded to helper.process; only set for the first six posts.
    # enumerate() replaces posts.index(submission), which rescanned the list on every iteration.
    print_processing = post_position <= 5
    post_description = submission.selftext
    post_title = submission.title
    post_title_description = post_title + " " + post_description

    utc_date = submission.created_utc
    # Format the day in UTC so 'Date' agrees with 'UTC_Date' and with the UTC window used to
    # select the posts (a bare fromtimestamp() would use this machine's local timezone)
    post_date = datetime.fromtimestamp(utc_date, tz=timezone.utc).strftime("%d/%m/%Y")

    unprocessed_tokens = tokeniser.tokenize(post_title_description)
    unprocessed_token_lists.append(unprocessed_tokens)

    processed_tokens = helper.process(post_title_description, tokeniser, stemmer, stop_words, print_processing)
    processed_token_lists.append(processed_tokens)

    # Resolve every "load more comments" placeholder before iterating
    submission.comments.replace_more(limit=None)
    # NOTE(review): iterating the comment forest directly presumably yields top-level comments
    # only, whereas Num_comments counts replies too — switch to submission.comments.list()
    # if replies should be analysed as well.
    for comment in submission.comments:
        comment_text = comment.body

        unprocessed_comment_tokens = tokeniser.tokenize(comment_text)
        unprocessed_token_lists.append(unprocessed_comment_tokens)
        # Rebind with + (not extend/+=) so the list already stored in
        # unprocessed_token_lists for the post itself is left untouched
        unprocessed_tokens = unprocessed_tokens + unprocessed_comment_tokens

        processed_comment_tokens = helper.process(comment_text, tokeniser, stemmer, stop_words, False)
        processed_token_lists.append(processed_comment_tokens)
        processed_tokens = processed_tokens + processed_comment_tokens

    # submission.author is None for some posts (handled explicitly); store the literal string 'None'
    submission_author = 'None' if submission.author is None else submission.author.name

    # Per-post row: the token columns hold the post's tokens followed by those of its comments
    omitb_rows.append([post_title_description, submission.num_comments, submission_author, utc_date, post_date, submission.upvote_ratio, unprocessed_tokens, processed_tokens])

omitb_df = pd.DataFrame(omitb_rows, columns=omitb_columns)

In [None]:
# Number of rows in the frame (one row is appended per post)
len(omitb_df)

In [None]:
# Peek at the first rows of the frame
omitb_df.head()

In [None]:
# Peek at the last rows of the frame
omitb_df.tail()

In [None]:
# Select the row labelled 0; the list-style indexer keeps the result a one-row DataFrame
# rather than a Series, so the column lookups in the next cells still work
post = omitb_df.loc[[0]]

post

In [None]:
# Summarise the sample post; the trailing [0] picks the value at index label 0 from each column
print('Post: {}\n---------\nAuthor: {}\n---------\nUpvote ratio: {}\n---------\nCreated date: {}\n---------'.format(post['Post'][0], post['Author'][0], post['Upvote_ratio'][0], post['Date'][0]))

In [None]:
# Raw text of the sample post (title and description joined by a space)
post['Post'][0]

In [None]:
# Processed tokens stored for the sample post (its own tokens followed by those of its comments)
post['Processed_tokens'][0]

#### Step 1 : : Exploration

In [None]:
# The frame holds one row per post, so its length is the post count
total_num_posts = len(omitb_df)
print(f'Total number of posts: {total_num_posts}')

In [None]:
# Num_comments stores submission.num_comments for each post; add them up across all posts
total_num_comments = omitb_df['Num_comments'].sum()
print(f'Total number of comments: {total_num_comments}')

In [None]:
# Posts plus comments.
# NOTE(review): this uses the reported comment counts, which may exceed the number of
# comments actually tokenised above — confirm which figure should be reported.
total_data_items = total_num_posts + total_num_comments
print(f'Total data items: {total_data_items}')

In [None]:
# Order the posts oldest-first by creation timestamp and show both ends of the window
ordered_by_date = omitb_df.sort_values(by='UTC_Date')

earliest_posts = ordered_by_date.head()
latest_posts = ordered_by_date.tail()

print(f'Posts at new season announcement:\n{earliest_posts}\n\n')
print(f'Posts at season premier:\n{latest_posts}')

In [None]:
# Posts per date
num_posts_per_date = omitb_df.groupby('Date')['Post'].count()

num_posts_per_date_ordered = num_posts_per_date.reset_index(name='count').sort_values(['count'], ascending=False)
print(f'Posts per date:\n{num_posts_per_date_ordered.head()}')

# Take labels and bar heights from the SAME series: groupby returns its keys sorted, while
# omitb_df['Date'].unique() is in order of appearance, so pairing the two mislabels the bars.
# 'Date' holds dd/mm/YYYY strings (which sort by day-of-month as text), so order the series
# by the parsed date to get a chronological x-axis.
num_posts_per_date_chronological = num_posts_per_date.sort_index(
    key=lambda labels: pd.to_datetime(labels, format='%d/%m/%Y')
)
dates_x = num_posts_per_date_chronological.index.tolist()
num_posts_per_date_y = num_posts_per_date_chronological.tolist()

visualiser.generate_bar_chart(dates_x, num_posts_per_date_y, charles, 'Number of posts per date', 'Dates', 'Number of posts')

In [None]:
# Posts per author
num_posts_per_author = omitb_df.groupby('Author')['Post'].count()

num_posts_per_author_ordered = num_posts_per_author.reset_index(name='count').sort_values(['count'], ascending=False)
print(f'Posts per author:\n{num_posts_per_author_ordered.head()}')

# Take labels and bar heights from the SAME series: groupby returns its keys sorted, while
# omitb_df['Author'].unique() is in order of appearance, so pairing the two would attribute
# post counts to the wrong authors.
author_x = num_posts_per_author.index.tolist()
num_posts_per_author_y = num_posts_per_author.tolist()

visualiser.generate_bar_chart(author_x, num_posts_per_author_y, mabel, 'Number of posts per author', 'Author', 'Number of posts')

In [None]:
# Comments per date
num_comments_per_date = omitb_df.groupby('Date')['Num_comments'].sum()

num_comments_per_date_ordered = num_comments_per_date.reset_index(name='sum').sort_values(['sum'], ascending=False)
print(f'Comments per date:\n{num_comments_per_date_ordered.head()}')

# Take labels and bar heights from the SAME series: groupby returns its keys sorted, while
# omitb_df['Date'].unique() is in order of appearance, so pairing the two mislabels the bars.
# 'Date' holds dd/mm/YYYY strings (which sort by day-of-month as text), so order the series
# by the parsed date to get a chronological x-axis.
num_comments_per_date_chronological = num_comments_per_date.sort_index(
    key=lambda labels: pd.to_datetime(labels, format='%d/%m/%Y')
)
date_x = num_comments_per_date_chronological.index.tolist()
num_comments_per_date_y = num_comments_per_date_chronological.tolist()

visualiser.generate_bar_chart(date_x, num_comments_per_date_y, mabel, 'Number of comments per date', 'Dates', 'Number of comments')

In [None]:
# Comments vs upvote_ratio
# Both lists are read straight from the frame in row order, so point i pairs
# the upvote ratio and the comment count of the same post.
upvote_ratio_x = omitb_df['Upvote_ratio'].tolist()
num_comments_y = omitb_df['Num_comments'].tolist()

visualiser.generate_scatter_plot(
    upvote_ratio_x,
    num_comments_y,
    charles,
    'Number of comments per upvote ratio',
    'Upvote Ratio',
    'Number of comments',
)


In [None]:
# Upvote ratio vs date
# Average the ratio per day: counting the column only reproduces the posts-per-date chart,
# and the frame stores ratios, not upvote totals. astype(float) guards against the column
# having been left with object dtype.
upvote_ratio_per_date = omitb_df['Upvote_ratio'].astype(float).groupby(omitb_df['Date']).mean()

upvote_ratio_per_date_ordered = upvote_ratio_per_date.reset_index(name='mean').sort_values(['mean'], ascending=False)
print(f'Upvote ratio per date:\n{upvote_ratio_per_date_ordered.head()}')

# Take labels and bar heights from the SAME series: groupby returns its keys sorted, while
# omitb_df['Date'].unique() is in order of appearance, so pairing the two mislabels the bars.
# 'Date' holds dd/mm/YYYY strings (which sort by day-of-month as text), so order the series
# by the parsed date to get a chronological x-axis.
upvote_ratio_per_date_chronological = upvote_ratio_per_date.sort_index(
    key=lambda labels: pd.to_datetime(labels, format='%d/%m/%Y')
)
date_x = upvote_ratio_per_date_chronological.index.tolist()
upvote_ratio_per_date_y = upvote_ratio_per_date_chronological.tolist()

visualiser.generate_bar_chart(date_x, upvote_ratio_per_date_y, oliver, 'Mean upvote ratio per date', 'Dates', 'Mean upvote ratio')

#### Step 2 : : Pre-processing

In [None]:
# Concatenate the per-text token lists into one flat list of raw tokens
flatted_unprocessed_token_list = []
for token_list in unprocessed_token_lists:
    flatted_unprocessed_token_list.extend(token_list)

helper.compute_term_freq(flatted_unprocessed_token_list, True)

In [None]:
# Flat list of every processed token, in the same order as before.
# It is rebuilt from omitb_df (each row holds the post's tokens followed by those of its
# comments) instead of flattening processed_token_lists in place: this cell rebinds that
# very name, so running it a second time used to split every token into characters.
processed_token_lists = [token for post_tokens in omitb_df['Processed_tokens'] for token in post_tokens]

helper.compute_term_freq(processed_token_lists, True, mabel)

In [None]:
# Compare corpus size before and after pre-processing (both lists are flat at this point)
print(f'Number of unprocessed tokens: {len(flatted_unprocessed_token_list)}\nNumber of processed tokens: {len(processed_token_lists)}')

#### Step 3 : : Models

1. n-grams
2. upvotes
3. sentiment analysis 
    - count method
        TODO: <sentiment / dates>
    - vader
        TODO: <sentiment / dates>
4. topic modelling
    - term doc freq
    - LDA
    - params


In [None]:
# TODO: n-grams 

In [None]:
# Sentiment analysis
# 'Count' selects one of the methods implemented in the project-local helper module;
# the result is plotted as a time series in Step 4 (exact return shape: see helper.py)
count_sentiment_list = helper.sentiment_analysis('Count', omitb_df, True)

In [None]:
# Same analysis with the 'Vader' method (presumably NLTK's VADER lexicon, downloaded during set-up — confirm in helper.py)
vader_sentiment_list = helper.sentiment_analysis('Vader', omitb_df, True)

In [None]:
# https://medium.com/bitgrit-data-science-publication/sentiment-analysis-on-reddit-tech-news-with-python-cbaddb8e9bb6

In [None]:
# Topic modelling
# Hyper-parameters shared by the vectoriser, the LDA fit and the topic display
num_features = 1500  # vocabulary cap, passed to CountVectorizer(max_features=...)
num_topic = 10  # number of LDA components
max_word_count_to_display = 15  # handed to visualiser.display_topics

In [None]:
# CountVectorizer expects one string per DOCUMENT. processed_token_lists has been flattened
# into single tokens by this point, so passing it made every token its own one-word document
# and left LDA with no word co-occurrence to learn from. Build one document per post instead,
# from the post's processed tokens (its own text followed by its collected comments).
documents = [' '.join(post_tokens) for post_tokens in omitb_df['Processed_tokens']]

# max_df / min_df are now applied across posts: drop terms found in more than 95% of the
# posts or in fewer than two of them
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

tf_feature_names = tf_vectorizer.get_feature_names_out()

In [None]:
# Fit LDA on the term-count matrix. random_state is pinned so the topics (and every chart
# derived from them) are reproducible between runs; online learning is otherwise stochastic.
lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=10, learning_method='online', random_state=42).fit(tf)

#### Step 4 : : Analysis

1. Present the topics being discussed (e.g. top-K terms, word clouds), as found by **topic modelling**.
2. Identify what the topics are and whether they correspond to recent news, etc.

In [None]:
# Sentiment analysis
visualiser.generate_time_series(count_sentiment_list)

In [None]:
# Plot the 'Vader' method results produced in Step 3
visualiser.generate_time_series(vader_sentiment_list)

In [None]:
# Show the fitted topics using the vectoriser's vocabulary; max_word_count_to_display
# presumably caps the words listed per topic (confirm in visualiser.py)
visualiser.display_topics(lda_model, tf_feature_names, max_word_count_to_display)

In [None]:
# pyLDAvis
panel = pyLDAvis.lda_model.prepare(lda_model, tf, tf_vectorizer, mds='tsne')

pyLDAvis.display(panel)

In [None]:
# wordcloud
visualiser.display_word_cloud(lda_model, tf_feature_names)