### Inside the heads of Putnuts
#### An analysis of the Only Murders in the Building TV show subreddit

Milindi Kodikara

#### Step 0 : : Set up

In [None]:
from client import client
import helper
import charts


import praw
import string
import json
import codecs
import re

import nltk 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

from collections import Counter

# misc
from datetime import datetime
from pprint import pprint
from itertools import chain
import pandas as pd
from wordcloud import WordCloud

import pyLDAvis
import pyLDAvis.lda_model

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Name of the subreddit to analyse (passed to reddit_client.subreddit).
subreddit_name = 'OnlyMurdersHulu'
# Maximum number of hot submissions to fetch (the `limit` argument of
# subreddit.hot); kept small while developing.
# TODO: Reset to 100 for analysis
limit = 5

In [None]:
# Build the Reddit API client through the project-local `client` helper.
# NOTE(review): presumably returns an authenticated praw.Reddit instance — confirm in client.py
reddit_client = client()

In [None]:
# Sanity check: print the account the client reports as the current user.
print('Username :: ', reddit_client.user.me())

In [None]:
# Handle to the target subreddit; all listings below are read through it.
subreddit = reddit_client.subreddit(subreddit_name)

print('Subreddit :: ', subreddit)

In [None]:
# Materialise the "hot" listing (at most `limit` submissions) into a list
# so it can be counted and reused.
posts = list(subreddit.hot(limit=limit))

len(posts)

In [None]:
# Hex colour codes passed as the colour argument to the chart and
# term-frequency helpers in later cells.
oliver = '#275c4d'
mabel = '#af221d'
charles = '#c59103'

In [None]:
# TODO: Ask why use Tweet Tokenizer, why not word_tokenizer
tokeniser = TweetTokenizer()

# Stop list = NLTK English stopwords + every ASCII punctuation character
# + a handful of extra noise tokens (retweet markers, ellipses, quotes, dash).
extra_stop_tokens = ['rt', 'via', '...', '…', '"', "'", '`', '-']
stop_words = [*stopwords.words('english'), *string.punctuation, *extra_stop_tokens]

In [None]:
# Collect the corpus and the per-post summary table.
#
# Outputs:
#   unprocessed_token_lists / processed_token_lists
#       one token list per text: each post's "title + body", followed by one
#       list per comment iterated below.
#   omitb_df
#       one row per post; the two token columns hold the post's own tokens
#       followed by the tokens of every comment iterated below.
#
# `helper.process(text, tokeniser, stop_words)` is a project helper.
# NOTE(review): presumably returns the cleaned token list for `text` — confirm in helper.py
omitb_columns = ['Post', 'Num_comments', 'Author', 'UTC_Date', 'Date', 'Upvote_ratio', 'Unprocessed_tokens', 'Processed_tokens']

unprocessed_token_lists = []
processed_token_lists = []
post_rows = []

# Reuse the listing fetched into `posts` rather than querying "hot" a second
# time, so this table always describes exactly the posts counted earlier.
for submission in posts:

    post_title_description = submission.title + " " + submission.selftext

    utc_date = submission.created_utc
    # NOTE(review): fromtimestamp() converts to the machine's local timezone
    # and "%x" is locale-dependent — confirm this is the intended day bucket.
    post_date = datetime.fromtimestamp(utc_date).strftime("%x")

    post_unprocessed_tokens = tokeniser.tokenize(post_title_description)
    post_processed_tokens = helper.process(post_title_description, tokeniser, stop_words)
    unprocessed_token_lists.append(post_unprocessed_tokens)
    processed_token_lists.append(post_processed_tokens)

    # Per-post accumulators start as *copies* so that extending them with
    # comment tokens does not mutate the lists already stored in the corpus.
    unprocessed_tokens = list(post_unprocessed_tokens)
    processed_tokens = list(post_processed_tokens)

    # Resolve every "load more comments" placeholder before iterating.
    submission.comments.replace_more(limit=None)
    # NOTE(review): iterating `submission.comments` visits top-level comments
    # only; `submission.comments.list()` would include replies — confirm intent.
    for comment in submission.comments:
        comment_text = comment.body

        unprocessed_comment_tokens = tokeniser.tokenize(comment_text)
        unprocessed_tokens.extend(unprocessed_comment_tokens)
        unprocessed_token_lists.append(unprocessed_comment_tokens)

        processed_comment_tokens = helper.process(comment_text, tokeniser, stop_words)
        processed_tokens.extend(processed_comment_tokens)
        processed_token_lists.append(processed_comment_tokens)

    # `author` is None when the posting account has been deleted; fall back to
    # a placeholder instead of raising AttributeError on `.name`.
    author_name = submission.author.name if submission.author is not None else '[deleted]'

    post_rows.append([post_title_description, submission.num_comments, author_name, utc_date, post_date, submission.upvote_ratio, unprocessed_tokens, processed_tokens])

# Build the frame once from the collected rows (index is 0..n-1, as before).
omitb_df = pd.DataFrame(post_rows, columns=omitb_columns)

In [None]:
# Pick the row labelled 0 (the first post collected) as a one-row DataFrame
# for a quick look at the data.
post = omitb_df.loc[[0]]

post

In [None]:
# Print a short summary card for the sample post.
summary_template = 'Post: {}\n---------\nAuthor: {}\n---------\nUpvote ratio: {}\n---------\nCreated date: {}\n---------'
summary_fields = (
    post['Post'][0],
    post['Author'][0],
    post['Upvote_ratio'][0],
    post['Date'][0],
)
print(summary_template.format(*summary_fields))

In [None]:
# Raw "title + body" text of the sample post.
post['Post'][0]

In [None]:
# Processed tokens stored for the sample post (post text followed by the
# comments gathered for it).
post['Processed_tokens'][0]

#### Step 1 : : Exploration

In [None]:
# Posts per date
# Labels and counts are read from the same grouped result so every bar is
# paired with its own date (groupby sorts its keys by default, whereas
# unique() keeps first-appearance order, so mixing the two can misalign them).
# sort=False keeps the dates in order of first appearance.
posts_per_date = omitb_df.groupby('Date', sort=False)['Post'].count()
dates_x = posts_per_date.index.tolist()
num_posts_per_date_y = posts_per_date.tolist()

charts.generate_bar_chart(dates_x, num_posts_per_date_y, charles, 'Number of posts per date', 'Dates', 'Number of posts')

In [None]:
# Posts per author
# Labels and counts come from one grouped result so each bar belongs to the
# author shown under it; sort=False keeps authors in first-appearance order.
posts_per_author = omitb_df.groupby('Author', sort=False)['Post'].count()
author_x = posts_per_author.index.tolist()
num_posts_per_author_y = posts_per_author.tolist()

# The y axis shows post counts, so it is labelled as posts (not comments).
charts.generate_bar_chart(author_x, num_posts_per_author_y, mabel, 'Number of posts per author', 'Author', 'Number of posts')

In [None]:
# Comments per date
# Sum the per-post comment counts for each date; labels and totals are taken
# from the same grouped result so they cannot fall out of step.
# sort=False keeps the dates in order of first appearance.
comments_per_date = omitb_df.groupby('Date', sort=False)['Num_comments'].sum()
date_x = comments_per_date.index.tolist()
num_comments_per_date_y = comments_per_date.tolist()

charts.generate_bar_chart(date_x, num_comments_per_date_y, mabel, 'Number of comments per date', 'Dates', 'Number of comments')

In [None]:
# Comments vs upvote_ratio: one scatter point per post.
upvote_ratio_x, num_comments_y = (
    omitb_df[column].tolist() for column in ('Upvote_ratio', 'Num_comments')
)

charts.generate_scatter_plot(
    upvote_ratio_x,
    num_comments_y,
    charles,
    'Number of comments per upvote ratio',
    'Upvote Ratio',
    'Number of comments',
)



In [None]:
# Upvote ratio vs date
# Average the upvote ratio of the posts made on each date. (Counting the
# ratios would simply reproduce the posts-per-date chart, and a ratio is not
# a number of upvotes, so the labels describe the mean ratio.)
# Labels and values come from the same grouped result so they stay aligned;
# sort=False keeps the dates in order of first appearance.
upvote_ratio_per_date = (
    omitb_df['Upvote_ratio']
    .astype(float)
    .groupby(omitb_df['Date'], sort=False)
    .mean()
)
date_x = upvote_ratio_per_date.index.tolist()
upvote_ratio_per_date_y = upvote_ratio_per_date.tolist()

charts.generate_bar_chart(date_x, upvote_ratio_per_date_y, oliver, 'Average upvote ratio per date', 'Dates', 'Average upvote ratio')

#### Step 2 : : Pre-processing

In [None]:
# Concatenate every per-text token list into one flat list of raw tokens,
# then hand it to the term-frequency helper.
flatted_unprocessed_token_list = list(chain.from_iterable(unprocessed_token_lists))

helper.compute_term_freq(flatted_unprocessed_token_list, True)

In [None]:
# Flatten the processed token lists into one flat list of tokens.
# NOTE: this rebinds `processed_token_lists` itself, so re-running this cell
# flattens the tokens a second time (into single characters).
processed_token_lists = list(chain.from_iterable(processed_token_lists))

helper.compute_term_freq(processed_token_lists, True, mabel)

#### Step 3 : : Models

1. n-grams
2. upvotes
3. sentiment analysis 
    - count method
        TODO: <sentiment / dates>
    - vader
        TODO: <sentiment / dates>
4. topic modelling
    - term doc freq
    - TDA
    - params


In [None]:
# TODO: n-grams 

In [None]:
# Sentiment analysis over the per-post table using the helper's 'Count' mode.
# The result is plotted later with charts.generate_time_series.
# NOTE(review): meaning of the trailing True flag is not visible here — confirm in helper.sentiment_analysis
count_sentiment_list = helper.sentiment_analysis('Count', omitb_df, True)

In [None]:
# Same analysis using the helper's 'Vader' mode; plotted later with
# charts.generate_time_series.
vader_sentiment_list = helper.sentiment_analysis('Vader', omitb_df, True)

In [None]:
# https://medium.com/bitgrit-data-science-publication/sentiment-analysis-on-reddit-tech-news-with-python-cbaddb8e9bb6

In [None]:
# Topic modelling
# Number of topics (n_components of the LDA model).
num_topic = 10
# Word-count limit passed to helper.display_topics.
max_word_count_to_display = 15
# Vocabulary size cap (max_features of the CountVectorizer).
num_features = 1500

In [None]:
# Build the term-document count matrix for LDA.
# CountVectorizer treats every item it is given as one document, so it must
# receive one string per document. Feeding it the flattened token list would
# turn every single token into its own one-word document, leaving LDA with no
# word co-occurrence to learn from. Each post (its text plus the comments
# gathered for it) is therefore joined back into a single document string.
documents = [' '.join(tokens) for tokens in omitb_df['Processed_tokens']]

# Drop terms that appear in more than 95% of documents or in fewer than 2,
# and cap the vocabulary at `num_features` terms.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

# Vocabulary terms, in the column order of `tf`.
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [None]:
# Fit an online-learning LDA model with `num_topic` topics on the count matrix.
lda_model = LatentDirichletAllocation(
    n_components=num_topic,
    max_iter=10,
    learning_method='online',
)
lda_model.fit(tf)

#### Step 4 : : Analysis

1. Present the topics being discussed, as found by **topic modelling** (e.g. top-K terms, word clouds).
2. Interpret the topics: what are they, and do they correspond to recent news about the show?

In [None]:
# Sentiment analysis: plot the 'Count' mode results as a time series.
charts.generate_time_series(count_sentiment_list)

In [None]:
# Plot the 'Vader' mode results as a time series.
charts.generate_time_series(vader_sentiment_list)

In [None]:
# Show the fitted topics via the project helper.
# NOTE(review): presumably lists the top `max_word_count_to_display` terms per topic — confirm in helper.display_topics
helper.display_topics(lda_model, tf_feature_names, max_word_count_to_display)

In [None]:
# pyLDAvis: prepare the interactive topic visualisation from the fitted
# model, the count matrix and the vectoriser (t-SNE for the topic layout),
# then render it inline.
panel = pyLDAvis.lda_model.prepare(lda_model, tf, tf_vectorizer, mds='tsne')

pyLDAvis.display(panel)

In [None]:
# Word cloud view of the fitted topics, drawn by the project helper.
# NOTE(review): presumably one cloud per topic — confirm in helper.display_word_cloud
helper.display_word_cloud(lda_model, tf_feature_names)
