#### \#MugLife

#### Step 0 : : Set up

In [None]:
def install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    Raises ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

from client import client
import pre_processing
import utils
import visualiser
import method

from praw.models import MoreComments

import string

import nltk 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
# Lexicon presumably consumed by the 'Vader' sentiment step further down — TODO confirm.
nltk.download('vader_lexicon')
# stopwords.words('english') is called when the stop-word list is built; without
# this corpus NLTK raises LookupError on a fresh environment.
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.lda_model
import networkx as nx

import pandas as pd
from datetime import datetime
import ast

import os
from dotenv import load_dotenv
import subprocess
import sys

# Install python-louvain at runtime via the install() helper defined at the top
# of this cell (presumably needed by the community-detection step — TODO confirm).
install("python-louvain")
# Load variables from a .env file into the process environment; the
# configuration cell below reads them through os.environ.
load_dotenv()

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Shared NLP helpers used by the pre-processing cell.
stemmer = nltk.stem.PorterStemmer()
tokeniser = TweetTokenizer()

# Stop words = English stop words, then every ASCII punctuation character,
# then a few extra noise tokens (retweet markers, ellipses, quotes, dashes).
stop_words = [
    *stopwords.words('english'),
    *string.punctuation,
    'rt', 'via', '...', '…', '"', "'", '`', '-', '..',
]

In [None]:
# Accumulators filled while collecting or loading the data (three distinct lists).
collected_posts, unprocessed_token_lists, processed_token_lists = [], [], []

# Empty frame that receives one row per post.
posts_df = pd.DataFrame(
    columns=[
        'social_media_id',
        'title',
        'utc_date',
        'formatted_date',
        'desc',
        'author',
        'rating',
        'num_comments',
        'unprocessed_tokens',
        'processed_tokens',
    ]
)

In [None]:
# Runtime configuration is read from environment variables; a missing
# variable raises KeyError.
social_media_id = os.environ["SOCIAL-MEDIA-ID"].lower()
collect_data_env = os.environ["COLLECT-DATA"]
data_folder_path = os.environ["DATA-FOLDER-PATH"]

# Only the exact string "True" switches live data collection on.
collect_data = collect_data_env == "True"

#### Step 1 : : Data collection

In [None]:
# Data collection from Reddit
data_sample_filepath = f'{data_folder_path}/data.csv'

# Posts are only fetched when collection is enabled and the configured source
# is Reddit; otherwise `collected_posts` keeps its earlier value.
if collect_data and social_media_id == 'reddit':
    subreddit_names = 'tea+coffee+TeaPorn+pourover'

    reddit_client = client()
    subreddit = reddit_client.subreddit(subreddit_names)
    # Materialise the listing of top posts into a list.
    collected_posts = list(subreddit.top(limit=None))

#### Step 2 : : Pre-processing and Exploration


In [None]:
# Create dataframe containing reddit post details, unprocessed and pre-processed token lists
# This bit extracts the data from reddit and saves it to the data file
if collect_data:
    reply_graph = nx.DiGraph()
    reply_graph_filepath = f'{data_folder_path}/{social_media_id}_reply_graph.graphml'

    # track the ids of post and comments for the reply graph:
    # {post id: {post or comment id: author name}}
    post_comment_ids = dict()

    for post in collected_posts:
        post_id = post.name

        post_title = post.title
        post_description = post.selftext
        post_title_description = post_title + " " + post_description
        # NOTE(review): fromtimestamp() without a tz converts to the machine's local
        # time, so the stored date can differ from the UTC date of `created_utc`.
        post_date = pd.to_datetime(datetime.fromtimestamp(post.created_utc).strftime("%d/%m/%Y"), format="%d/%m/%Y")

        unprocessed_tokens = tokeniser.tokenize(post_title_description)
        unprocessed_token_lists.append(unprocessed_tokens)

        processed_tokens = pre_processing.process(post_title_description, tokeniser, stemmer, stop_words, True)
        processed_token_lists.append(processed_tokens)

        # Posts without an author object are recorded under the placeholder 'None'
        if post.author is None:
            post_author = 'None'
        else:
            post_author = post.author.name

        reply_graph = method.update_reply_graph_node(reply_graph, post_author)
        # Add the post id and the author to list of ids
        post_comment_ids[post_id] = {post_id: post_author}

        post.comments.replace_more(limit=None)
        # NOTE(review): iterating `post.comments` directly presumably yields only
        # top-level comments; nested replies would need `post.comments.list()` —
        # confirm against the PRAW documentation before relying on the reply graph.
        for comment in post.comments:
            if isinstance(comment, MoreComments):
                continue

            # Keep the body when there is one and fall back to '' otherwise. The
            # previous condition was inverted, so every comment's text was dropped.
            comment_text = comment.body if comment.body is not None else ''

            # `+` (not `+=`) builds new lists, so the per-post lists already
            # appended to the global token lists are not mutated.
            unprocessed_comment_tokens = tokeniser.tokenize(comment_text)
            unprocessed_tokens = unprocessed_tokens + unprocessed_comment_tokens
            unprocessed_token_lists.append(unprocessed_comment_tokens)

            processed_comment_tokens = pre_processing.process(comment_text, tokeniser, stemmer, stop_words, False)
            processed_tokens = processed_tokens + processed_comment_tokens
            processed_token_lists.append(processed_comment_tokens)

            # Check if comment author exists
            comment_name = comment.name
            comment_author = comment.author
            if comment_author is not None and comment_author.name != 'ExternalUserError':
                comment_author_name = comment_author.name

                # Link the comment and comment author to the post id
                post_comment_ids[post_id].update({comment_name: comment_author_name})

                # Check whether parent comment is in the ids list
                # If not, then parent comment has been deleted
                comment_parent_id = comment.parent_id
                if comment_parent_id in post_comment_ids[post_id]:
                    reply_graph = method.update_reply_graph_edge(reply_graph, comment_author_name, post_comment_ids,
                                                                 post_id, comment_parent_id)

        # One row per post; the token columns hold the post's tokens plus the
        # tokens of every comment processed above.
        posts_df.loc[len(posts_df.index)] = [social_media_id, post_title, post.created_utc, post_date, post_description,
                                             post_author, post.upvote_ratio, post.num_comments, unprocessed_tokens,
                                             processed_tokens]

    # Save reply graph
    nx.readwrite.write_graphml(reply_graph, reply_graph_filepath)
    # Read old data file if it exists to append new data collected, if not save new file
    old_posts_df = pd.DataFrame(
        columns=['social_media_id', 'title', 'utc_date', 'formatted_date', 'desc', 'author', 'rating', 'num_comments',
                 'unprocessed_tokens', 'processed_tokens'])

    if os.path.isfile(data_sample_filepath):
        old_posts_df = pd.read_csv(data_sample_filepath, header=0)

        posts_df = pd.concat([old_posts_df, posts_df], ignore_index=True)

    posts_df.to_csv(data_sample_filepath, index=False, header=True)

len(posts_df)

In [None]:
# Read data from file
if not collect_data:
    def _parse_token_list(cell):
        # Token columns are stored in the CSV as the text of a Python list.
        return list(ast.literal_eval(cell))

    posts_df = pd.read_csv(data_sample_filepath)

    unprocessed_token_lists = posts_df['unprocessed_tokens'].apply(_parse_token_list)
    posts_df['unprocessed_tokens'] = unprocessed_token_lists

    processed_token_lists = posts_df['processed_tokens'].apply(_parse_token_list)
    posts_df['processed_tokens'] = processed_token_lists

    # Dates are parsed back from their ISO text form.
    posts_df['formatted_date'] = pd.to_datetime(posts_df['formatted_date'], format="%Y-%m-%d")

posts_df

In [None]:
# posts_df holds one row per post, so the row count is the post count.
total_num_posts = posts_df.shape[0]
print(f'Total number of posts: {total_num_posts}')

In [None]:
# Add up the per-post comment counts stored in the `num_comments` column.
total_num_comments = posts_df.num_comments.sum()
print(f'Total number of comments: {total_num_comments}')

In [None]:
# Every post and every comment counts as one data item.
total_data_items = sum((total_num_posts, total_num_comments))
print(f'Total data items: {total_data_items}')

In [None]:
# Distinct sources present in the data; later cells test membership in this array.
df_social_medias = posts_df.social_media_id.unique()
print(f'Social media data was collected from:\n{df_social_medias}')

In [None]:
# Flatten the per-item token lists into one list before handing it to
# visualiser.compute_term_freq.
flatted_unprocessed_token_list = []
for token_list in unprocessed_token_lists:
    flatted_unprocessed_token_list.extend(token_list)

visualiser.compute_term_freq(flatted_unprocessed_token_list, True)

In [None]:
# Flatten the per-item token lists into a single list of tokens.
# NOTE(review): this rebinds `processed_token_lists`, so later cells see the
# flat list, and re-running this cell would split every token into characters.
flat_processed_tokens = []
for token_list in processed_token_lists:
    flat_processed_tokens.extend(token_list)
processed_token_lists = flat_processed_tokens

visualiser.compute_term_freq(processed_token_lists, True, utils.red)

#### Step 3 : : Method

Methods explored:
1. N-grams, explored to gain a preliminary understanding of the sentiments in these subreddits
2. Sentiment analysis via N-grams, Count and Vader techniques 
3. Topic modelling via LDA topic model
4. Ego-graph
5. Reply graph
6. Community detection

In [None]:
# N-grams: the 50 most frequent bigrams and trigrams over the processed tokens
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(processed_token_lists)
top_50_bi_grams = bigram_finder.ngram_fd.most_common(50)

trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(processed_token_lists)
top_50_tri_grams = trigram_finder.ngram_fd.most_common(50)

In [None]:
# Sentiment analysis
# Run method.sentiment_analysis on the posts with the 'Count' technique; the
# result is plotted as a time series in the analysis section.
count_sentiment_list = method.sentiment_analysis('Count', posts_df)

In [None]:
# Same call with the 'Vader' technique (presumably backed by the vader_lexicon
# downloaded in the set-up cell — TODO confirm); plotted in the analysis section.
vader_sentiment_list = method.sentiment_analysis('Vader', posts_df)

In [None]:
# Topic modelling
num_topic = 10
num_features = 1500

# NOTE(review): `processed_token_lists` is the flattened token list at this
# point, so each "document" given to the vectoriser is a single token — confirm
# this is intended rather than one document per post.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=num_features,
    max_df=0.95,
    min_df=2,
)
tf = tf_vectorizer.fit_transform(processed_token_lists)
tf_feature_names = tf_vectorizer.get_feature_names_out()

# fit() returns the estimator itself, so the model can be built and then fitted.
lda_model = LatentDirichletAllocation(
    n_components=num_topic,
    max_iter=10,
    learning_method='online',
)
lda_model.fit(tf)
    

In [None]:
# Graphs and networks

# Egonet
# get the top author/s from the posts
posts_df_by_rating = posts_df.sort_values(['rating', 'num_comments'], ascending=[False, False])
# Drop posts without a usable author: the 'None' placeholder written at
# collection time, and NaN, which is how recent pandas versions read the text
# "None" back from the CSV (it is one of read_csv's default NA markers).
has_author = posts_df_by_rating['author'].notna() & (posts_df_by_rating['author'] != 'None')
posts_df_by_rating_filtered = posts_df_by_rating[has_author]
subset_top_rated_authors_df = posts_df_by_rating_filtered.head(1)

print('------------Ego graph exploration------------\n')
ego_graph_list = []
for row in subset_top_rated_authors_df.itertuples():
    author_name = row.author
    row_social_media_id = row.social_media_id

    # Report the source recorded on the row, not the globally configured one.
    print(utils.yellow_rgb + f'Social media id: {row_social_media_id}\n', end='')
    print(utils.yellow_rgb + f'Author name: {author_name}\nAuthor rating: {row.rating}\nAuthor comments: {row.num_comments}\n', end='')

    if row_social_media_id == 'reddit':
        # The collection cell only builds `reddit_client` when collecting from
        # Reddit; in every other configuration it has to be created here.
        if not collect_data or social_media_id != 'reddit':
            reddit_client = client()
        ego = reddit_client.redditor(author_name)
        ego_name = ego.name
        ego_graph = method.construct_ego_graph(reddit_client, ego, ego_name)
        ego_graph_list.append({'ego_graph': ego_graph, 'ego_name': ego_name})

        # Note: print_ego_graph does not depend on the social media used
        utils.print_ego_graph(data_folder_path, ego_graph, ego_name)

subset_top_rated_authors_df

In [None]:
# Reply-graph statistics are only computed when Reddit data is present.
if 'reddit' in df_social_medias:
    reddit_reply_graph_filepath = f'{data_folder_path}/reddit_reply_graph.graphml'
    # Load the graph saved by the collection cell.
    reddit_reply_graph = nx.read_graphml(reddit_reply_graph_filepath)

    # Reply graph
    print('\n------------Reply graph exploration------------\n')
    method.compute_reply_graph_stats(reddit_reply_graph, data_folder_path, 'reddit')

In [None]:
# Community statistics are only computed when Reddit data is present.
if 'reddit' in df_social_medias:
    reddit_reply_graph_filepath = f'{data_folder_path}/reddit_reply_graph.graphml'
    # Reload the saved reply graph so this cell can run on its own.
    reddit_reply_graph = nx.read_graphml(reddit_reply_graph_filepath)

    # Create community
    print('\n------------Community graph exploration------------\n')
    method.compute_community_stats(reddit_reply_graph, data_folder_path, 'reddit')

#### Step 4 : : Analysis

Questions to explore:
1. Which is the superior beverage?
2. What are the most talked-about topics?
3. Which parts of the world favour which beverage? What are their feelings and opinions?
4. Since we're in Melbourne, maybe a special look into Melbourne?
5. Do sales and deals, limited-time events, world tea/coffee days, or the change of season cause spikes in engagement? -- Events and correlations
6. Origin of tea/ coffee
7. Benefits people get from tea/ coffee

In [None]:
# n-grams
# Cell output: the 50 most common bigrams computed in the Method section.
top_50_bi_grams

In [None]:
# Cell output: the 50 most common trigrams computed in the Method section.
top_50_tri_grams

In [None]:
# Posts per date
# Count the titles in each `formatted_date` group, i.e. posts per calendar date.
posts_grouped_by_date = posts_df.groupby('formatted_date')
num_posts_per_date = posts_grouped_by_date['title'].count()
visualiser.display_time_series_stats(
    num_posts_per_date,
    'count',
    'Number of posts per date',
    'Dates',
    'Number of posts',
    utils.red,
)

In [None]:
# Posts per author
num_posts_per_author = posts_df.groupby('author')['title'].count()

# Turn the counts into a two-column frame, most prolific authors first.
num_posts_per_author_ordered = (
    num_posts_per_author
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
)
print(f'Posts per author:\n{num_posts_per_author_ordered.head()}')

# Only authors with more than 5 posts are charted, and the 'None' placeholder
# author is left out.
has_many_posts = num_posts_per_author_ordered['count'] > 5
is_named_author = num_posts_per_author_ordered['author'] != 'None'
filtered_df = num_posts_per_author_ordered[has_many_posts & is_named_author]

author_x = filtered_df['author']
num_posts_per_author_y = filtered_df['count']
visualiser.generate_bar_chart(
    author_x,
    num_posts_per_author_y,
    utils.red,
    'Number of posts per author',
    'Author',
    'Number of posts',
)

In [None]:
# Sentiment analysis
# Count: plot the count-based sentiment computed in the Method section
visualiser.generate_time_series(
    count_sentiment_list,
    'Sentiment based on count',
    'date',
    'sentiment',
    'Date',
    'Count sentiment',
    utils.green,
)

In [None]:
# Vader: plot the Vader-based sentiment computed in the Method section
# (the label passed after 'Date' previously read 'Veder sentiment').
visualiser.generate_time_series(vader_sentiment_list, 'Sentiment based on vader', 'date', 'sentiment', 'Date', 'Vader sentiment', utils.green)

In [None]:
# Topic modelling
# Show the fitted LDA model's topics using the vectoriser's feature names;
# the third argument presumably caps the number of words shown per topic.
max_word_count_to_display = 15
visualiser.display_topics(lda_model, tf_feature_names, max_word_count_to_display)

In [None]:
# pyLDAvis
# Switch on notebook rendering, then build the panel from the fitted model,
# the document-term matrix and the vectoriser.
pyLDAvis.enable_notebook()
panel = pyLDAvis.lda_model.prepare(
    lda_model,
    tf,
    tf_vectorizer,
    mds='tsne',
)
pyLDAvis.display(panel)

In [None]:
# wordcloud
# Pass the fitted LDA model and the vectoriser's feature names to the
# visualiser's word-cloud helper.
visualiser.display_word_cloud(lda_model, tf_feature_names)

In [None]:
# Display the ego graphs for the top users

# Each entry is a dict with the keys 'ego_graph' and 'ego_name'.
for ego_entry in ego_graph_list:
    ego_name = ego_entry.get('ego_name')
    ego_graph = ego_entry.get('ego_graph')
    print(f'Ego name: {ego_name}\n\n')
    visualiser.display_networkx_graph(ego_graph, f'Ego graph for {ego_name}')

In [None]:
# Display reply graph

if 'reddit' in df_social_medias:
    # This branch is about the Reddit data, so load the Reddit graph file, as the
    # reply-graph and community cells do, instead of deriving the file name from
    # the configured `social_media_id`, which may name a different source.
    reply_graph_filepath = f'{data_folder_path}/reddit_reply_graph.graphml'
    reply_graph = nx.readwrite.read_graphml(reply_graph_filepath)
    visualiser.display_networkx_graph(reply_graph, 'Reddit reply graph')
    