#### \#MugLife

#### Step 0 : : Set up

In [None]:
def install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    Runs ``<sys.executable> -m pip install <package>``; ``subprocess.check_call``
    raises ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

from client import client
import utils
import visualiser
import method
import pre_processing

import nltk 
nltk.download('vader_lexicon')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.lda_model
import networkx as nx

import pandas as pd
from datetime import datetime
import ast
import plotly.express as px

import os
from dotenv import load_dotenv
import subprocess
import sys

# Install python-louvain through pip (see install() at the top of this cell).
# NOTE(review): presumably required by the community detection step in `method` - confirm.
install("python-louvain")
# Load variables from a .env file into the process environment; the
# os.environ lookups in a later cell read their settings from there.
load_dotenv()

%matplotlib inline

In [None]:
# Timestamp printed when the notebook run begins
process_started_at = datetime.now()
print(f'Process start: {process_started_at}')

In [None]:
# All posts fetched from the social media API (filled by the data collection step)
collected_posts = []

# Beverage labels; they are compared against the 'post_type' column of posts_df
tea = 'tea'
coffee = 'coffee'
num_beverages = 2

# Per-beverage token lists, before and after pre-processing
tea_unprocessed_token_lists, tea_processed_token_lists = [], []
coffee_unprocessed_token_lists, coffee_processed_token_lists = [], []

# Empty dataframe with one column per post attribute
posts_df_columns = [
    'social_media_id',
    'post_type',
    'title',
    'utc_date',
    'formatted_date',
    'desc',
    'author',
    'rating',
    'num_comments',
    'unprocessed_tokens',
    'processed_tokens',
]
posts_df = pd.DataFrame(columns=posts_df_columns)

In [None]:
# Runtime configuration, read from environment variables.
# A missing variable raises KeyError.
social_media_id = os.environ["SOCIAL-MEDIA-ID"].lower()

collect_data_env = os.environ["COLLECT-DATA"]
data_limit = os.environ["DATA-LIMIT"]

# The literal string 'None' means "no limit"; any other value must parse as an int
data_collection_limit = int(data_limit) if data_limit != 'None' else None
data_folder_path = os.environ["DATA-FOLDER-PATH"]

# Only the exact string "True" enables live data collection
collect_data = collect_data_env == "True"

#### Step 1 : : Data collection

In [None]:
# Data collection from Reddit
# CSV file of post data; a later cell reads it when live collection is disabled
data_sample_filepath = f'{data_folder_path}/data.csv'

# Posts are fetched only when collection is enabled and the source is reddit
if collect_data and social_media_id == 'reddit':
    subreddit_names = 'tea+coffee+TeaPorn+pourover'

    reddit_client = client()
    subreddit = reddit_client.subreddit(subreddit_names)
    # data_collection_limit may be None, which is passed through unchanged
    top_posts = subreddit.top(limit=data_collection_limit)
    collected_posts = list(top_posts)

#### Step 2 : : Pre-processing and Exploration


In [None]:
# Build the dataframe of reddit post details together with the unprocessed
# and pre-processed token lists of each beverage.
# NOTE(review): the original comment says this step also saves the data to the
# data file; that write would happen inside pre_processing - confirm there.
if collect_data and social_media_id == 'reddit':
    collection_result = pre_processing.reddit_data_collection(
        data_folder_path, collected_posts, data_sample_filepath)
    (tea_unprocessed_token_lists,
     coffee_unprocessed_token_lists,
     tea_processed_token_lists,
     coffee_processed_token_lists,
     posts_df) = collection_result

len(posts_df)

In [None]:
# Read data from file
# Used when live collection is disabled: the posts come from the CSV file and
# the token columns, stored there as strings, are parsed back into lists.
if not collect_data:
    posts_df = pd.read_csv(data_sample_filepath)

    def _parse_token_list(cell):
        """Turn the string form of a token list stored in the CSV into a list."""
        return list(ast.literal_eval(cell))

    def _restore_token_column(beverage, column):
        """Parse ``column`` for the rows of ``beverage``, store the lists in posts_df and return them."""
        beverage_rows = posts_df['post_type'] == beverage
        parsed = posts_df.loc[beverage_rows, column].apply(_parse_token_list)
        # Assign through .loc on posts_df itself: `posts_df[mask][column] = ...`
        # is chained assignment, which writes to a temporary copy and leaves
        # posts_df holding the unparsed strings.
        posts_df.loc[beverage_rows, column] = parsed
        return list(parsed)

    tea_unprocessed_token_lists = _restore_token_column(tea, 'unprocessed_tokens')
    coffee_unprocessed_token_lists = _restore_token_column(coffee, 'unprocessed_tokens')
    tea_processed_token_lists = _restore_token_column(tea, 'processed_tokens')
    coffee_processed_token_lists = _restore_token_column(coffee, 'processed_tokens')

    posts_df['formatted_date'] = pd.to_datetime(posts_df['formatted_date'], format="%Y-%m-%d")

posts_df

In [None]:
# Number of rows in posts_df, reported as the number of posts
total_num_posts = len(posts_df)
print(f'Total number of posts: {total_num_posts}')

In [None]:
# Sum of the 'num_comments' column over all posts
total_num_comments = posts_df['num_comments'].sum()
print(f'Total number of comments: {total_num_comments}')

In [None]:
# Posts and comments combined
total_data_items = total_num_posts + total_num_comments
print(f'Total data items: {total_data_items}')

In [None]:
# Rows of posts_df whose post_type is tea
is_tea_post = posts_df['post_type'] == tea
tea_df = posts_df[is_tea_post]
tea_df_count = len(tea_df)

print(f'Total tea posts: {tea_df_count}')

tea_df

In [None]:
# Rows of posts_df whose post_type is coffee
is_coffee_post = posts_df['post_type'] == coffee
coffee_df = posts_df[is_coffee_post]
coffee_df_count = len(coffee_df)

print(f'Total coffee posts: {coffee_df_count}')

coffee_df

In [None]:
# Distinct social_media_id values present in the data; later cells test
# `'reddit' in df_social_medias` before loading the reddit graphs
df_social_medias = posts_df['social_media_id'].unique()
print(f'Social media data was collected from:\n{df_social_medias}')

In [None]:
# Display-only cell: unprocessed token lists of the tea posts
tea_unprocessed_token_lists

In [None]:
# Display-only cell: pre-processed token lists of the tea posts
tea_processed_token_lists

In [None]:
# Display-only cell: unprocessed token lists of the coffee posts
coffee_unprocessed_token_lists

In [None]:
# Display-only cell: pre-processed token lists of the coffee posts
coffee_processed_token_lists

In [None]:
# Flatten the per-post unprocessed token lists into one token list per beverage
tea_flatted_unprocessed_token_list = []
for post_tokens in tea_unprocessed_token_lists:
    tea_flatted_unprocessed_token_list.extend(post_tokens)

coffee_flatted_unprocessed_token_list = []
for post_tokens in coffee_unprocessed_token_lists:
    coffee_flatted_unprocessed_token_list.extend(post_tokens)

# Term frequencies of the raw tokens; the coffee call also passes utils.red
visualiser.compute_term_freq(tea, tea_flatted_unprocessed_token_list, True)
visualiser.compute_term_freq(coffee, coffee_flatted_unprocessed_token_list, True, utils.red)

In [None]:
def _flatten_token_lists(token_lists):
    """Return one flat list of tokens built from per-post token lists.

    Entries that are already plain strings are kept whole, so calling this
    on an already-flattened list returns an equal list instead of splitting
    every token into single characters.
    """
    flat_tokens = []
    for entry in token_lists:
        if isinstance(entry, str):
            flat_tokens.append(entry)
        else:
            flat_tokens.extend(entry)
    return flat_tokens


# The *_processed_token_lists names are rebound to the flattened lists because
# later cells read the flat versions under these names. Since this cell
# overwrites its own input, the flattening must be safe to run more than once.
tea_processed_token_lists = _flatten_token_lists(tea_processed_token_lists)
coffee_processed_token_lists = _flatten_token_lists(coffee_processed_token_lists)

# Term frequencies of the pre-processed tokens; the coffee call also passes utils.red
visualiser.compute_term_freq(tea, tea_processed_token_lists, True)
visualiser.compute_term_freq(coffee, coffee_processed_token_lists, True, utils.red)

#### Step 3 : : Method

Methods explored:
1. N-grams were explored to gain a preliminary understanding of the sentiments in these subreddits
2. Sentiment analysis via N-grams, Count and Vader techniques 
3. Topic modelling via LDA topic model
4. Ego-graph
5. Reply graph
6. Community detection

In [None]:
# N-grams

# Tea: the 50 most frequent bigrams and trigrams of the processed tokens
tea_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tea_processed_token_lists)
tea_top_50_bi_grams = tea_bigram_finder.ngram_fd.most_common(50)

tea_trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tea_processed_token_lists)
tea_top_50_tri_grams = tea_trigram_finder.ngram_fd.most_common(50)

In [None]:
# Coffee: the 50 most frequent bigrams and trigrams of the processed tokens
coffee_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(coffee_processed_token_lists)
coffee_top_50_bi_grams = coffee_bigram_finder.ngram_fd.most_common(50)

coffee_trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(coffee_processed_token_lists)
coffee_top_50_tri_grams = coffee_trigram_finder.ngram_fd.most_common(50)

In [None]:
# Sentiment analysis
# Each call runs method.sentiment_analysis with a technique name ('Count' or
# 'Vader') on one beverage dataframe; the results are plotted as time series
# in the analysis step.
# NOTE(review): the shape of the returned value is defined in `method` - confirm there.
tea_count_sentiment_list = method.sentiment_analysis('Count', tea_df)

In [None]:
# 'Count' technique on the coffee posts
coffee_count_sentiment_list = method.sentiment_analysis('Count', coffee_df)

In [None]:
# 'Vader' technique on the tea posts
tea_vader_sentiment_list = method.sentiment_analysis('Vader', tea_df)

In [None]:
# 'Vader' technique on the coffee posts
coffee_vader_sentiment_list = method.sentiment_analysis('Vader', coffee_df)

In [None]:
# Topic modelling
# num_topic is passed as n_components to LatentDirichletAllocation
num_topic = 10
# num_features is passed as max_features to CountVectorizer
num_features = 1500

In [None]:
# Term counts for the tea tokens.
# NOTE(review): at this point tea_processed_token_lists has been rebound to a
# flat list of tokens by an earlier cell, so every token is vectorised as its
# own document - confirm this is the intended input for the topic model.
tea_tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=num_features,
    stop_words='english',
)
tea_tf = tea_tf_vectorizer.fit_transform(tea_processed_token_lists)
tea_tf_feature_names = tea_tf_vectorizer.get_feature_names_out()

# LDA topic model fitted on the tea count matrix
tea_lda_model = LatentDirichletAllocation(
    n_components=num_topic,
    max_iter=10,
    learning_method='online',
)
tea_lda_model.fit(tea_tf)

In [None]:
# Term counts for the coffee tokens.
# NOTE(review): at this point coffee_processed_token_lists has been rebound to
# a flat list of tokens by an earlier cell, so every token is vectorised as its
# own document - confirm this is the intended input for the topic model.
coffee_tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=num_features,
    stop_words='english',
)
coffee_tf = coffee_tf_vectorizer.fit_transform(coffee_processed_token_lists)
coffee_tf_feature_names = coffee_tf_vectorizer.get_feature_names_out()

# LDA topic model fitted on the coffee count matrix
coffee_lda_model = LatentDirichletAllocation(
    n_components=num_topic,
    max_iter=10,
    learning_method='online',
)
coffee_lda_model.fit(coffee_tf)

In [None]:
# Graphs and networks

# Egonet
# For each beverage, take the top post by rating (ties broken by number of
# comments) whose author is known, then load or build that author's ego graph.
ego_graph_list = []
for beverage_type_index in range(num_beverages):
    # Index 1 selects coffee; every other index selects tea
    if beverage_type_index == 1:
        beverage_type, beverage_df = coffee, coffee_df
    else:
        beverage_type, beverage_df = tea, tea_df

    beverage_df_by_rating = beverage_df.sort_values(['rating', 'num_comments'], ascending=[False, False])
    # Posts whose author is recorded as the string 'None' are skipped
    beverage_df_by_rating_filtered = beverage_df_by_rating[beverage_df_by_rating['author'] != 'None']
    subset_top_rated_authors_df = beverage_df_by_rating_filtered.head(1)

    print(f'------------Ego graph exploration for {beverage_type}------------\n')
    for row in subset_top_rated_authors_df.itertuples():
        author_name = row.author
        row_social_media_id = row.social_media_id

        # Report the platform recorded on this row (the value the branch below
        # tests), not the globally configured social_media_id
        print(utils.yellow_rgb + f'Social media id: {row_social_media_id}\n', end='')
        print(utils.yellow_rgb + f'Author name: {author_name}\nAuthor rating: {row.rating}\nAuthor comments: {row.num_comments}\n', end='')

        # if ego graph exists load from file else, create the graph
        ego_graph_filepath = f'{data_folder_path}/{beverage_type}_ego_{author_name}.graphml'
        if row_social_media_id == 'reddit':
            # When data was not collected live, no client exists yet, so create one here
            if not collect_data:
                reddit_client = client()
            ego = reddit_client.redditor(author_name)
            ego_name = ego.name
            if os.path.isfile(ego_graph_filepath):
                ego_graph = nx.readwrite.read_graphml(ego_graph_filepath)
            else:
                ego_graph = method.construct_ego_graph(reddit_client, ego, ego_name, ego_graph_filepath)
            ego_graph_list.append({'ego_graph': ego_graph, 'ego_name': ego_name})

            # Note: print_ego_graph does not depend on the social media used
            utils.print_ego_graph_stats(ego_graph, ego_name)

In [None]:
# Centrality of reply graphs
if 'reddit' in df_social_medias:
    # Both reply graphs are read from graphml files in the data folder
    read_graphml = nx.readwrite.read_graphml

    tea_reddit_reply_graph_filepath = f'{data_folder_path}/reddit_tea_reply_graph.graphml'
    coffee_reddit_reply_graph_filepath = f'{data_folder_path}/reddit_coffee_reply_graph.graphml'

    tea_reddit_reply_graph = read_graphml(tea_reddit_reply_graph_filepath)
    coffee_reddit_reply_graph = read_graphml(coffee_reddit_reply_graph_filepath)

    # Reply graph
    print('\n------------Reply graph exploration------------\n')
    reply_graph_runs = (
        (tea_reddit_reply_graph, tea, utils.green),
        (coffee_reddit_reply_graph, coffee, utils.red),
    )
    for reply_graph, beverage_label, colour in reply_graph_runs:
        method.compute_reply_graph_stats(reply_graph, data_folder_path, 'reddit', beverage_label, colour)


In [None]:
# Community detection
if 'reddit' in df_social_medias:
    # The reply graphs are re-read from file so this cell does not depend on
    # the reply graph cell having been run
    load_graph = nx.readwrite.read_graphml

    tea_reddit_reply_graph_filepath = f'{data_folder_path}/reddit_tea_reply_graph.graphml'
    coffee_reddit_reply_graph_filepath = f'{data_folder_path}/reddit_coffee_reply_graph.graphml'

    tea_reddit_reply_graph = load_graph(tea_reddit_reply_graph_filepath)
    coffee_reddit_reply_graph = load_graph(coffee_reddit_reply_graph_filepath)

    # Create community
    print('\n------------Community graph exploration------------\n')
    community_runs = (
        (tea_reddit_reply_graph, tea),
        (coffee_reddit_reply_graph, coffee),
    )
    for community_graph, beverage_label in community_runs:
        method.compute_community_stats(community_graph, data_folder_path, 'reddit', beverage_label)


In [None]:
# Linear threshold model for influence modelling
# NOTE(review): seed_num is not referenced anywhere else in the visible notebook - confirm it is still needed.
seed_num = 3
# list_of_seeds and trial_num are passed to visualiser.display_tree_graph and
# visualiser.display_linear_threshold_stats in the graph display loop
list_of_seeds = [0,1]
trial_num = 10

In [None]:
# Explore reddit users' communities
# reddit_client must already exist here: it is only created by the data
# collection cell or by the ego graph cell, so one of them has to run first.
tea_authors_df = utils.get_author_df(tea, tea_df, reddit_client)

tea_authors_df

In [None]:
# Same lookup for the authors of the coffee posts
coffee_authors_df = utils.get_author_df(coffee, coffee_df, reddit_client)

coffee_authors_df

#### Step 4 : : Analysis

Questions to explore:
1. Which is the superior beverage?
2. What are the most talked-about topics?
3. Which parts of the world favour which beverage? What are their feelings and opinions?
4. Since we're in Melbourne, maybe a special look into Melbourne?
5. Spike in engagement of people with sales and deals; limited time events, world tea/coffee days, variation of engagement with change of season -- Event and correlations 
6. Origin of tea/ coffee
7. Benefits people get from tea/ coffee

In [None]:
# n-grams
# Display-only cells: the top 50 bigrams and trigrams computed in the method step
tea_top_50_bi_grams

In [None]:
# Top 50 tea trigrams
tea_top_50_tri_grams

In [None]:
# Top 50 coffee bigrams
coffee_top_50_bi_grams

In [None]:
# Top 50 coffee trigrams
coffee_top_50_tri_grams

In [None]:
# Posts per date
# Count the non-null titles on each formatted_date, separately for each beverage
tea_posts_by_date = tea_df.groupby('formatted_date')
tea_num_posts_per_date = tea_posts_by_date['title'].count()

coffee_posts_by_date = coffee_df.groupby('formatted_date')
coffee_num_posts_per_date = coffee_posts_by_date['title'].count()

visualiser.display_time_series_stats(
    tea_num_posts_per_date,
    'count',
    'Number of posts per date for tea dataset',
    'Dates',
    'Number of posts',
    utils.green,
)
visualiser.display_time_series_stats(
    coffee_num_posts_per_date,
    'count',
    'Number of posts per date for coffee dataset',
    'Dates',
    'Number of posts',
    utils.red,
)

In [None]:
# Posts per author
# Only authors with more than `min_post_count` posts (5 by default) are charted
def display_posts_per_author(df, beverage_type, graph_colour, min_post_count=5):
    """Print the most active authors and chart those above a post-count threshold.

    Args:
        df: posts dataframe with at least 'author' and 'title' columns.
        beverage_type: label used in the chart title.
        graph_colour: colour passed on to visualiser.generate_bar_chart.
        min_post_count: authors are charted only when they have strictly more
            posts than this value. The default keeps the previous fixed
            threshold of 5.
    """
    # Posts per author, counted as non-null titles
    num_posts_per_author = df.groupby('author')['title'].count()

    num_posts_per_author_ordered = num_posts_per_author.reset_index(name='count').sort_values(['count'], ascending=False)
    # The printed preview is taken before filtering, so it may include the 'None' author
    print(f'Posts per author:\n{num_posts_per_author_ordered.head()}')

    filtered_df = num_posts_per_author_ordered[num_posts_per_author_ordered['count'] > min_post_count]
    # Drop posts whose author is recorded as the string 'None'
    filtered_df = filtered_df[filtered_df['author'] != 'None']

    num_posts_per_author_y = filtered_df['count']
    author_x = filtered_df['author']
    visualiser.generate_bar_chart(author_x, num_posts_per_author_y, graph_colour, f'Number of posts per author for {beverage_type} dataset', 'Author', 'Number of posts')

display_posts_per_author(tea_df, tea, utils.green)
display_posts_per_author(coffee_df, coffee, utils.yellow)

In [None]:
# Author influence on other subreddits
# Each chart is skipped when its author dataframe is empty
if not tea_authors_df.empty:
    visualiser.display_author_influence(tea_authors_df, tea)

if not coffee_authors_df.empty:
    visualiser.display_author_influence(coffee_authors_df, coffee)

In [None]:
# Tea and Coffee frequency by bean/leaf type 
# Each call below passes a list of search terms, the processed tokens of one
# beverage, a chart label and a colour to visualiser.generate_frequency_graph.
# NOTE(review): how terms are matched against tokens is defined in `visualiser` - confirm there.

unique_tea_leaves = ['black', 'green', 'white', 'yellow', 'oolong', 'dark']
visualiser.generate_frequency_graph(unique_tea_leaves, tea_processed_token_lists, 'Leaf types', utils.green)

In [None]:
# Coffee bean types
unique_coffee_beans = ['arabica', 'robusta', 'excelsa', 'liberica']
visualiser.generate_frequency_graph(unique_coffee_beans, coffee_processed_token_lists, 'Bean types', utils.yellow)

In [None]:
# Tea and Coffee types
unique_tea_styles = ['chai', 'thai', 'kashmiri', 'bubble', 'masala', 'milk', 'matcha', 'earl', 'ginger', 'pu', 'po', 'sweet', 'teh', 'cha', 'hojicha', 'yen', 'touareg']

visualiser.generate_frequency_graph(unique_tea_styles, tea_processed_token_lists, 'Tea styles', utils.green)

# NOTE(review): 'iced coffee' is the only two-word entry; if the processed
# tokens are single words it may never be counted - confirm the matching logic.
unique_coffee_styles = ['mocha', 'latte', 'long', 'double', 'short', 'espresso', 'macchiato', 'ristretto', 'cappuccino', 'irish', 'affogato', 'martini', 'decaf', 'americano', 'iced coffee']

visualiser.generate_frequency_graph(unique_coffee_styles, coffee_processed_token_lists, 'Coffee styles', utils.yellow)

In [None]:
# Tea and Coffee making styles
unique_tea_brews = ['infusion', 'cold', 'gong', 'press']
visualiser.generate_frequency_graph(unique_tea_brews, tea_processed_token_lists, 'Tea brews', utils.green)

unique_coffee_brews = ['espresso', 'filter', 'press']
visualiser.generate_frequency_graph(unique_coffee_brews, coffee_processed_token_lists, 'Coffee brews', utils.yellow)

In [None]:
# Tea and Coffee frequency by origin
# The origin lists are reused by the world map cells that follow

tea_origin = ['china', 'india', 'kenya', 'lanka', 'ceylon', 'turkey', 'vietnam', 'indonesia', 'bangladesh', 'argentina', 'uganda']
visualiser.generate_frequency_graph(tea_origin, tea_processed_token_lists, 'Tea origin', utils.green)

coffee_origin = ['brazil', 'vietnam', 'indonesia', 'colombia', 'ethiopia', 'honduras', 'peru', 'india', 'kenya']
visualiser.generate_frequency_graph(coffee_origin, coffee_processed_token_lists, 'Coffee origin', utils.yellow)


In [None]:
# World map for the tea origin terms
visualiser.create_world_map(tea_origin, tea_processed_token_lists, tea)

In [None]:
# World map for the coffee origin terms
visualiser.create_world_map(coffee_origin, coffee_processed_token_lists, coffee)

In [None]:
# popular countries
# Lower-cased country names from the 2007 rows of plotly's gapminder dataset
gapminder_2007 = px.data.gapminder().query("year==2007")
all_countries = [country.lower() for country in gapminder_2007['country']]
# Two extra terms appended after the gapminder names
# (presumably so that mentions of Sri Lanka are matched - confirm)
all_countries.extend(['lanka', 'ceylon'])

visualiser.create_world_map(all_countries, tea_processed_token_lists, 'all_tea')

visualiser.create_world_map(all_countries, coffee_processed_token_lists, 'all_coffee')

In [None]:
# Sentiment analysis
# Count
# Plots the 'Count' sentiment results computed in the method step, tea in
# green and coffee in red.
# NOTE(review): 'date' and 'sentiment' are presumably the field names read from each result - confirm in `visualiser`.
visualiser.generate_time_series(tea_count_sentiment_list, 'Sentiment based on count for tea dataset', 'date', 'sentiment', 'Date', 'Count sentiment', utils.green)

visualiser.generate_time_series(coffee_count_sentiment_list, 'Sentiment based on count for coffee dataset', 'date', 'sentiment', 'Date', 'Count sentiment', utils.red)

In [None]:
# Vader
# Same plots for the 'Vader' sentiment results
visualiser.generate_time_series(tea_vader_sentiment_list, 'Sentiment based on vader for tea dataset', 'date', 'sentiment', 'Date', 'Vader sentiment', utils.green)

visualiser.generate_time_series(coffee_vader_sentiment_list, 'Sentiment based on vader for coffee dataset', 'date', 'sentiment', 'Date', 'Vader sentiment', utils.red)

In [None]:
# Topic modelling
def display_topic_model(beverage_type_for_topic_modelling):
    """Show the topics of one beverage's LDA model and build its pyLDAvis panel.

    The tea model is used when the argument equals ``tea``; any other value
    selects the coffee model. Returns the panel created by
    ``pyLDAvis.lda_model.prepare``.
    """
    if beverage_type_for_topic_modelling == tea:
        lda_model, feature_names, term_counts, vectorizer = (
            tea_lda_model, tea_tf_feature_names, tea_tf, tea_tf_vectorizer)
    else:
        lda_model, feature_names, term_counts, vectorizer = (
            coffee_lda_model, coffee_tf_feature_names, coffee_tf, coffee_tf_vectorizer)

    # Word count passed to display_topics as the display limit
    top_words_to_display = 15
    visualiser.display_topics(lda_model, feature_names, top_words_to_display)

    panel = pyLDAvis.lda_model.prepare(lda_model, term_counts, vectorizer, mds='tsne')
    pyLDAvis.enable_notebook()
    return panel

In [None]:
# pyLDAvis
# Build the tea panel with display_topic_model and render it in the notebook
tea_panel = display_topic_model(tea)
pyLDAvis.display(tea_panel)

In [None]:
# Build and render the coffee panel
coffee_panel = display_topic_model(coffee)
pyLDAvis.display(coffee_panel)

In [None]:
# wordcloud
# Word cloud from the tea LDA model and its feature names
visualiser.display_word_cloud(tea_lda_model, tea_tf_feature_names)

In [None]:
# Word cloud from the coffee LDA model and its feature names
visualiser.display_word_cloud(coffee_lda_model, coffee_tf_feature_names)

In [None]:
# Display the ego graphs for the top users
# ego_graph_list holds one {'ego_graph', 'ego_name'} dict per graph collected
# by the ego graph cell

for item in ego_graph_list:
    ego_graph, ego_name = item.get('ego_graph'), item.get('ego_name')
    print(f'Ego name: {ego_name}\n\n')
    graph_title = f'Ego graph for {ego_name}'
    visualiser.display_networkx_graph(ego_graph, graph_title)

In [None]:
# Author influence graph
# Each graph is drawn only when the corresponding author dataframe has data;
# the unique authors come from utils.get_unique_authors.
if not tea_authors_df.empty:
    u_tea_authors = utils.get_unique_authors(tea_authors_df, tea)
    visualiser.author_influence_graph(tea_authors_df, u_tea_authors)

# Same steps for the coffee authors
if not coffee_authors_df.empty:
    u_coffee_authors = utils.get_unique_authors(coffee_authors_df, coffee)
    visualiser.author_influence_graph(coffee_authors_df, u_coffee_authors)

In [None]:
# Reply graph types
# Each name is used as the {graph_type} part of a graphml filename
type_reply, type_centrality, type_community = 'reply', 'centrality', 'community'

graph_types = [
    type_reply,
    type_centrality,
    type_community,
]

In [None]:
# For every graph type and beverage: load the stored graph, then show the tree
# graph and the linear threshold statistics
for graph_type in graph_types:
    for beverage_type_index in range(num_beverages):
        # Index 1 is coffee; every other index is tea
        beverage_type = coffee if beverage_type_index == 1 else tea

        print(f'\nBeverage type: {beverage_type}\nGraph type: {graph_type}\n')

        # Files are named <social_media_id>_<beverage>_<graph_type>_graph.graphml in the data folder
        prefix_filepath = f'{data_folder_path}/{social_media_id}_{beverage_type}_{graph_type}'
        graph_filepath = f'{prefix_filepath}_graph.graphml'
        loaded_graph = nx.readwrite.read_graphml(graph_filepath)

        print('\nTree graph\n')
        visualiser.display_tree_graph(trial_num, list_of_seeds, loaded_graph, prefix_filepath)

        # Small world graph display is currently switched off:
        # print('\nSmall world graph\n')
        # visualiser.display_barabasi_albert_graph(trial_num, list_of_seeds, loaded_graph, prefix_filepath)

        print('\nLinear threshold stats\n')
        visualiser.display_linear_threshold_stats(trial_num, list_of_seeds, loaded_graph, prefix_filepath)

In [None]:
# Timestamp printed when the notebook run finishes
process_ended_at = datetime.now()
print(f'Process end: {process_ended_at}')