#### Da Hell?! 

#### Step 0 : : Set up

In [None]:
from client import client
import pre_processing
import utils
import visualiser

from praw.models import MoreComments

import string

import nltk 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
nltk.download('vader_lexicon')

import pandas as pd
from datetime import datetime
import ast

import os
from dotenv import load_dotenv

load_dotenv()

In [None]:
# Shared NLP helpers reused by every pre-processing call in this notebook.
tokeniser = TweetTokenizer()
stemmer = nltk.stem.PorterStemmer()

# Stop-word list: NLTK's English stop words, every ASCII punctuation character,
# and a few extra tokens (retweet markers, ellipses, stray quotes and dashes).
stop_words = [
    *stopwords.words('english'),
    *string.punctuation,
    'rt', 'via', '...', '…', '"', "'", '`', '-', '..',
]

In [None]:
# Accumulators filled in by the collection and pre-processing cells.
collected_posts, unprocessed_token_lists, processed_token_lists = [], [], []

# Empty frame with one column per stored post attribute; rows are appended later.
posts_df = pd.DataFrame(
    columns=[
        'title',
        'utc_date',
        'formatted_date',
        'desc',
        'author',
        'rating',
        'num_comments',
        'unprocessed_tokens',
        'processed_tokens',
    ]
)

In [None]:
# Runtime configuration is read from the environment; a missing variable
# raises KeyError.
# COLLECT-DATA is a flag: only the exact string "True" switches collection on.
collect_data = os.environ["COLLECT-DATA"] == "True"
# DATA-FILEPATH is the CSV file the posts are saved to and loaded from.
data_filepath = os.environ["DATA-FILEPATH"]

#### Step 1 : : Data collection

In [None]:
# Fetch posts from Reddit; this cell does nothing unless collection is enabled.
if collect_data:
    # Several subreddit names joined with '+' into one query string.
    subreddit_names = 'tea+coffee+TeaPorn+pourover'

    reddit_client = client()
    subreddit = reddit_client.subreddit(subreddit_names)

    # Materialise the listing of top posts into a list (a single post is requested).
    collected_posts = list(subreddit.top(limit=1))

#### Step 2 : : Pre-processing and Exploration


In [None]:
# Build the posts dataframe: one row per collected Reddit post, holding the post
# details plus the unprocessed and pre-processed tokens of the post text and of
# its comments. The result is merged with any previously saved data and written
# back to data_filepath.
if collect_data:
    for post in collected_posts:
        post_title = post.title
        post_description = post.selftext
        post_title_description = post_title + " " + post_description
        # Reduce the creation timestamp to a calendar date. datetime.fromtimestamp
        # without a tz argument interprets the timestamp in the local timezone.
        post_date = pd.to_datetime(datetime.fromtimestamp(post.created_utc).strftime("%d/%m/%Y"), dayfirst=True)

        unprocessed_tokens = tokeniser.tokenize(post_title_description)
        unprocessed_token_lists.append(unprocessed_tokens)

        # Arguments: text, tokeniser, stemmer, stop_words, and a boolean flag that is
        # True for post text and False for comments (meaning defined in pre_processing).
        processed_tokens = pre_processing.process(post_title_description, tokeniser, stemmer, stop_words, True)
        processed_token_lists.append(processed_tokens)

        # A post may have no author object (presumably a deleted account);
        # store the placeholder string 'None' in that case.
        if post.author is None:
            post_author = 'None'
        else:
            post_author = post.author.name

        for comment in post.comments:
            # MoreComments entries are placeholders without a body; skip them.
            if isinstance(comment, MoreComments):
                continue

            # Fall back to an empty string only when the body is missing.
            # (The previous condition was inverted and threw away every real
            # comment body.)
            comment_text = comment.body or ''

            # Comment tokens are recorded twice: as their own entry in the global
            # token lists, and appended to the owning post's row tokens.
            unprocessed_comment_tokens = tokeniser.tokenize(comment_text)
            unprocessed_tokens = unprocessed_tokens + unprocessed_comment_tokens
            unprocessed_token_lists.append(unprocessed_comment_tokens)

            processed_comment_tokens = pre_processing.process(comment_text, tokeniser, stemmer, stop_words, False)
            processed_tokens = processed_tokens + processed_comment_tokens
            processed_token_lists.append(processed_comment_tokens)

        # Append the row at the next free integer index.
        posts_df.loc[len(posts_df.index)] = [post_title, post.created_utc, post_date, post_description, post_author, post.upvote_ratio, post.num_comments, unprocessed_tokens, processed_tokens]

    # Read old data file if it exists to append new data collected, if not save new file
    old_posts_df = pd.DataFrame(columns=['title', 'utc_date', 'formatted_date', 'desc', 'author', 'rating','num_comments', 'unprocessed_tokens', 'processed_tokens'])
    if os.path.isfile(data_filepath):
        old_posts_df = pd.read_csv(data_filepath, header=0)

        # Previously saved rows come first, newly collected rows after them.
        posts_df = pd.concat([old_posts_df, posts_df], ignore_index=True)

    posts_df.to_csv(data_filepath, index=False, header=True)

len(posts_df)

In [None]:
# Load previously saved posts from the CSV file when collection is switched off.
if not collect_data:
    posts_df = pd.read_csv(data_filepath)

    # The token columns come back from the CSV as text; evaluate each cell as a
    # Python literal and convert the result to a list of tokens.
    def _parse_tokens(cell):
        return list(ast.literal_eval(cell))

    unprocessed_token_lists = posts_df['unprocessed_tokens'].apply(_parse_tokens)
    processed_token_lists = posts_df['processed_tokens'].apply(_parse_tokens)

posts_df

In [None]:
# posts_df holds one row per post, so the row count is the number of posts.
total_num_posts = posts_df.shape[0]
print(f'Total number of posts: {total_num_posts}')

In [None]:
# Add up the per-post comment counts stored in the num_comments column.
total_num_comments = posts_df.num_comments.sum()
print(f'Total number of comments: {total_num_comments}')

In [None]:
# A data item is either a post or a comment.
total_data_items = total_num_comments + total_num_posts
print(f'Total data items: {total_data_items}')

In [None]:
# Flatten the per-item token lists into a single list of raw tokens.
flatted_unprocessed_token_list = []
for item_tokens in unprocessed_token_lists:
    flatted_unprocessed_token_list.extend(item_tokens)

# Term frequencies over the unprocessed tokens.
visualiser.compute_term_freq(flatted_unprocessed_token_list, True)

In [None]:
# Flatten the per-item processed token lists into a single list of tokens.
# NOTE(review): the flat list is bound back to processed_token_lists, so running
# this cell a second time would split each token into individual characters.
flat_processed_tokens = []
for item_tokens in processed_token_lists:
    flat_processed_tokens.extend(item_tokens)
processed_token_lists = flat_processed_tokens

# Term frequencies over the processed tokens, passing utils.red as the third argument.
visualiser.compute_term_freq(processed_token_lists, True, utils.red)

#### Step 3 : : Method

Methods explored:
1. Sentiment analysis
2. Topic Modelling
3. TBA

#### Step 4 : : Analysis

Questions to explore:
1. Which is the superior beverage?
2. What are the most talked-about topics?
3. Which parts of the world favour which beverage? What are their feelings and opinions?
4. Since we're in Melbourne, maybe a special look into Melbourne?