In [1]:
#core libraries
import json
import pandas as pd
import numpy as np
from datetime import datetime, timezone
import sys

#vader
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import ipywidgets as widgets
from ipywidgets import *
from ipywidgets import interact, interact_manual
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns

import re
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
import demoji
demoji.download_codes()

#Display Setting
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

#detect language
from textblob import TextBlob  

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ace\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ace\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading emoji data ...
... OK (Got response in 1.03 seconds)
Writing emoji data to C:\Users\Ace\.demoji\codes.json ...
... OK


In [2]:
wsb_df = pd.read_csv("submissions_full.csv")

In [3]:
wsb_df['created_utc'] = pd.to_datetime(wsb_df['created_utc'])
wsb_df['created_utc'] = wsb_df['created_utc'].dt.strftime('%Y-%m-%d')

wsb_df.sort_values(by="created_utc",inplace=True)

In [4]:
data = wsb_df.copy()

In [5]:
## Functions
## Useful functions
def seconds_time(s):
    seconds_set = " "+str(s%60)+"sec" if s%60!=0 else ""
    s = s//60
    
    mins_set = " "+str(s%60)+"min" if s%60!=0 else ""
    s = s//60
    
    hrs_set = " "+str(s%24)+"hr" if s%24!=0 else ""
    s = s//24
    
    s = str(s)+"days" if s!=0 else ""
    
    return s+hrs_set+mins_set+seconds_set
 
import pyspark.sql.functions as F
from pyspark.sql.types import *
 
## Functions
def language_detection(text):
    return TextBlob(str(text)).detect_language()
 
#Pre-Processing
 
def remove_numbers(text):
    return re.sub(r'\d+', '', text)
 
def remove_specials(text):
    text = text.replace("[^a-zA-Z#]", " ")
    return re.sub(r'[^a-zA-Z0-9 ]',r'',text)
 
def remove_whitespace(text):
    return " ".join(text.split())
 
def remove_punctuations(text):
    words = nltk.word_tokenize(text)
    punt_removed = [w for w in words if w.lower() not in string.punctuation]
    return " ".join(punt_removed)
 
def remove_stopwords(text, lang='english'):
    words = nltk.word_tokenize(text)
    lang_stopwords = stopwords.words(lang)
    stopwords_removed = [w for w in words if w.lower() not in lang_stopwords]
    return " ".join(stopwords_removed)
 
def remove_all(text):
    if text is None:
        return ''
    elif text == '[removed]':
        return ''
    elif len(text) <= 0:
        return ''
    else:
        text = remove_numbers(text)
        text = remove_specials(text)
        text = remove_whitespace(text)
        text = remove_punctuations(text)
        return text

# Auxiliar functions
from pyspark.sql.types import *
def equivalent_type(f):
    if f == 'datetime64[ns]': return TimestampType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    elif f == 'float64': return FloatType()
    else: return StringType()

def define_structure(string, format_type):
    try: typo = equivalent_type(format_type)
    except: typo = StringType()
    return StructField(string, typo)
 
# Given pandas dataframe, it will return a spark's dataframe.
def pandas_to_spark(pandas_df):
    from pyspark.sql import SQLContext
    columns = list(pandas_df.columns)
    types = list(pandas_df.dtypes)
    struct_list = []
    for column, typo in zip(columns, types): 
        struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    return sqlContext.createDataFrame(pandas_df, p_schema)

In [6]:
data['title'] = data['title'].apply(
        lambda x: remove_all(x)
)

def emoji_to_words(text):
    words = nltk.word_tokenize(text)
    for i in range(len(words)):
        a = list(demoji.findall(words[i]).values())
        try:
            words[i] = a[0]
        except:
            pass
    return " ".join(words)

data['title'] = data['title'].apply(
        lambda x: emoji_to_words(x)
)

In [7]:
# VADER - daily positive/negative sentiment
def vader_sentiment(title):
    if title == '':
        return 0.0
    else:
        analyser = SentimentIntensityAnalyzer()
        sentiment_score=0.0
        try:
            sentiment_score=sentiment_score+analyser.polarity_scores(title)['compound']
        except TypeError:
            sentiment_score=0.0
        return sentiment_score

def vader_sentiment_indicator(title):
    if title == '':
        return 0
    else:
        analyser = SentimentIntensityAnalyzer()
        sentiment_score=0.0
        try:
            sentiment_score=sentiment_score+analyser.polarity_scores(title)['compound']
            sentiment_score = 1
        except TypeError:
            sentiment_score=0
        return sentiment_score

def mean_list(data):
    data = [i for i in data if i is not None]
    if len(data) <= 0:
        return 0.0
    return np.mean(data)

In [8]:
#VADER - daily positive/negative sentiment
data['sentiment_score'] = data['title'].apply(
        lambda x: vader_sentiment(x)
)

In [9]:
#BULLISH/BEARISH SCORE - Sentiment Analysis Using Keywords
#creating a list of words indicating bullish trend and  list of words indicating bearish trend
bull_words = ['call', 'long', 'all in', 'moon', 'going up', 
             'rocket', 'buy', 'long term', 'green',
             'to the moon','doubling down','dd',
             'tendies','yolo','paper hands','jpow'
             'andromeda','rocket ships','yoloed','rocket']

bear_words = ['put', 'short', 'going down', 
             'drop', 'bear', 'sell', 'red',
             'guh','stonks','diamond hands',
             'btfd','bag holder','hold the line']

In [10]:
def calculate_score_bullbear(text, word_list):
    score = 0.0
    for word in word_list:
        if word in text:
            score += 1.0
    return score

In [11]:
data['bull_scores'] = data['title'].apply(
        lambda x: calculate_score_bullbear(x, bull_words)
)
data['bear_scores'] = data['title'].apply(
        lambda x: calculate_score_bullbear(x, bear_words)
)

In [12]:
def date_convert(date):
    try:
        final_date = pd.to_datetime(date, errors='coerce').strftime('%Y-%m-%d')
    except:
        final_date = None
    return final_date
data['Date'] = data['created_utc'].apply(
        lambda x: date_convert(x)
)

In [13]:
data = data.sort_values(['subreddit','Date'], ascending=[True,True]).reset_index(drop=True)

In [14]:
data_group = data.groupby(["Date",'subreddit']) \
    .agg({"score":'mean',
          "sentiment_score":'mean',
          "bull_scores":'mean',
          "bear_scores":'mean',
          "num_comments":"sum",
          "num_crossposts":"sum"
         }).reset_index()
data_group['Date'] = data_group['Date'].map({
    date_i: pd.to_datetime(date_i) for date_i in data_group['Date'].unique()
})

In [20]:
data_group

Unnamed: 0,Date,subreddit,score,sentiment_score,bull_scores,bear_scores,num_comments,num_crossposts
0,2019-01-01,RobinHood,1.000000,0.014529,0.117647,0.117647,155,0
1,2019-01-01,SecurityAnalysis,1.000000,-0.087800,0.000000,0.500000,19,0
2,2019-01-01,investing,1.000000,0.050425,0.096154,0.096154,948,0
3,2019-01-01,stocks,1.000000,0.014819,0.095238,0.095238,262,0
4,2019-01-01,wallstreetbets,1.000000,0.043971,0.130000,0.170000,2475,0
...,...,...,...,...,...,...,...,...
3927,2021-02-28,RobinHood,0.986111,0.052422,0.236111,0.250000,12,0
3928,2021-02-28,SecurityAnalysis,56.666667,-0.025517,0.000000,0.000000,16,1
3929,2021-02-28,investing,34.500000,0.068067,0.100000,0.040000,997,0
3930,2021-02-28,stocks,50.553571,0.026812,0.089286,0.125000,2020,7


In [19]:
data_group.to_csv("Sentiment_Data_Subredditts.csv", index=False)

In [16]:
df_full_group = data.groupby(["Date"]) \
    .agg({"score":'mean',
          "sentiment_score":'mean',
          "bull_scores":'mean',
          "bear_scores":'mean',
          "num_comments":"sum",
          "num_crossposts":"sum"
         }).reset_index()

In [17]:
df_full_group

Unnamed: 0,Date,score,sentiment_score,bull_scores,bear_scores,num_comments,num_crossposts
0,2019-01-01,1.000000,0.038551,0.114583,0.140625,3859,0
1,2019-01-02,1.000000,0.087764,0.040179,0.093750,5218,0
2,2019-01-03,1.000000,0.045296,0.114198,0.117284,9197,0
3,2019-01-04,1.000000,0.044735,0.103152,0.111748,10356,0
4,2019-01-05,1.000000,0.029266,0.103679,0.137124,11137,0
...,...,...,...,...,...,...,...
782,2021-02-24,107.942598,0.077047,0.145015,0.093656,10524,10
783,2021-02-25,68.073239,0.070610,0.149296,0.104225,29657,38
784,2021-02-26,20.563014,0.051488,0.145205,0.098630,18596,17
785,2021-02-27,93.230065,0.075208,0.117647,0.108497,210012,30


In [18]:
df_full_group.to_csv("Sentiment_Data.csv", index=False)