I accidentally downloaded the Reddit data only from 2015 through March 2018, but it needed to extend to the end of May 2018. This notebook fixes that by scoring the missing comments and merging them into the existing sentiment dataset.

# Start Spark

In [1]:
import findspark
# Run findspark's initialisation before the pyspark imports in the cells below
# (presumably needed so that pyspark is importable here — confirm against the local setup).
findspark.init()

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import col, count, rand, collect_list, explode, struct, count, lit
from pyspark.sql.functions import pandas_udf, PandasUDFType, from_unixtime, to_timestamp

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the Spark session for this notebook: local master on all
# cores, 5g for executor and driver memory, UTC session time zone, and Arrow
# execution switched on.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config('spark.executor.memory', '5g')
    .config('spark.driver.memory', '5g')
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.execution.arrow.enabled", "true")
    .appName("Sentiment Analysis")
    .getOrCreate()
)
# Bare expression so the notebook displays the session summary.
spark

In [5]:
import pyarrow
from pyspark.sql.functions import udf

# Define Sentiment Function

In [6]:
from pyspark.sql.types import FloatType, StructType, StructField

# Struct returned by `sentiment_scores`. Every field is declared nullable
# because the UDF returns None for all six values when the input text is null.
schema = StructType([
    StructField("pos_vader", FloatType(), True),
    StructField("neg_vader", FloatType(), True),
    StructField("neu_vader", FloatType(), True),
    StructField("compound_vader", FloatType(), True),
    StructField("polarity_textblob", FloatType(), True),
    StructField("subjectivity_textblob", FloatType(), True)
])

# Lazily created VADER analyzer, reused across rows so that a new analyzer is
# not constructed for every single comment.
_vader_analyzer = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _vader_analyzer
    if _vader_analyzer is None:
        # Imported here so the dependency is resolved where the UDF executes.
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer


@udf(schema)
def sentiment_scores(sentence):
    """Score one piece of text with VADER and TextBlob.

    Parameters
    ----------
    sentence : the text to score; it is converted with ``str()`` before
        scoring. ``None`` is treated as missing input.

    Returns
    -------
    tuple
        ``(pos_vader, neg_vader, neu_vader, compound_vader,
        polarity_textblob, subjectivity_textblob)`` in the field order of
        ``schema``. All six values are ``None`` when ``sentence`` is ``None``.
    """
    if sentence is None:
        return (None, None, None, None, None, None)

    # Imported here so the dependency is resolved where the UDF executes.
    from textblob import TextBlob

    text = str(sentence)

    blob_sentiment = TextBlob(text).sentiment
    vader_scores = _get_vader_analyzer().polarity_scores(text)

    return (
        vader_scores.get("pos"),
        vader_scores.get("neg"),
        vader_scores.get("neu"),
        vader_scores.get("compound"),
        blob_sentiment.polarity,
        blob_sentiment.subjectivity,
    )


# Load missing data

In [7]:
# Location of the missing Reddit comments; read as JSON in the next cell.
missing_comments = "../../data/reddit-crypto/json/missing_reddit_comments"

In [8]:
# Read the raw comments as JSON; no schema is supplied here, so the column
# types are whatever Spark derives from the data.
m_comm_raw = spark.read.json(missing_comments)

In [9]:
# Preview at most 100 raw rows as a pandas DataFrame.
m_comm_raw.limit(100).toPandas()

Unnamed: 0,archived,author,author_flair_css_class,author_flair_text,body,controversiality,created_utc,distinguished,gilded,id,link_id,parent_id,retrieved_on,score,score_hidden,subreddit,subreddit_id
0,False,172,,,Blocking me on Twitter allows Ver and @bitcoin...,0,1528504084,,0,e0cwxvi,t3_8pce9f,t1_e0b6hek,1532347264,0,False,btc,t5_2si5v
1,False,172,,,Based on the white paper I thought it was inte...,0,1527134742,,0,dzhhyyb,t3_8loeiv,t1_dzh9zho,1527892122,-7,False,btc,t5_2si5v
2,False,172,,,"I'm not referring to retweeting, obviously. Ju...",0,1528503900,,0,e0cws77,t3_8pg1vn,t1_e0bcn1j,1532347186,1,False,btc,t5_2si5v
3,False,172,,,The censorship there is definitely worse and I...,0,1528504758,,0,e0cxipn,t3_8pce9f,t1_e0b4sry,1532347551,1,False,btc,t5_2si5v
4,False,172,,,The price shot up very quickly then was froze ...,0,1522683950,,0,dwofgfp,t3_88z5dg,t1_dwo9m6l,1525866987,3,False,btc,t5_2si5v
5,False,172,,,How was it premined? If you have 1 btc 4 years...,0,1527210159,,0,dzj8mnm,t3_8loeiv,t1_dzhoyci,1527927655,1,False,btc,t5_2si5v
6,False,172,,,This is another one of the accounts that you c...,1,1530310948,,0,e1iv05z,t3_8us2kp,t3_8us2kp,1533076512,-2,False,btc,t5_2si5v
7,False,172,,,The market and everyone with a clue clearly be...,1,1523401116,,0,dx5kj94,t3_8b9e7c,t3_8b9e7c,1526186391,2,False,btc,t5_2si5v
8,False,172,,,It doesn't matter and I know that its a featur...,0,1528503478,,0,e0cwfds,t3_8pce9f,t1_e0b7lt6,1532346995,2,False,btc,t5_2si5v
9,False,172,,,It would be one thing for Ver to mute or block...,0,1528634100,,0,e0fgs09,t3_8pce9f,t1_e0eypdm,1532391894,1,False,btc,t5_2si5v


# Standardise schema

In [10]:
# Spark type instances used for the casts below.
bool_type = BooleanType()
int_type = IntegerType()
date_type = DateType()  # defined here but not used by the projection below

# Project the raw comments directly into the final column order:
#   * flag columns are cast to boolean and counters to integer,
#   * the epoch-second columns are converted to timestamps,
#   * `name`, `ups` and `downs` are filled in as typed null columns.
m_comm = m_comm_raw.select(
    "body",
    col("score_hidden").cast(bool_type),
    col("archived").cast(bool_type),
    lit(None).cast("string").alias("name"),
    "author",
    "author_flair_text",
    lit(None).cast("int").alias("downs"),
    to_timestamp(from_unixtime("created_utc")).alias("created_utc"),
    "subreddit_id",
    "link_id",
    "parent_id",
    col("score").cast(int_type),
    to_timestamp(from_unixtime("retrieved_on")).alias("retrieved_on"),
    col("controversiality").cast(int_type),
    col("gilded").cast(int_type),
    "id",
    "subreddit",
    lit(None).cast("int").alias("ups"),
    "distinguished",
    "author_flair_css_class",
)

m_comm.printSchema()

root
 |-- body: string (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- name: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- downs: integer (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- retrieved_on: timestamp (nullable = true)
 |-- controversiality: integer (nullable = true)
 |-- gilded: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- ups: integer (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)



# Apply UDF to data

In [11]:
# Columns carried over unchanged from the standardised comments frame.
comment_columns = [
    "body",
    "score_hidden",
    "archived",
    "name",
    "author",
    "author_flair_text",
    "downs",
    "created_utc",
    "subreddit_id",
    "link_id",
    "parent_id",
    "score",
    "retrieved_on",
    "controversiality",
    "gilded",
    "id",
    "subreddit",
    "ups",
    "distinguished",
    "author_flair_css_class",
]

# Fields of the `sentiment` struct, flattened into top-level columns below.
sentiment_fields = [
    "pos_vader",
    "neg_vader",
    "neu_vader",
    "compound_vader",
    "polarity_textblob",
    "subjectivity_textblob",
]

# Score every comment body, then keep the original columns followed by the
# individual sentiment fields. The expression is wrapped in parentheses so it
# needs no backslash line continuations.
result = (
    m_comm
    .withColumn('sentiment', sentiment_scores(m_comm.body))
    .select(
        *comment_columns,
        *["sentiment." + field for field in sentiment_fields]
    )
)

In [12]:
# Persist the scored comments as parquet, replacing any existing output at this path.
result.write.mode("overwrite").parquet("../../data/reddit-crypto/parquet/missing_sentiment")

# Read the data back as a sanity check

In [13]:
# Read the written parquet back and show at most 10 rows as a pandas DataFrame.
spark.read.parquet("../../data/reddit-crypto/parquet/missing_sentiment").limit(10).toPandas()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,...,subreddit,ups,distinguished,author_flair_css_class,pos_vader,neg_vader,neu_vader,compound_vader,polarity_textblob,subjectivity_textblob
0,Blocking me on Twitter allows Ver and @bitcoin...,False,False,,172,,,2018-06-09 00:28:04,t5_2si5v,t3_8pce9f,...,btc,,,,0.0,0.14,0.86,-0.7906,-0.0375,0.495833
1,Based on the white paper I thought it was inte...,False,False,,172,,,2018-05-24 04:05:42,t5_2si5v,t3_8loeiv,...,btc,,,,0.166,0.184,0.65,0.0139,0.25,0.25
2,"I'm not referring to retweeting, obviously. Ju...",False,False,,172,,,2018-06-09 00:25:00,t5_2si5v,t3_8pg1vn,...,btc,,,,0.0,0.062,0.938,-0.3907,-0.15625,0.34375
3,The censorship there is definitely worse and I...,False,False,,172,,,2018-06-09 00:39:18,t5_2si5v,t3_8pce9f,...,btc,,,,0.074,0.135,0.79,-0.8633,-0.202814,0.682792
4,The price shot up very quickly then was froze ...,False,False,,172,,,2018-04-02 15:45:50,t5_2si5v,t3_88z5dg,...,btc,,,,0.0,0.108,0.892,-0.8769,-0.028368,0.535833
5,How was it premined? If you have 1 btc 4 years...,False,False,,172,,,2018-05-25 01:02:39,t5_2si5v,t3_8loeiv,...,btc,,,,0.108,0.077,0.816,-0.2076,-0.3,0.95
6,This is another one of the accounts that you c...,False,False,,172,,,2018-06-29 22:22:28,t5_2si5v,t3_8us2kp,...,btc,,,,0.047,0.085,0.867,-0.594,0.377143,0.487143
7,The market and everyone with a clue clearly be...,False,False,,172,,,2018-04-10 22:58:36,t5_2si5v,t3_8b9e7c,...,btc,,,,0.145,0.0,0.855,0.5556,0.1,0.383333
8,It doesn't matter and I know that its a featur...,False,False,,172,,,2018-06-09 00:17:58,t5_2si5v,t3_8pce9f,...,btc,,,,0.073,0.106,0.821,-0.6691,0.106818,0.306899
9,It would be one thing for Ver to mute or block...,False,False,,172,,,2018-06-10 12:35:00,t5_2si5v,t3_8pce9f,...,btc,,,,0.087,0.17,0.743,-0.8702,0.0,0.35


# Merge data

In [14]:
# Scored missing comments, read back from the parquet output written above.
s_comm = spark.read.parquet("../../data/reddit-crypto/parquet/missing_sentiment")

In [15]:
# Existing sentiment dataset that the missing rows are unioned with below.
c_comm = spark.read.parquet("../../data/reddit-crypto/parquet/sentiment/")

In [16]:
# Print the schema of the missing-data frame for comparison with c_comm's schema.
s_comm.printSchema()

root
 |-- body: string (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- name: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- downs: integer (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- retrieved_on: timestamp (nullable = true)
 |-- controversiality: integer (nullable = true)
 |-- gilded: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- ups: integer (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- pos_vader: float (nullable = true)
 |-- neg_vader: float (nullable = true)
 |-- neu_vader: float (nullable = true)
 |-- compound_vader: float (nullable = 

In [17]:
# Print the schema of the existing sentiment frame for comparison with s_comm's schema.
c_comm.printSchema()

root
 |-- body: string (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- name: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- downs: integer (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- retrieved_on: timestamp (nullable = true)
 |-- controversiality: integer (nullable = true)
 |-- gilded: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- ups: integer (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- pos_vader: float (nullable = true)
 |-- neg_vader: float (nullable = true)
 |-- neu_vader: float (nullable = true)
 |-- compound_vader: float (nullable = 

In [18]:
# Append the newly scored comments to the existing sentiment data. Columns are
# matched by name rather than by position, so a different column order in the
# two parquet datasets cannot silently misalign values.
c_new = c_comm.unionByName(s_comm)

In [19]:
# Write the combined dataset as parquet, replacing any existing output at this path.
c_new.write.mode("overwrite").parquet("../../data/reddit-crypto/parquet/complete_sentiment")

In [20]:
# Read the combined dataset back from disk for inspection.
appended_data = spark.read.parquet("../../data/reddit-crypto/parquet/complete_sentiment")

In [22]:
from pyspark.sql.functions import desc
# Show the five rows with the latest created_utc, i.e. the newest comments in the combined data.
appended_data.sort(desc("created_utc")).limit(5).toPandas()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,...,subreddit,ups,distinguished,author_flair_css_class,pos_vader,neg_vader,neu_vader,compound_vader,polarity_textblob,subjectivity_textblob
0,"I've only ever shipped from USA, so not sure a...",False,False,,hunk_quark,,,2018-06-30 23:59:56,t5_2si5v,t3_8v3vv4,...,btc,,,,0.0,0.138,0.862,-0.305,-0.125,0.944444
1,Bullshit.,False,False,,flamingboard,,,2018-06-30 23:59:33,t5_2si5v,t3_8v3sk5,...,btc,,,,0.0,1.0,0.0,-0.5859,0.0,0.0
2,It was never about Moons and Lambos. But the t...,False,False,,DawnPhantom,redditor for 3 months,,2018-06-30 23:59:24,t5_2s3qj,t3_8v5pbd,...,Bitcoin,,,noob,0.139,0.152,0.71,-0.2206,0.208333,0.487821
3,Makes no sense for me to pay sales tax and cap...,False,False,,_meowmix_5,redditor for 3 weeks,,2018-06-30 23:59:13,t5_2s3qj,t3_8umb1r,...,Bitcoin,,,noob,0.151,0.129,0.719,0.1531,0.5,0.5
4,A correction is when things go bonkers and the...,False,False,,Turil,,,2018-06-30 23:59:03,t5_2s3qj,t3_8v1p71,...,Bitcoin,,,,0.0,0.0,1.0,0.0,0.216667,0.383333
