In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook-wide Spark session: local master with 8
# threads, 5g configured for both executor and driver memory.
spark = (
    SparkSession.builder
    .master("local[8]")
    .appName("Tweet wrangeling")
    .config("spark.executor.memory", "5g")
    .config("spark.driver.memory", "5g")
    .getOrCreate()
)

# Disabled option, kept for reference:
# .config('spark.local.dir','~/.spark_tmp/')

In [2]:
# manually download reddit comments from google big table

In [4]:
# Load the raw Reddit comment dump. Despite the "csv" in the variable name,
# the directory is read as JSON; no schema is supplied, so Spark infers one.
# NOTE(review): later cells use "../data/..." (one directory level less) for
# the parquet paths - confirm which relative root is intended.
reddit_csv_path = "../../data/reddit-crypto/json/"
reddit_comments_raw = spark.read.format("json").load(reddit_csv_path)

In [5]:
# Show the schema inferred from the JSON files (numeric fields arrive as strings).
reddit_comments_raw.printSchema()

root
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: string (nullable = true)
 |-- gilded: string (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: string (nullable = true)
 |-- score: string (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: string (nullable = true)



In [6]:
# Preview five raw rows as a pandas DataFrame.
reddit_comments_raw.limit(5).toPandas()

Unnamed: 0,archived,author,author_flair_css_class,author_flair_text,body,controversiality,created_utc,distinguished,downs,gilded,id,link_id,name,parent_id,retrieved_on,score,score_hidden,subreddit,subreddit_id,ups
0,True,Old-and-grumpy,,,I worked for Microsoft once. Many of my friend...,0,1391833462,,0,0,cf9ys6w,t3_1xacqf,t1_cf9ys6w,t3_1xacqf,1432304512,-2,False,Bitcoin,t5_2s3qj,-2
1,True,slowmoon,,,"Agreed 100%. As much as I support bitcoin, I w...",0,1403620927,,0,0,cifrcra,t3_28xz4v,t1_cifrcra,t1_cifpkhw,1434380914,9,False,Bitcoin,t5_2s3qj,9
2,True,slimmtl,,,it seems inherent to the idea of bitcoin/minin...,0,1395752758,,0,0,cgbewr2,t3_21b4pm,t1_cgbewr2,t3_21b4pm,1433059780,12,False,Bitcoin,t5_2s3qj,12
3,True,gonzobon,,,No. I just think that buttcoin has their own n...,0,1394065595,,0,0,cfvfbgr,t3_1zo631,t1_cfvfbgr,t1_cfvf4mv,1432675849,-5,False,Bitcoin,t5_2s3qj,-5
4,True,Rassah,,,What?! Why? I just relay the information. Andr...,0,1397664079,,0,0,cgtzbmv,t3_236k5d,t1_cgtzbmv,t1_cgty4n6,1433381754,9,False,Bitcoin,t5_2s3qj,9


In [13]:
from pyspark.sql.functions import col, from_unixtime, to_timestamp
from pyspark.sql.types import DateType, BooleanType, IntegerType

bool_type = BooleanType()
int_type = IntegerType()
date_type = DateType()  # not used in this cell; kept in case other cells rely on it


def epoch_seconds_to_timestamp(column_name):
    """Convert a column of Unix epoch seconds (stored as strings) to a timestamp.

    The value is cast to a long and then directly to a timestamp, so each
    epoch maps to exactly one instant. The previous
    ``to_timestamp(from_unixtime(...))`` form rendered the epoch as a
    wall-clock string in the session time zone and parsed it back; in a zone
    with daylight saving time the repeated autumn hour is ambiguous, so that
    round trip could shift values by one hour.

    Args:
        column_name: Name of the source column; also used as the output alias.

    Returns:
        A Column of timestamp type named ``column_name``.
    """
    return col(column_name).cast("long").cast("timestamp").alias(column_name)


# Typed view of the raw comments: the column order is unchanged, numeric
# fields are cast from string to integer, and the two epoch fields become
# timestamps.
reddit_comments = reddit_comments_raw.select(
    "body",
    col("score_hidden").cast(bool_type),
    col("archived").cast(bool_type),
    "name",
    "author",
    "author_flair_text",
    col("downs").cast(int_type),
    epoch_seconds_to_timestamp("created_utc"),
    "subreddit_id",
    "link_id",
    "parent_id",
    col("score").cast(int_type),
    epoch_seconds_to_timestamp("retrieved_on"),
    col("controversiality").cast(int_type),
    col("gilded").cast(int_type),
    "id",
    "subreddit",
    col("ups").cast(int_type),
    "distinguished",
    "author_flair_css_class",
)

reddit_comments.printSchema()

root
 |-- body: string (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- name: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- downs: integer (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- retrieved_on: timestamp (nullable = true)
 |-- controversiality: integer (nullable = true)
 |-- gilded: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- ups: integer (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)



In [14]:
import pandas as pd

# Use fully qualified option names. The short pattern "max_columns" matches
# more than one option on recent pandas versions (display.max_columns and
# styler.render.max_columns), which makes set_option raise OptionError.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", 300)
# Preview five typed rows as a pandas DataFrame.
reddit_comments.limit(5).toPandas()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
0,I worked for Microsoft once. Many of my friends are still there. We all have iPhones or Androids for one reason. Apps. Microsoft will continue to consolidate in markets where it sucks and double down in markets where it doesn't. It sucks in mobile - which is just so goddamn tragic considering th...,False,True,t1_cf9ys6w,Old-and-grumpy,,0,2014-02-08 05:24:22,t5_2s3qj,t3_1xacqf,t3_1xacqf,-2,2015-05-22 16:21:52,0,0,cf9ys6w,Bitcoin,-2,,
1,"Agreed 100%. As much as I support bitcoin, I will ditch it if it stops being what it's supposed to be. I'm not going to support any currency that is even compatible with any ""coin validation"" scheme. I'm not going to support any currency that can be taken over by one or two mining pools. So eith...",False,True,t1_cifrcra,slowmoon,,0,2014-06-24 16:42:07,t5_2s3qj,t3_28xz4v,t1_cifpkhw,9,2015-06-15 17:08:34,0,0,cifrcra,Bitcoin,9,,
2,it seems inherent to the idea of bitcoin/mining that if you're gonna use your tx for storage you should be charged for that. \n\nA tx fee on OP_return size makes sense.\n(like vbutterin says).\n\nMy miners arent your free cloud storage.,False,True,t1_cgbewr2,slimmtl,,0,2014-03-25 14:05:58,t5_2s3qj,t3_21b4pm,t3_21b4pm,12,2015-05-31 10:09:40,0,0,cgbewr2,Bitcoin,12,,
3,No. I just think that buttcoin has their own narrative they push. If its a real story then more information from better sources will surface.,False,True,t1_cfvfbgr,gonzobon,,0,2014-03-06 01:26:35,t5_2s3qj,t3_1zo631,t1_cfvf4mv,-5,2015-05-26 23:30:49,0,0,cfvfbgr,Bitcoin,-5,,
4,What?! Why? I just relay the information. Andreas and Jan are the coders. They are the best.,False,True,t1_cgtzbmv,Rassah,,0,2014-04-16 18:01:19,t5_2s3qj,t3_236k5d,t1_cgty4n6,9,2015-06-04 03:35:54,0,0,cgtzbmv,Bitcoin,9,,


In [85]:
# Persist the typed comments as parquet, replacing any existing output.
# NOTE(review): the sentiment dataset read later in this notebook lives under
# this same directory ("../data/reddit-crypto/parquet/sentiment/"); confirm
# that overwriting the parent directory does not remove it.
reddit_parquet_path = "../data/reddit-crypto/parquet/"
reddit_comments.write.parquet(reddit_parquet_path, mode="overwrite")

In [71]:
# Reference total noted by the author: 12 602 728. It equals the sum of the
# four per-subreddit counts listed in the last cell of this notebook.
# It was previously a bare line in this cell, which is not valid Python.
reddit_comments.count()

11552230

In [80]:
# Count distinct (subreddit, body) pairs per subreddit.
(
    reddit_comments
    .select("subreddit", "body")
    .dropDuplicates()
    .groupBy("subreddit")
    .count()
    .show()
)

+---------+-------+
|subreddit|  count|
+---------+-------+
|ethtrader|1876878|
|      btc|1528438|
|  Bitcoin|6322365|
| ethereum| 473073|
+---------+-------+



In [84]:
# Import the functions module under a namespace rather than importing
# min/max/count by name, which would shadow the Python builtins of the same
# names for every later cell in the kernel.
from pyspark.sql import functions as F

# Per subreddit: earliest and latest comment time, and number of non-null bodies.
(
    reddit_comments
    .select("subreddit", "body", "created_utc")
    .groupBy("subreddit")
    .agg(F.min("created_utc"), F.max("created_utc"), F.count("body"))
    .show()
)

+---------+-------------------+-------------------+-----------+
|subreddit|   min(created_utc)|   max(created_utc)|count(body)|
+---------+-------------------+-------------------+-----------+
|ethtrader|2015-03-25 20:31:50|2018-04-01 01:58:31|    2154164|
|      btc|2013-02-18 15:11:49|2018-04-01 01:59:17|    1663637|
|  Bitcoin|2010-09-11 00:15:22|2018-04-01 01:59:59|    7200070|
| ethereum|2014-01-05 13:52:48|2018-04-01 01:59:17|     534359|
+---------+-------------------+-------------------+-----------+



In [87]:
# Load the sentiment-scored comments from parquet. This notebook only reads
# the dataset; the code that produced it is not part of this notebook.
reddit_sentiment_parquet_path = "../data/reddit-crypto/parquet/sentiment/"
reddit_sentiment = spark.read.format("parquet").load(reddit_sentiment_parquet_path)

In [88]:
# Preview five rows of the sentiment dataset as a pandas DataFrame.
reddit_sentiment.limit(5).toPandas()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class,pos_vader,neg_vader,neu_vader,compound_vader,polarity_textblob,subjectivity_textblob
0,"He's joking. Are people really *this* serious on the Internet the whole time.\n\nEdit; Not to the first part he picked out. It is true SegWit is a misguided and convoluted attempt to scale on chain that brings the possibility of a double spend to the masses, and is a controversial change that's ...",,,,Mbizzle135,,,2016-09-15 11:24:13,t5_2si5v,t3_52v0z8,t1_d7nktph,10,2016-10-05 20:44:19,0,0,d7nln47,btc,10,,,0.059,0.098,0.843,-0.2682,0.051768,0.416414
1,"Did you read the whole OP? Because this ignores the other half of /u/ydtm's point, which is that Core has *also* refused to raise the blocksize cap, thus forcing *all* normal transactions (not just micro-transactions) off chain.",,,,ForkiusMaximus,,,2016-06-08 06:13:32,t5_2si5v,t3_4mzm94,t1_d3zzca3,9,2016-07-19 02:20:47,0,0,d40ged6,btc,9,,,0.0,0.116,0.884,-0.5537,0.014583,0.397917
2,I find it hard to believe that something like this would be used as a 'joke'.,,,,Harbingerx81,,,2016-04-01 15:34:49,t5_2si5v,t3_4cutaf,t1_d1lopv6,11,2016-05-13 16:00:58,0,0,d1lqsyj,btc,11,,,0.157,0.088,0.755,0.2732,-0.291667,0.541667
3,This whole sub is like an eposide of Colbert Report. I can't tell if most comments are satire or stupidity.,,,,pizzaface18,,,2016-02-17 04:57:41,t5_2si5v,t3_465zpe,t1_d02qs3l,-36,2016-03-21 10:14:00,0,0,d02r3fr,btc,-36,,,0.112,0.129,0.759,-0.1027,0.033333,0.633333
4,Installing it now. Kudos to the Classic team.,,,,chinawat,,,2016-02-10 17:01:24,t5_2si5v,t3_45321l,t3_45321l,60,2016-03-19 19:49:31,0,0,czus5w6,btc,60,,,0.32,0.0,0.68,0.5106,0.166667,0.166667


In [None]:
# Scratch note: presumably comment counts per subreddit (source not recorded here).
# 1	ethereum	515 679
# 2	ethtrader	2 044 269
# 3	Bitcoin	6 890 250
# 4	btc	1 665 816

In [None]:
# Scratch note: presumably comment counts per subreddit (source not recorded here).
# 1	ethereum	583 107
# 2	Bitcoin	7 855 267
# 3	btc	1 814 773
# 4	ethtrader	2 349 581