In [1]:
import os

from pyspark.sql import SparkSession

# Spark passes spark.local.dir to the JVM verbatim. "~" is a shell feature and
# is not expanded there, so resolve the home directory in Python first;
# otherwise scratch files end up in a literal "~" directory under the cwd.
spark_tmp_dir = os.path.expanduser('~/.spark_tmp/')

# Local session with 8 worker threads; scratch space goes to spark_tmp_dir.
spark = (
    SparkSession.builder
    .master("local[8]")
    .config('spark.executor.memory', '5g')
    .config('spark.driver.memory', '5g')
    .appName("Tweet wrangeling")
    .config('spark.local.dir', spark_tmp_dir)
    .getOrCreate()
)

# Sources

In [2]:
# Parquet directories with the tweet data, one per coin.
eth_twitter_path = "../../data/tweets/ethereum/parquet/sentiment/"  # twitter eth
btc_twitter_path = "../../data/tweets/bitcoin/parquet/sentiment/"  # twitter btc

# Read each directory into its own DataFrame (ethereum first, then bitcoin).
eth_twitter, btc_twitter = (
    spark.read.parquet(path) for path in (eth_twitter_path, btc_twitter_path)
)

In [3]:
from pyspark.sql.functions import lit

# Label every row with the coin it belongs to, then stack both sources into
# a single DataFrame.
# NOTE: union() matches columns by position, not by name, so both inputs are
# assumed to share the same column order.
twitter = (
    eth_twitter.withColumn("crypto_tag", lit("eth"))
    .union(btc_twitter.withColumn("crypto_tag", lit("btc")))
)

In [4]:
# Reddit data: parquet directory and the DataFrame loaded from it.
reddit_path = "../../data/reddit-crypto/parquet/sentiment/"
reddit = spark.read.format("parquet").load(reddit_path)

# Aggregation

The goal is to aggregate the scores into 5-minute intervals (as with the ticker data). From there they can be rolled up as needed to match the ticker data time indices.

Aggregations:
* ratio of pos/neg
* average 

In [5]:
# Show the column names and types of the combined (eth + btc) tweet DataFrame.
twitter.printSchema()

root
 |-- username: string (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- retweets: integer (nullable = true)
 |-- favorites: integer (nullable = true)
 |-- geo: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- pos_vader: float (nullable = true)
 |-- neg_vader: float (nullable = true)
 |-- neu_vader: float (nullable = true)
 |-- compound_vader: float (nullable = true)
 |-- polarity_textblob: float (nullable = true)
 |-- subjectivity_textblob: float (nullable = true)
 |-- crypto_tag: string (nullable = false)



In [32]:
from pyspark.sql.functions import unix_timestamp, col

# Keep only the tweet timestamp and the coin tag, and add the timestamp
# converted to Unix epoch seconds as a new "date_time" column.
date_time = (
    twitter
    .select("datetime", "crypto_tag")
    .withColumn(
        "date_time",
        unix_timestamp(col("datetime"), 'yyyy-MM-dd HH:mm:ss'),
    )
)


In [34]:
from pyspark.sql.functions import col, floor, desc, from_unixtime

# Bucket width in seconds. The aggregation target is 5-minute intervals, so
# this must be 60 * 5 (the previous 60*10 produced 10-minute buckets).
interval = 60 * 5

# Count tweets per (time bucket, coin):
#   time_range   - index of the bucket the epoch-second timestamp falls into
#   grouped_date - start of that bucket, rendered back as a timestamp string
(
    date_time
    .withColumn("time_range", floor(col("date_time") / interval))
    .groupBy("time_range", "crypto_tag")
    .count()
    .withColumn("grouped_date", from_unixtime(col("time_range") * interval))
    .sort(desc("grouped_date"), "crypto_tag")
    .show()
)

+----------+----------+-----+-------------------+
|time_range|crypto_tag|count|       grouped_date|
+----------+----------+-----+-------------------+
|   5092391|       btc|  100|2018-05-30 23:55:00|
|   5092391|       eth|   34|2018-05-30 23:55:00|
|   5092390|       btc|  125|2018-05-30 23:50:00|
|   5092390|       eth|   34|2018-05-30 23:50:00|
|   5092389|       btc|  113|2018-05-30 23:45:00|
|   5092389|       eth|   36|2018-05-30 23:45:00|
|   5092388|       btc|  115|2018-05-30 23:40:00|
|   5092388|       eth|   28|2018-05-30 23:40:00|
|   5092387|       btc|   91|2018-05-30 23:35:00|
|   5092387|       eth|   27|2018-05-30 23:35:00|
|   5092386|       btc|  118|2018-05-30 23:30:00|
|   5092386|       eth|   26|2018-05-30 23:30:00|
|   5092385|       btc|   92|2018-05-30 23:25:00|
|   5092385|       eth|   19|2018-05-30 23:25:00|
|   5092384|       btc|  115|2018-05-30 23:20:00|
|   5092384|       eth|   39|2018-05-30 23:20:00|
|   5092383|       btc|  132|2018-05-30 23:15:00|
