# Spark Setup

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, desc
from pyspark.sql.types import ArrayType, StringType, MapType, FloatType

In [3]:
# Local Spark session using every available core, with timestamps handled in UTC.
# NOTE(review): in local mode the executor memory setting presumably has no
# effect (the driver JVM does the work) - confirm before tuning it.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config('spark.executor.memory', '5g')
    .config('spark.driver.memory', '5g')
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.execution.arrow.enabled", "true")
    .appName("Sentiment Analysis")
    .getOrCreate()
)
spark

In [4]:
# Re-display the session summary (repeats the last expression of the previous cell).
spark

# Twitter Sentiments

In [4]:
from pyspark.sql.functions import col, concat, lit, date_format, desc, asc


# Ethereum tweets with sentiment scores, read from parquet.
parquet_eth_path = "../../data/tweets/ethereum/parquet/sentiment/"
eth = spark.read.parquet(parquet_eth_path)

In [5]:
from pyspark.sql.functions import col, concat, lit, date_format, desc, asc, min, max


In [6]:
# Bitcoin tweets with sentiment scores, read from parquet.
# The path gets its own name so that it no longer overwrites the Ethereum path
# variable defined by the earlier cell.
parquet_btc_path = "../../data/tweets/bitcoin/parquet/sentiment/"
btc = spark.read.parquet(parquet_btc_path)

In [7]:
# Ethereum tweets reduced to the sentiment scores and timestamp, tagged with the coin name.
eth_ss = (
    eth.select(
        "pos_vader", "neg_vader", "neu_vader", "compound_vader",
        "polarity_textblob", "subjectivity_textblob", "datetime",
    )
    .withColumn("Cryptocurrency", lit("Ethereum"))
)

In [8]:
# Bitcoin tweets reduced to the sentiment scores and timestamp, tagged with the coin name.
btc_ss = (
    btc.select(
        "pos_vader", "neg_vader", "neu_vader", "compound_vader",
        "polarity_textblob", "subjectivity_textblob", "datetime",
    )
    .withColumn("Cryptocurrency", lit("Bitcoin"))
)

In [9]:
# Stack the Ethereum and Bitcoin tweets. union() matches columns by position,
# which lines up here because both frames select the same columns in the same
# order. Rows containing any null are then dropped.
# NOTE(review): eth_ss and btc_ss themselves are NOT null-filtered - confirm
# this asymmetry is intended.
both_ss = eth_ss.union(btc_ss).na.drop()

In [10]:
# Inspect the column names and types of the Ethereum tweet frame.
eth_ss.printSchema()

root
 |-- pos_vader: float (nullable = true)
 |-- neg_vader: float (nullable = true)
 |-- neu_vader: float (nullable = true)
 |-- compound_vader: float (nullable = true)
 |-- polarity_textblob: float (nullable = true)
 |-- subjectivity_textblob: float (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- Cryptocurrency: string (nullable = false)



In [11]:
from pyspark.sql.functions import when, lit, avg, month, year, dayofmonth, minute, hour
from pyspark.sql.functions import col as c

In [12]:
def bucket_upper_bound(minute_col="minute"):
    """Return a Column that maps a minute-of-hour to its 10-minute bucket label.

    The label is the upper bound of the window containing the minute:
    0-10 -> 10, 11-20 -> 20, 21-30 -> 30, 31-40 -> 40, 41-50 -> 50, 51-60 -> 60.

    NOTE(review): minute 0 shares bucket 10 with minutes 1-10, so the first
    window covers 11 distinct minutes while the last one (51-59) covers 9.
    The bounds are kept unchanged here; confirm whether equal-width windows
    were intended.
    """
    m = c(minute_col)
    bucket = when(m <= 10, 10)
    for upper in (20, 30, 40, 50, 60):
        bucket = bucket.when((m > upper - 10) & (m <= upper), upper)
    return bucket


def add_time_buckets(df):
    """Return `df` with year/month/day/hour/minute taken from `datetime`, plus `bucket`."""
    return (
        df.withColumn("year", year("datetime"))
          .withColumn("month", month("datetime"))
          .withColumn("day", dayofmonth("datetime"))
          .withColumn("hour", hour("datetime"))
          .withColumn("minute", minute("datetime"))
          .withColumn("bucket", bucket_upper_bound("minute"))
    )


# Kept as a module-level name, as in the original cell.
w_t = bucket_upper_bound()

eth_ssm = add_time_buckets(eth_ss)
btc_ssm = add_time_buckets(btc_ss)
both_ssm = add_time_buckets(both_ss)

In [13]:
# Columns that identify one 10-minute bucket.
bucket_keys = ["year", "month", "day", "hour", "bucket"]

# One frame per sentiment score: the per-bucket average over Ethereum tweets
# whose score is non-zero (the SQL filter also discards null scores).
cv, pv, nv, pt, st = (
    eth_ssm.filter(metric + " != 0")
           .groupBy(*bucket_keys)
           .agg(avg(metric).alias("avg_twitter_eth_" + metric))
    for metric in (
        "compound_vader",
        "pos_vader",
        "neg_vader",
        "polarity_textblob",
        "subjectivity_textblob",
    )
)

# Full outer joins throughout, so a bucket is kept when at least one score has
# a value. The first join used the default inner join, unlike the Bitcoin and
# combined aggregations, and dropped buckets missing either the compound or
# the positive score.
eth_agg = (
    cv.join(pv, on=bucket_keys, how="full")
      .join(nv, on=bucket_keys, how="full")
      .join(pt, on=bucket_keys, how="full")
      .join(st, on=bucket_keys, how="full")
)



In [14]:
# Sanity check: number of (year, month, day, hour, bucket) rows for Ethereum tweets.
eth_agg.count()

114667

In [15]:
# Per-bucket averages of each sentiment score over Bitcoin tweets, skipping
# rows whose score is zero, then merged into one wide frame.
bucket_keys = ["year", "month", "day", "hour", "bucket"]
twitter_btc_metrics = (
    "compound_vader",
    "pos_vader",
    "neg_vader",
    "polarity_textblob",
    "subjectivity_textblob",
)

cv, pv, nv, pt, st = (
    btc_ssm.filter(metric + " != 0")
           .groupBy(*bucket_keys)
           .agg(avg(metric).alias("avg_twitter_btc_" + metric))
    for metric in twitter_btc_metrics
)

# Full outer joins keep a bucket as long as any one score is available.
btc_agg = (
    cv.join(pv, on=bucket_keys, how="full")
      .join(nv, on=bucket_keys, how="full")
      .join(pt, on=bucket_keys, how="full")
      .join(st, on=bucket_keys, how="full")
)



In [16]:
# Sanity check: number of (year, month, day, hour, bucket) rows for Bitcoin tweets.
btc_agg.count()

112079

In [17]:
# Per-bucket averages of each sentiment score over the combined tweet frame
# (Ethereum + Bitcoin), skipping rows whose score is zero.
bucket_keys = ["year", "month", "day", "hour", "bucket"]
twitter_metrics = (
    "compound_vader",
    "pos_vader",
    "neg_vader",
    "polarity_textblob",
    "subjectivity_textblob",
)

cv, pv, nv, pt, st = (
    both_ssm.filter(metric + " != 0")
            .groupBy(*bucket_keys)
            .agg(avg(metric).alias("avg_twitter_" + metric))
    for metric in twitter_metrics
)

# Full outer joins keep a bucket as long as any one score is available.
both_agg = (
    cv.join(pv, on=bucket_keys, how="full")
      .join(nv, on=bucket_keys, how="full")
      .join(pt, on=bucket_keys, how="full")
      .join(st, on=bucket_keys, how="full")
)



In [18]:
# Combine the Ethereum, Bitcoin and joint Twitter aggregates into one wide
# frame keyed by time bucket. Both joins are full outer joins: the first one
# used the default inner join, which discarded the values of every bucket
# that had tweets for only one of the two coins (the later full join restored
# the keys, but left those eth/btc columns null).
everything_t = (
    eth_agg.join(btc_agg, on=["year", "month", "day", "hour", "bucket"], how="full")
           .join(both_agg, on=["year", "month", "day", "hour", "bucket"], how="full")
)
                    

In [19]:
# Sanity check: number of time buckets in the combined Twitter frame.
everything_t.count()

124738

In [20]:
# Persist the Twitter aggregates; "overwrite" replaces any previous output at this path.
everything_t.write.mode("overwrite").parquet("../../data/temp/sentiment/twitter")

In [21]:
#everything_t.limit(5).toPandas()

# Reddit Sentiments

In [22]:
from pyspark.sql.functions import when, col
from pyspark.sql.functions import col as c

# Map each subreddit to the coin it discusses; any other subreddit yields null.
subreddit = col("subreddit")
when_statment = (
    when(subreddit.isin("ethtrader", "ethereum"), "Ethereum")
    .when(subreddit.isin("Bitcoin", "btc"), "Bitcoin")
)

In [23]:
# Reddit posts with sentiment scores, tagged with the coin of their subreddit
# and with the creation-time column renamed to match the Twitter frames.
# NOTE(review): assumes created_utc is already a timestamp/date type, since
# year()/month()/... are applied to it later - confirm against the parquet schema.
parquet_reddit_path = "../../data/reddit-crypto/parquet/complete_sentiment/"
redd = (
    spark.read.parquet(parquet_reddit_path)
    .withColumn("Cryptocurrency", when_statment)
    .withColumnRenamed("created_utc", "datetime")
)

In [24]:
# Columns carried forward from the Reddit frame.
reddit_columns = [
    "subreddit",
    "pos_vader",
    "neg_vader",
    "neu_vader",
    "compound_vader",
    "polarity_textblob",
    "subjectivity_textblob",
    "datetime",
]

# Filter on Cryptocurrency BEFORE projecting it away. The original selected
# first and filtered afterwards on a column that was no longer in the frame,
# then called drop() on that absent column (a no-op). Same rows and columns,
# without relying on Spark resolving the missing reference.
eth_r_ss = redd.filter("Cryptocurrency = 'Ethereum'").select(*reddit_columns)
btc_r_ss = redd.filter("Cryptocurrency = 'Bitcoin'").select(*reddit_columns)

# Every row of the Reddit frame, with no coin filter.
# NOTE(review): this includes rows from any subreddit not mapped to a coin,
# and (unlike the Twitter `both_ss`) no null rows are dropped - confirm.
both_r_ss = redd.select(*reddit_columns)







In [25]:
# Bucket label = upper bound of the 10-minute window containing the minute:
# 0-10 -> 10, 11-20 -> 20, 21-30 -> 30, 31-40 -> 40, 41-50 -> 50, 51-60 -> 60.
# NOTE(review): minute 0 shares bucket 10 with minutes 1-10, so the first
# window covers 11 distinct minutes and the last (51-59) only 9. Bounds are
# kept unchanged here; confirm whether equal-width windows were intended.
minute_of_hour = c("minute")
w_t = when(minute_of_hour <= 10, 10)
for upper in (20, 30, 40, 50, 60):
    w_t = w_t.when((minute_of_hour > upper - 10) & (minute_of_hour <= upper), upper)

# Add the calendar parts of `datetime` and the bucket label to each Reddit
# frame. One chain is applied to all three frames instead of three pasted copies.
eth_r_ssm, btc_r_ssm, both_r_ssm = (
    frame.withColumn("year", year("datetime"))
         .withColumn("month", month("datetime"))
         .withColumn("day", dayofmonth("datetime"))
         .withColumn("hour", hour("datetime"))
         .withColumn("minute", minute("datetime"))
         .withColumn("bucket", w_t)
    for frame in (eth_r_ss, btc_r_ss, both_r_ss)
)

In [26]:
# Per-bucket averages of each sentiment score over Ethereum-subreddit posts,
# skipping rows whose score is zero, then merged into one wide frame.
bucket_keys = ["year", "month", "day", "hour", "bucket"]
reddit_eth_metrics = (
    "compound_vader",
    "pos_vader",
    "neg_vader",
    "polarity_textblob",
    "subjectivity_textblob",
)

cv, pv, nv, pt, st = (
    eth_r_ssm.filter(metric + " != 0")
             .groupBy(*bucket_keys)
             .agg(avg(metric).alias("avg_reddit_eth_" + metric))
    for metric in reddit_eth_metrics
)

# Full outer joins keep a bucket as long as any one score is available.
eth_r_agg = (
    cv.join(pv, on=bucket_keys, how="full")
      .join(nv, on=bucket_keys, how="full")
      .join(pt, on=bucket_keys, how="full")
      .join(st, on=bucket_keys, how="full")
)



In [27]:
# Per-bucket averages of each sentiment score over Bitcoin-subreddit posts,
# skipping rows whose score is zero, then merged into one wide frame.
bucket_keys = ["year", "month", "day", "hour", "bucket"]
reddit_btc_metrics = (
    "compound_vader",
    "pos_vader",
    "neg_vader",
    "polarity_textblob",
    "subjectivity_textblob",
)

cv, pv, nv, pt, st = (
    btc_r_ssm.filter(metric + " != 0")
             .groupBy(*bucket_keys)
             .agg(avg(metric).alias("avg_reddit_btc_" + metric))
    for metric in reddit_btc_metrics
)

# Full outer joins keep a bucket as long as any one score is available.
btc_r_agg = (
    cv.join(pv, on=bucket_keys, how="full")
      .join(nv, on=bucket_keys, how="full")
      .join(pt, on=bucket_keys, how="full")
      .join(st, on=bucket_keys, how="full")
)



In [28]:
# Columns that identify one 10-minute bucket.
bucket_keys = ["year", "month", "day", "hour", "bucket"]

# One frame per sentiment score: the per-bucket average over all Reddit posts
# whose score is non-zero (the SQL filter also discards null scores).
cv, pv, nv, pt, st = (
    both_r_ssm.filter(metric + " != 0")
              .groupBy(*bucket_keys)
              .agg(avg(metric).alias("avg_reddit_" + metric))
    for metric in (
        "compound_vader",
        "pos_vader",
        "neg_vader",
        "polarity_textblob",
        "subjectivity_textblob",
    )
)

# Full outer joins throughout, so a bucket is kept when at least one score has
# a value. The first join used the default inner join, unlike the per-coin
# Reddit aggregations, and dropped buckets missing either the compound or the
# positive score.
both_r_agg = (
    cv.join(pv, on=bucket_keys, how="full")
      .join(nv, on=bucket_keys, how="full")
      .join(pt, on=bucket_keys, how="full")
      .join(st, on=bucket_keys, how="full")
)



In [29]:
# Combine the Ethereum, Bitcoin and joint Reddit aggregates into one wide
# frame keyed by time bucket. Both joins are full outer joins: the first one
# used the default inner join, which discarded the values of every bucket
# that had posts for only one of the two coins (the later full join restored
# the keys, but left those eth/btc columns null).
everything_r = (
    eth_r_agg.join(btc_r_agg, on=["year", "month", "day", "hour", "bucket"], how="full")
             .join(both_r_agg, on=["year", "month", "day", "hour", "bucket"], how="full")
)
                    

In [30]:
# Sanity check: number of Reddit time buckets after 2015.
everything_r.filter("year > 2015").count()

131292

In [31]:
# NOTE(review): bare literal left over from exploration; its origin is not
# shown in this notebook - candidate for deletion.
227319

227319

In [32]:
# Scratch arithmetic: 240466 intervals of 5 minutes expressed in years.
# NOTE(review): the buckets in this notebook are 10 minutes wide and the
# origin of 240466 is not shown - verify or delete.
240466*5/60/24/365

2.2875380517503805

In [33]:
# Persist the Reddit aggregates; "overwrite" replaces any previous output at this path.
everything_r.write.mode("overwrite").parquet("../../data/temp/sentiment/reddit")

In [34]:
#everything_r.limit(5).toPandas()

In [35]:
# Inspect the column names and types of the combined Reddit frame.
everything_r.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- bucket: integer (nullable = true)
 |-- avg_reddit_eth_compound_vader: double (nullable = true)
 |-- avg_reddit_eth_pos_vader: double (nullable = true)
 |-- avg_reddit_eth_neg_vader: double (nullable = true)
 |-- avg_reddit_eth_polarity_textblob: double (nullable = true)
 |-- avg_reddit_eth_subjectivity_textblob: double (nullable = true)
 |-- avg_reddit_btc_compound_vader: double (nullable = true)
 |-- avg_reddit_btc_pos_vader: double (nullable = true)
 |-- avg_reddit_btc_neg_vader: double (nullable = true)
 |-- avg_reddit_btc_polarity_textblob: double (nullable = true)
 |-- avg_reddit_btc_subjectivity_textblob: double (nullable = true)
 |-- avg_reddit_compound_vader: double (nullable = true)
 |-- avg_reddit_pos_vader: double (nullable = true)
 |-- avg_reddit_neg_vader: double (nullable = true)
 |-- avg_reddit_polarity_textblob: do

# Join all social media information

In [36]:
# Reload the persisted Reddit (r) and Twitter (t) aggregates, keeping only
# buckets from 2016 onwards.
r = (
    spark.read.parquet("../../data/temp/sentiment/reddit")
    .filter("year > 2015")
)
t = (
    spark.read.parquet("../../data/temp/sentiment/twitter")
    .filter("year > 2015")
)

In [37]:
# Show the two most recent Twitter buckets. `bucket` is part of the sort key
# so that the order inside the last hour is deterministic; without it the
# two rows shown for that hour came back in arbitrary order.
(
    t.select("year", "month", "day", "hour", "bucket")
     .sort(desc("year"), desc("month"), desc("day"), desc("hour"), desc("bucket"))
     .show(2)
)

+----+-----+---+----+------+
|year|month|day|hour|bucket|
+----+-----+---+----+------+
|2018|    5| 30|  21|    20|
|2018|    5| 30|  21|    10|
+----+-----+---+----+------+
only showing top 2 rows



In [38]:
# Inspect the column names and types of the reloaded Twitter aggregates.
t.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- bucket: integer (nullable = true)
 |-- avg_twitter_eth_compound_vader: double (nullable = true)
 |-- avg_twitter_eth_pos_vader: double (nullable = true)
 |-- avg_twitter_eth_neg_vader: double (nullable = true)
 |-- avg_twitter_eth_polarity_textblob: double (nullable = true)
 |-- avg_twitter_eth_subjectivity_textblob: double (nullable = true)
 |-- avg_twitter_btc_compound_vader: double (nullable = true)
 |-- avg_twitter_btc_pos_vader: double (nullable = true)
 |-- avg_twitter_btc_neg_vader: double (nullable = true)
 |-- avg_twitter_btc_polarity_textblob: double (nullable = true)
 |-- avg_twitter_btc_subjectivity_textblob: double (nullable = true)
 |-- avg_twitter_compound_vader: double (nullable = true)
 |-- avg_twitter_pos_vader: double (nullable = true)
 |-- avg_twitter_neg_vader: double (nullable = true)
 |-- avg_twitter_polarit

In [39]:
# Inspect the column names and types of the reloaded Reddit aggregates.
r.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- bucket: integer (nullable = true)
 |-- avg_reddit_eth_compound_vader: double (nullable = true)
 |-- avg_reddit_eth_pos_vader: double (nullable = true)
 |-- avg_reddit_eth_neg_vader: double (nullable = true)
 |-- avg_reddit_eth_polarity_textblob: double (nullable = true)
 |-- avg_reddit_eth_subjectivity_textblob: double (nullable = true)
 |-- avg_reddit_btc_compound_vader: double (nullable = true)
 |-- avg_reddit_btc_pos_vader: double (nullable = true)
 |-- avg_reddit_btc_neg_vader: double (nullable = true)
 |-- avg_reddit_btc_polarity_textblob: double (nullable = true)
 |-- avg_reddit_btc_subjectivity_textblob: double (nullable = true)
 |-- avg_reddit_compound_vader: double (nullable = true)
 |-- avg_reddit_pos_vader: double (nullable = true)
 |-- avg_reddit_neg_vader: double (nullable = true)
 |-- avg_reddit_polarity_textblob: do

In [40]:
# Full outer join of Reddit and Twitter on the time-bucket keys: a bucket
# present in only one source is kept, with nulls in the other source's columns.
everything = r.join(t,on=["year","month","day","hour","bucket"],how="full")

In [41]:
# Persist the joined frame; "overwrite" replaces any previous output at this path.
everything.write.mode("overwrite").parquet("../../data/temp/sentiment/everything")

In [42]:
# Read the persisted frame back and print its schema. Only the schema is
# printed, so the sort() does not influence what is shown.
spark.read.parquet("../../data/temp/sentiment/everything").sort("year","month","day","hour","bucket").printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- bucket: integer (nullable = true)
 |-- avg_reddit_eth_compound_vader: double (nullable = true)
 |-- avg_reddit_eth_pos_vader: double (nullable = true)
 |-- avg_reddit_eth_neg_vader: double (nullable = true)
 |-- avg_reddit_eth_polarity_textblob: double (nullable = true)
 |-- avg_reddit_eth_subjectivity_textblob: double (nullable = true)
 |-- avg_reddit_btc_compound_vader: double (nullable = true)
 |-- avg_reddit_btc_pos_vader: double (nullable = true)
 |-- avg_reddit_btc_neg_vader: double (nullable = true)
 |-- avg_reddit_btc_polarity_textblob: double (nullable = true)
 |-- avg_reddit_btc_subjectivity_textblob: double (nullable = true)
 |-- avg_reddit_compound_vader: double (nullable = true)
 |-- avg_reddit_pos_vader: double (nullable = true)
 |-- avg_reddit_neg_vader: double (nullable = true)
 |-- avg_reddit_polarity_textblob: do

# Shape the data into model features (regular 10-minute grid plus 2-day and 4-day lagged copies)

In [5]:
# Pull the joined sentiment frame into pandas in chronological order; the
# bucket label is renamed to "minutes" for the datetime assembly that follows.
merged_data = (
    spark.read.parquet("../../data/temp/sentiment/everything")
    .sort("year", "month", "day", "hour", "bucket")
    .toPandas()
    .rename(columns={"bucket": "minutes"})
)

In [6]:
import pandas as pd

In [31]:
# Assemble one timestamp per row from the component columns. "minutes" holds
# the bucket label (10..60).
# NOTE(review): a label of 60 is expected to roll over to :00 of the next
# hour when pandas assembles the datetime - confirm on a sample row.
merged_data["datetime"] = pd.to_datetime(merged_data[["year","month","day","hour","minutes"]])

In [32]:
# Index the frame by timestamp, drop the component columns that built it, and
# keep everything up to and including 2018-05-30 22:00.
t_formated = (
    merged_data
    .drop(columns=["year", "month", "hour", "minutes", "day"])
    .set_index("datetime")
    .loc[:'2018-5-30 22:00:00']
)

In [33]:
# Regular 10-minute (600 s) grid from 2016-01-01 00:00 to 2018-05-30 22:00.
idx = pd.date_range(start='2016-1-01', end="2018-5-30 22:00:00", freq='600s')

In [34]:
# Make sure the index is a DatetimeIndex before aligning it to the grid.
t_formated.index = pd.DatetimeIndex(t_formated.index)

# Align to the regular grid `idx`.
# NOTE(review): grid slots that are absent from the data become rows of 0 in
# every column (fill_value=0) rather than NaN, so a later interpolate() will
# not touch them; only NaNs already present in existing rows get interpolated.
# Confirm that 0 is the intended value for "no data in this slot".
t_formated = t_formated.reindex(idx, fill_value=0)

In [35]:
# Fill NaNs by interpolating between neighbouring values, weighted by the
# time distance between index stamps.
# NOTE(review): NaNs before the first valid value of a column presumably stay
# NaN with the default settings - confirm.
t_formated = t_formated.interpolate(method='time')

In [36]:
# Every aggregate column is named avg_<source>_<coin prefix><score>, where the
# coin prefix is empty for the columns that combine both coins.
_sources = ("reddit", "twitter")
_coin_prefixes = ("eth_", "btc_", "")
_scores = (
    "compound_vader",
    "pos_vader",
    "neg_vader",
    "polarity_textblob",
    "subjectivity_textblob",
)
_aggregate_columns = [
    "avg_{}_{}{}".format(source, coin, score)
    for source in _sources
    for coin in _coin_prefixes
    for score in _scores
]

# Original column name -> name of its 4-day-lagged copy.
rename_mapper_4days = {
    name: name.replace("avg_", "avg_4day_", 1) for name in _aggregate_columns
}

# Original column name -> name of its 2-day-lagged copy.
rename_mapper_2days = {
    name: name.replace("avg_", "avg_2day_", 1) for name in _aggregate_columns
}

In [37]:
# The frame sits on a 10-minute grid (freq='600s'), so one day is 144 rows.
# The original comments said "5 min interval", which did not match the
# divisor of 10 used in the computation.
rows_per_day = 24 * 60 // 10
days4 = 4 * rows_per_day  # 576 rows = 4 days
days2 = 2 * rows_per_day  # 288 rows = 2 days

# Lagged copies of every column. Despite the "avg_4day_" / "avg_2day_" names,
# these hold the value observed 4 / 2 days earlier (a shift), not a rolling
# average over that period. The first `daysN` rows are NaN after the shift.
feature_4days = (
    t_formated
    .shift(days4)
    .rename(columns=rename_mapper_4days)
)

feature_2days = (
    t_formated
    .shift(days2)
    .rename(columns=rename_mapper_2days)
)

In [38]:
# Put the current values and both lagged copies side by side, matched on the
# timestamp index. Column order: current, 2-day lag, 4-day lag.
lagged_features = feature_2days.merge(
    feature_4days, left_index=True, right_index=True
)
sentiments = t_formated.merge(
    lagged_features, left_index=True, right_index=True
)

In [39]:
# Start 4 days after the grid start (2016-01-01), i.e. past the rows that the
# 4-day shift left empty.
sentiments = sentiments.loc['2016-1-5 00:00:00':'2018-5-30 22:00:00']

In [42]:
# Persist the final feature table as parquet.
sentiments.to_parquet("../../data/features/sentiment_features")

In [54]:
# List every feature column, one per line.
for i in sentiments.columns:
    print(i)

avg_reddit_eth_compound_vader
avg_reddit_eth_pos_vader
avg_reddit_eth_neg_vader
avg_reddit_eth_polarity_textblob
avg_reddit_eth_subjectivity_textblob
avg_reddit_btc_compound_vader
avg_reddit_btc_pos_vader
avg_reddit_btc_neg_vader
avg_reddit_btc_polarity_textblob
avg_reddit_btc_subjectivity_textblob
avg_reddit_compound_vader
avg_reddit_pos_vader
avg_reddit_neg_vader
avg_reddit_polarity_textblob
avg_reddit_subjectivity_textblob
avg_twitter_eth_compound_vader
avg_twitter_eth_pos_vader
avg_twitter_eth_neg_vader
avg_twitter_eth_polarity_textblob
avg_twitter_eth_subjectivity_textblob
avg_twitter_btc_compound_vader
avg_twitter_btc_pos_vader
avg_twitter_btc_neg_vader
avg_twitter_btc_polarity_textblob
avg_twitter_btc_subjectivity_textblob
avg_twitter_compound_vader
avg_twitter_pos_vader
avg_twitter_neg_vader
avg_twitter_polarity_textblob
avg_twitter_subjectivity_textblob
avg_2day_reddit_eth_compound_vader
avg_2day_reddit_eth_pos_vader
avg_2day_reddit_eth_neg_vader
avg_2day_reddit_eth_polarity_t