In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import col, count, rand, collect_list, explode, struct, count, lit
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [2]:
# Source frame for the ETH sentiment run; later cells read its `text` column.
ETH_INPUT_PATH = "s3://blunova-databricks-data/eth"
t_eth = spark.read.parquet(ETH_INPUT_PATH)

In [3]:
from pyspark.sql.types import FloatType, StructField, StructType

# Return type of the `sentiment_scores` UDF: four VADER scores followed by two
# TextBlob scores, in the order the UDF returns them.
#
# Every field is declared nullable: the UDF returns None for all six values
# when the input text is null, so a non-nullable declaration would contradict
# the data the UDF actually produces.
schema = StructType([
    StructField(field_name, FloatType(), True)
    for field_name in (
        "pos_vader",
        "neg_vader",
        "neu_vader",
        "compound_vader",
        "polarity_textblob",
        "subjectivity_textblob",
    )
])

from pyspark.sql.functions import udf


def _shared_vader_analyzer():
    """Return a SentimentIntensityAnalyzer that is built once per Python process.

    Constructing the analyzer loads its lexicon, so building a new one for
    every row dominates the cost of the UDF. The instance is stored on the
    imported vaderSentiment module, which stays in ``sys.modules`` for the
    lifetime of the worker process, so later rows reuse it.
    """
    # Imported here so the import is resolved in the process that runs the UDF.
    import vaderSentiment.vaderSentiment as vader_module

    analyzer = getattr(vader_module, "_shared_analyzer", None)
    if analyzer is None:
        analyzer = vader_module.SentimentIntensityAnalyzer()
        vader_module._shared_analyzer = analyzer
    return analyzer


@udf(schema)
def sentiment_scores(sentence):
    """Score one piece of text with VADER and TextBlob.

    Args:
        sentence: The text to score. Non-string values are converted with
            ``str()``; ``None`` is treated as missing text.

    Returns:
        A 6-tuple matching ``schema``:
        ``(pos, neg, neu, compound, polarity, subjectivity)`` where the first
        four values come from VADER's ``polarity_scores`` and the last two
        from TextBlob's ``sentiment``. All six entries are ``None`` when
        ``sentence`` is ``None``.
    """
    if sentence is None:
        return (None, None, None, None, None, None)

    # Imported inside the function so the import is resolved where the UDF runs.
    from textblob import TextBlob

    text = str(sentence)

    blob_sentiment = TextBlob(text).sentiment
    vader_scores = _shared_vader_analyzer().polarity_scores(text)

    return (
        vader_scores.get("pos"),
        vader_scores.get("neg"),
        vader_scores.get("neu"),
        vader_scores.get("compound"),
        blob_sentiment.polarity,
        blob_sentiment.subjectivity,
    )


In [4]:
# Attach the struct produced by `sentiment_scores` to every row of `t_eth`
# as a new `sentiment` column.
#
# Disabled experiments, kept for reference (original note: "30 seconds"):
#   sample = t_eth.filter("datetime < '2016-03-01'")
#   print(sample.count())
#   x.show()
#   x.write.mode('overwrite').parquet("s3://blunova-databricks-data/results/eth_sentiment/")
scored_text = sentiment_scores(t_eth["text"])
x = t_eth.withColumn("sentiment", scored_text)

In [5]:
# Flatten the `sentiment` struct into top-level columns next to the original
# columns and print the resulting schema.
flattened_columns = [
    "username",
    "datetime",
    "text",
    "retweets",
    "favorites",
    "geo",
    "mentions",
    "hashtags",
    "id",
    "permalink",
] + [
    "sentiment." + score_name
    for score_name in (
        "pos_vader",
        "neg_vader",
        "neu_vader",
        "compound_vader",
        "polarity_textblob",
        "subjectivity_textblob",
    )
]

x.select(*flattened_columns).printSchema()

In [6]:
# Score every row of `t_eth`, flatten the `sentiment` struct into plain
# columns, and overwrite the ETH results location with the outcome.
#
# A date-limited run is left disabled for reference:
#   sample = t_eth.filter("datetime < '2016-12-01'")
#   x = sample.withColumn('sentiment',sentiment_scores(sample.text))
x = t_eth.withColumn("sentiment", sentiment_scores(t_eth["text"]))

eth_output_columns = [
    "username",
    "datetime",
    "text",
    "retweets",
    "favorites",
    "geo",
    "mentions",
    "hashtags",
    "id",
    "permalink",
    "sentiment.pos_vader",
    "sentiment.neg_vader",
    "sentiment.neu_vader",
    "sentiment.compound_vader",
    "sentiment.polarity_textblob",
    "sentiment.subjectivity_textblob",
]

eth_flat = x.select(*eth_output_columns)
eth_flat.write.parquet(
    "s3://blunova-databricks-data/results/eth_sentiment/",
    mode="overwrite",
)

In [7]:
from pyspark.sql import functions as F

# Per-day totals of each sentiment score from the stored ETH results,
# collected to the driver as a pandas DataFrame.
#
# The Spark functions are reached through the `F` namespace instead of
# `from pyspark.sql.functions import sum`, which would rebind Python's builtin
# `sum` for every cell that runs afterwards.
#
# NOTE(review): `pd` is the conventional alias for the pandas module; the name
# is kept here only so anything that already reads `pd` keeps working.
pd = (
    spark.read.parquet("s3://blunova-databricks-data/results/eth_sentiment/")
    # Reduce each timestamp to a 'yyyy-MM-dd' string so rows group by calendar day.
    .withColumn("date", F.date_format("datetime", "yyyy-MM-dd"))
    .groupby("date")
    .agg(
        F.sum("pos_vader"),
        F.sum("neg_vader"),
        F.sum("neu_vader"),
        F.sum("compound_vader"),
        F.sum("polarity_textblob"),
        F.sum("subjectivity_textblob"),
    )
    .toPandas()
)

In [8]:
# Print the total number of stored ETH result rows, then the number of those
# rows whose `pos_vader` value is not null.
eth_scored = spark.read.parquet("s3://blunova-databricks-data/results/eth_sentiment/")
has_pos_score = col("pos_vader").isNotNull()

print(eth_scored.count())
print(eth_scored.filter(has_pos_score).count())

In [9]:
# Source frame for the BTC sentiment run; the row count is shown as the cell result.
BTC_INPUT_PATH = "s3://blunova-databricks-data/testing/btc/"
t_btc = spark.read.parquet(BTC_INPUT_PATH)
t_btc.count()

In [10]:
# Score every row of `t_btc`, flatten the `sentiment` struct into plain
# columns, and overwrite the BTC results location with the outcome.
btc_output_columns = [
    "username",
    "datetime",
    "text",
    "retweets",
    "favorites",
    "geo",
    "mentions",
    "hashtags",
    "id",
    "permalink",
    "sentiment.pos_vader",
    "sentiment.neg_vader",
    "sentiment.neu_vader",
    "sentiment.compound_vader",
    "sentiment.polarity_textblob",
    "sentiment.subjectivity_textblob",
]

btc_with_sentiment = t_btc.withColumn("sentiment", sentiment_scores(t_btc["text"]))
x = btc_with_sentiment.select(*btc_output_columns)

x.write.parquet(
    "s3://blunova-databricks-data/results/btc_sentiment/",
    mode="overwrite",
)

In [11]:
# Read the stored BTC results back and display the first rows as a sanity check.
btc_scored = spark.read.parquet("s3://blunova-databricks-data/results/btc_sentiment/")
btc_scored.show()

In [12]:
#>>> testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
#>>> testimonial.sentiment
#Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)
#>>> testimonial.sentiment.polarity

In [13]:
# Third input frame; the row count is shown as the cell result.
# NOTE(review): the columns selected from it further down (`subreddit`,
# `author_flair_text`, ...) suggest Reddit comments -- confirm.
R_INPUT_PATH = "s3://blunova-databricks-data/testing/"
r = spark.read.parquet(R_INPUT_PATH)
r.count()

In [14]:
# Print the column names and types of `r`; the scoring cell that follows
# selects its columns from this frame by name.
r.printSchema()

In [15]:
# Score the `body` column of `r`, flatten the `sentiment` struct into plain
# columns alongside the original columns, and overwrite the `results/r/` location.
x = r.withColumn("sentiment", sentiment_scores(r["body"]))

r_output_columns = [
    "body",
    "score_hidden",
    "archived",
    "name",
    "author",
    "author_flair_text",
    "downs",
    "created_utc",
    "subreddit_id",
    "link_id",
    "parent_id",
    "score",
    "retrieved_on",
    "controversiality",
    "gilded",
    "id",
    "subreddit",
    "ups",
    "distinguished",
    "author_flair_css_class",
    "sentiment.pos_vader",
    "sentiment.neg_vader",
    "sentiment.neu_vader",
    "sentiment.compound_vader",
    "sentiment.polarity_textblob",
    "sentiment.subjectivity_textblob",
]

r_flat = x.select(*r_output_columns)
r_flat.write.parquet(
    "s3://blunova-databricks-data/results/r/",
    mode="overwrite",
)

In [16]:
# Read the stored `results/r/` output back and show its row count as the cell result.
r_scored = spark.read.parquet("s3://blunova-databricks-data/results/r/")
r_scored.count()