In [1]:
# data analysis
import findspark
findspark.init()
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col,udf,monotonically_increasing_id,unix_timestamp,round,avg,split,size
from pyspark.sql.types import *
sc = SparkContext("local", "first app")
sql = SQLContext(sc)

# visualization
from IPython import display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import cm

#sentiment analysis
from textblob import TextBlob
import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Load Tweet dataframe

In [3]:
# https://datascience.stackexchange.com/questions/13123/import-csv-file-contents-into-pyspark-dataframes
ps_df = sql.read.format("com.databricks.spark.csv").options(header="true", inferschema='true').load("data/output_2018-11-11-to-2014-12-31-1000-perdate.csv")

In [4]:
ps_df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- permalink: string (nullable = true)



In [5]:
ps_df.show(3)

+-------------------+--------------------+-------------------+--------------------+
|               date|                text|                 id|           permalink|
+-------------------+--------------------+-------------------+--------------------+
|2018-11-12 18:59:51|Greg Maxwell: Bit...|1062132900940992513|https://twitter.c...|
|2018-11-12 18:59:50|Okay, I'll bite. ...|1062132895723257861|https://twitter.c...|
|2018-11-12 18:59:49|@paulvigna Have y...|1062132890207629312|https://twitter.c...|
+-------------------+--------------------+-------------------+--------------------+
only showing top 3 rows



In [8]:
#ps_df = ps_tw_df.withColumnRenamed('date', 'date_time') #setting column names of Twitter dataset
#ps_df = ps_df.withColumnRenamed('tweet', 'tweets') #setting column names of Twitter dataset

In [6]:
def clean_tweet(tweet):
    '''
        Utility function to clean the text in a tweet by removing: stop words, links, special characters using regex.
    Args:
        tweet: DataFrame column 'text'
    Returns:
        
    '''
    tweet = str(tweet)
    tweet = tweet.lower()
    tweet = [word for word in tweet.split() if word not in stop]
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", str(tweet)).split())

func_udf =  udf(clean_tweet, StringType())
ps_df = ps_df.withColumn('cleaned_tweets', func_udf(ps_df['text']))
ps_df.show(3)

+-------------------+--------------------+-------------------+--------------------+--------------------+
|               date|                text|                 id|           permalink|      cleaned_tweets|
+-------------------+--------------------+-------------------+--------------------+--------------------+
|2018-11-12 18:59:51|Greg Maxwell: Bit...|1062132900940992513|https://twitter.c...|greg maxwell bitc...|
|2018-11-12 18:59:50|Okay, I'll bite. ...|1062132895723257861|https://twitter.c...|okay i ll bite bl...|
|2018-11-12 18:59:49|@paulvigna Have y...|1062132890207629312|https://twitter.c...|heard great new c...|
+-------------------+--------------------+-------------------+--------------------+--------------------+
only showing top 3 rows



## Sentiment analysis: Text Blob 

In [7]:
#https://github.com/harishpuvvada/BitCoin-Value-Predictor/blob/master/Data_PreProcessing.ipynb
def analyze_sentiment_tb(tweet):
    '''
        Classify the polarity of a tweet using textblob.
    Args:
        tweet:
    Returns:
    
    '''
    analysis = TextBlob(tweet)
    polarity = analysis.sentiment.polarity
    return polarity
    
func_udf2 = udf(analyze_sentiment_tb, ArrayType(FloatType()))
ps_df = ps_df.withColumn('sentiment_txtblob', func_udf2(ps_df['cleaned_tweets'])[0])
ps_df.show(3)

+-------------------+--------------------+-------------------+--------------------+--------------------+-----------------+
|               date|                text|                 id|           permalink|      cleaned_tweets|sentiment_txtblob|
+-------------------+--------------------+-------------------+--------------------+--------------------+-----------------+
|2018-11-12 18:59:51|Greg Maxwell: Bit...|1062132900940992513|https://twitter.c...|greg maxwell bitc...|             null|
|2018-11-12 18:59:50|Okay, I'll bite. ...|1062132895723257861|https://twitter.c...|okay i ll bite bl...|             null|
|2018-11-12 18:59:49|@paulvigna Have y...|1062132890207629312|https://twitter.c...|heard great new c...|             null|
+-------------------+--------------------+-------------------+--------------------+--------------------+-----------------+
only showing top 3 rows



## Sentiment analysis:  Vader

In [8]:
analyser = SentimentIntensityAnalyzer()

def analyze_sentiment_v(sentence):
    snt = analyser.polarity_scores(sentence)
    return ([snt['neg'], snt['neu'], snt['pos'], snt['compound']])
    #return (snt['compound'])

func_udf2 = udf(analyze_sentiment_v, ArrayType(FloatType()))
ps_df = ps_df.withColumn('sentiment_vader', func_udf2(ps_df['cleaned_tweets'])[0])
ps_df.show(3)

+-------------------+--------------------+-------------------+--------------------+--------------------+-----------------+---------------+
|               date|                text|                 id|           permalink|      cleaned_tweets|sentiment_txtblob|sentiment_vader|
+-------------------+--------------------+-------------------+--------------------+--------------------+-----------------+---------------+
|2018-11-12 18:59:51|Greg Maxwell: Bit...|1062132900940992513|https://twitter.c...|greg maxwell bitc...|             null|          0.162|
|2018-11-12 18:59:50|Okay, I'll bite. ...|1062132895723257861|https://twitter.c...|okay i ll bite bl...|             null|            0.0|
|2018-11-12 18:59:49|@paulvigna Have y...|1062132890207629312|https://twitter.c...|heard great new c...|             null|            0.0|
+-------------------+--------------------+-------------------+--------------------+--------------------+-----------------+---------------+
only showing top 3 rows



#### Word count

In [9]:
ps_df = ps_df.withColumn('wordCount', size(split(col('cleaned_tweets'), ' ')))
ps_df.show(3)

+-------------------+--------------------+-------------------+--------------------+--------------------+-----------------+---------------+---------+
|               date|                text|                 id|           permalink|      cleaned_tweets|sentiment_txtblob|sentiment_vader|wordCount|
+-------------------+--------------------+-------------------+--------------------+--------------------+-----------------+---------------+---------+
|2018-11-12 18:59:51|Greg Maxwell: Bit...|1062132900940992513|https://twitter.c...|greg maxwell bitc...|             null|          0.162|       29|
|2018-11-12 18:59:50|Okay, I'll bite. ...|1062132895723257861|https://twitter.c...|okay i ll bite bl...|             null|            0.0|       31|
|2018-11-12 18:59:49|@paulvigna Have y...|1062132890207629312|https://twitter.c...|heard great new c...|             null|            0.0|       23|
+-------------------+--------------------+-------------------+--------------------+--------------------+--

In [10]:
ps_df = ps_df.selectExpr('date','permalink','cleaned_tweets','sentiment_vader','wordCount')

In [11]:
ps_df.show(3)

+-------------------+--------------------+--------------------+---------------+---------+
|               date|           permalink|      cleaned_tweets|sentiment_vader|wordCount|
+-------------------+--------------------+--------------------+---------------+---------+
|2018-11-12 18:59:51|https://twitter.c...|greg maxwell bitc...|          0.162|       29|
|2018-11-12 18:59:50|https://twitter.c...|okay i ll bite bl...|            0.0|       31|
|2018-11-12 18:59:49|https://twitter.c...|heard great new c...|            0.0|       23|
+-------------------+--------------------+--------------------+---------------+---------+
only showing top 3 rows

