In [1]:
# First let's create our PySpark instance
# import findspark
# findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession shared by every cell below.
# May take a while locally.
spark = SparkSession.builder.appName("sentiment_analysis_using_twitter").getOrCreate()

# Report the parallelism Spark will actually use. The previous expression
# (getExecutorMemoryStatus().keySet().size()) counted block managers, i.e.
# executors/driver, so it printed 1 in local mode regardless of CPU count.
cores = spark.sparkContext.defaultParallelism
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


In [2]:
from pyspark.ml.feature import * #CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from pyspark.sql.functions import * #col, udf,regexp_replace,isnull
from pyspark.sql.types import * #StringType,IntegerType
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# For pipeline development
from pyspark.ml import Pipeline 

# The next cell is only a sanity check that the sentiment model works on a single sentence

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer
# Load the pre-trained (lexicon-based) VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Single example sentence used to sanity-check the model
data_string = "Hello I love this world so much #lovelive #hey"

# Create a tuple representing the row
row = (data_string,)

# Create a PySpark DataFrame with one row
df = spark.createDataFrame([row], ["text"])
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
df.show()

# Preprocess the text column: split into tokens, then drop stop words.
# NOTE: the sentiment step further down is applied to the raw `text` column,
# so `words` / `filtered` are only illustrative here.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
df = tokenizer.transform(df)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df = remover.transform(df)


# Define a UDF to apply the sentiment analysis model to each text
def analyze_sentiment(text):
    """Return the VADER compound polarity score for ``text``.

    Parameters
    ----------
    text : str or None
        Raw text to score. A null value (as Spark passes for a NULL cell)
        is propagated as ``None`` instead of crashing the UDF.

    Returns
    -------
    float or None
        The ``'compound'`` entry of ``analyzer.polarity_scores(text)`` as a
        plain Python float, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None
    score = analyzer.polarity_scores(text)
    # Force a built-in float so Spark can serialise it as a numeric value.
    return float(score['compound'])

# analyze_sentiment returns a floating-point score, so declare the UDF as
# DoubleType. Declaring StringType made `sentiment` a string column and left
# downstream numeric use (comparisons, averages) relying on implicit casts.
sentiment_udf = udf(analyze_sentiment, DoubleType())

# Apply the sentiment analysis model to the text column
df = df.withColumn("sentiment", sentiment_udf(df["text"]))

# Output the sentiment analysis results
df.select("text", "sentiment").show()


+--------------------+
|                text|
+--------------------+
|Hello I love this...|
+--------------------+

None
+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|Hello I love this...|   0.6369|
+--------------------+---------+



In [4]:
# Open a streaming DataFrame fed by the tweet producer's TCP socket
tweet_df = (
    spark.readStream
    .format("socket")
    .option("host", "127.0.0.1")
    .option("port", 1234)
    .load()
)

# Explicitly cast the `value` column to a string
tweet_df_string = tweet_df.selectExpr("CAST(value AS STRING)")

In [5]:
def get_label(value):
    """Map a sentiment score to a boolean label.

    Parameters
    ----------
    value : float, int, str or None
        Sentiment score. Numeric strings (e.g. ``"0.6369"``) are accepted
        because the score column may reach this function as a string.

    Returns
    -------
    bool or None
        ``True`` when the score is strictly positive, ``False`` otherwise,
        and ``None`` when ``value`` is ``None`` (a null score has no label).

    Raises
    ------
    ValueError
        If ``value`` is a string that cannot be parsed as a number.
    """
    if value is None:
        return None
    # float() makes the comparison valid for numeric strings too; comparing
    # a str with 0 directly raises TypeError in Python 3.
    return float(value) > 0
pred_udf = udf(get_label, BooleanType())

In [6]:
# Score each incoming line, derive its boolean label, drop blank lines,
# then summarise per label: row count and mean sentiment.
tweets_tab = (
    tweet_df_string
    .withColumn('sentiment', sentiment_udf(col('value')))
    .withColumn('prediction', pred_udf(col('sentiment')))
    .filter(col('value') != '')
    .select(['sentiment', 'prediction'])
    .groupby('prediction')
    .agg({'prediction' :'count', 'sentiment' :'mean'})
)

In [7]:
# Start the streaming query: every second, write the complete aggregated
# result to the in-memory sink registered under the query name "tweetquery".
writeTweet = (
    tweets_tab.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("tweetquery")
    .trigger(processingTime='1 seconds')
    .start()
)

print("----- streaming is running -------")

----- streaming is running -------


In [12]:
# Each run of this cell queries the `tweetquery` table again, so it shows
# fresh data while the streaming query keeps running.
snapshot_sql = "select * from tweetquery"
spark.sql(snapshot_sql).toPandas()

Unnamed: 0,prediction,avg(sentiment),count(prediction)
0,True,0.565233,3
1,False,-0.106525,4


In [13]:
# Stop the streaming query held in `writeTweet`
writeTweet.stop()

In [None]:
# Inspect the query's status dict (message, isDataAvailable, isTriggerActive)
writeTweet.status

{'message': 'Terminated with exception: Connection refused',
 'isDataAvailable': False,
 'isTriggerActive': False}