In [3]:
import re
from typing import Optional

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, FloatType
from pyspark.sql import functions as F
from pyspark.sql.functions import *

In [5]:
import os
import sys

# Polarity cut-off used by getSentiment: scores below -TRESHOLD are labelled
# Negative, scores above +TRESHOLD are labelled Positive.
TRESHOLD = 0.3

# Set both PySpark interpreter variables to the interpreter running this notebook.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

ModuleNotFoundError: No module named 'textblob'

In [6]:
from textblob import TextBlob

ModuleNotFoundError: No module named 'textblob'

In [11]:
from platform import python_version

# Report the version of the Python interpreter running this kernel.
print(python_version())

3.9.15


In [None]:
# Create a function to get the polarity
def getPolarity(tweet: Optional[str]) -> Optional[float]:
    """Return the TextBlob sentiment polarity of ``tweet``.

    Args:
        tweet: Text to score, or ``None`` (e.g. a null column value when this
            function is applied as a Spark UDF).

    Returns:
        ``TextBlob(tweet).sentiment.polarity``, or ``None`` when ``tweet`` is
        ``None``.
    """
    # TextBlob rejects non-string input, so pass nulls through instead of
    # letting a single null row raise inside the UDF.
    if tweet is None:
        return None
    return TextBlob(tweet).sentiment.polarity

# Create a function to get sentimental category
def getSentiment(polarityValue: Optional[float]) -> Optional[str]:
    """Map a polarity score to a sentiment label using ``TRESHOLD``.

    Args:
        polarityValue: Polarity score, or ``None`` for a missing score.

    Returns:
        ``'Negative'`` when the score is below ``-TRESHOLD``, ``'Positive'``
        when it is above ``TRESHOLD``, ``"Neutral"`` otherwise (both bounds
        inclusive), or ``None`` when ``polarityValue`` is ``None``.
    """
    # Comparing None with a float raises TypeError; propagate the null instead.
    if polarityValue is None:
        return None
    if polarityValue < -TRESHOLD:
        return 'Negative'
    if polarityValue > TRESHOLD:
        return 'Positive'
    return "Neutral"

In [None]:
# Clean the tweet
# Words dropped from the cleaned text; a frozenset is built once and gives
# constant-time membership tests.
_STOPWORDS = frozenset(
    ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"]
)


def clean_tweet(tweet: Optional[str]) -> Optional[str]:
    """Normalise raw tweet text before sentiment scoring.

    Steps, in order: lowercase; delete apostrophes so contractions stay a
    single token ("don't" -> "dont"); remove @mentions, #hashtags, URLs and
    [bracketed] spans; replace every remaining character outside a-z / 0-9
    with a space; drop stopwords; join the surviving tokens with one space.

    Args:
        tweet: Raw tweet text, or ``None``.

    Returns:
        The cleaned text (possibly ``""``), or ``None`` when ``tweet`` is
        ``None``.
    """
    # A null text value would otherwise fail on ``.lower()``.
    if tweet is None:
        return None
    r = tweet.lower().replace("'", "")
    r = re.sub(r"@[A-Za-z0-9_]+", "", r)
    r = re.sub(r"#[A-Za-z0-9_]+", "", r)
    r = re.sub(r"http\S+", "", r)
    r = re.sub(r"\[.*?\]", " ", r)
    # This also turns ( ) ! ? into spaces, so no separate punctuation pass is needed.
    r = re.sub(r"[^a-z0-9]", " ", r)
    return " ".join(w for w in r.split() if w not in _STOPWORDS)


In [None]:
# Get (or create) the SparkSession for this notebook, using the local[*] master.
spark = (
    SparkSession.builder
    .appName("TwitterSentimentAnalysis")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Restrict Spark logging to the ERROR level.
spark.sparkContext.setLogLevel('ERROR')

In [None]:
# Open a streaming DataFrame over the socket source at 127.0.0.1:3333.
df = (
    spark.readStream
    .format("socket")
    .option("host", "127.0.0.1")
    .option("port", 3333)
    .load()
)

In [None]:
# Inspect the schema of the raw socket stream.
df.printSchema()

In [None]:
# JSON layout expected for each incoming tweet; every field is read as a string.
tweet_schema = (
    StructType()
    .add("ID", "string")
    .add("text", "string")
    .add("created_at", "string")
)

In [None]:
# Parse the `value` column as JSON into a single struct column named "tweet".
values = df.select(
    F.from_json(F.col("value").cast("string"), tweet_schema).alias("tweet")
)

In [None]:
# Inspect the schema after JSON parsing (a single "tweet" struct column).
values.printSchema()

In [None]:
# Flatten the parsed struct so each tweet field becomes a top-level column.
df1 = values.select("tweet.*")

# Wrap the plain-Python helpers as Spark UDFs.
clean_tweets = F.udf(clean_tweet, returnType=StringType())
polarity = F.udf(getPolarity, returnType=FloatType())
sentiment = F.udf(getSentiment, returnType=StringType())

# Clean the tweet text.
raw_tweets = df1.withColumn("processed_text", clean_tweets(F.col("text")))

# Score the cleaned text, then classify every tweet by the sentiment threshold.
polarity_tweets = raw_tweets.withColumn(
    "polarity", polarity(F.col("processed_text"))
)
sentiment_tweets = polarity_tweets.withColumn(
    "sentiment", sentiment(F.col("polarity"))
)

In [None]:
# Inspect the final schema, including the derived processed_text, polarity
# and sentiment columns.
sentiment_tweets.printSchema()

In [None]:
# Count tweets per sentiment label within 60-second windows of created_at.
# NOTE(review): tweet_schema declares created_at as a string while window()
# operates on timestamps — confirm the incoming format parses as one.
window_tweets = (
    sentiment_tweets
    .groupBy(
        F.window(F.col("created_at"), "60 seconds"),
        F.col("sentiment"),
    )
    .agg(F.count("*").alias("numEvents"))
)

In [None]:
    # Stream the windowed counts to the console sink in "complete" output mode
    # under the query name "tweetquery".
    # NOTE(review): this cell is indented one level; it appears to rely on
    # IPython stripping a cell's leading indent — dedent if exported to a script.
    writeTweet = (
        window_tweets.writeStream
        .outputMode("complete")
        .format("console")
        .queryName("tweetquery")
        .start()
    )

    try:
        # Blocks this cell until the query terminates or the kernel is interrupted.
        writeTweet.awaitTermination()
    finally:
        # Always stop the query: otherwise an interrupted cell leaves
        # "tweetquery" active, and re-running the cell fails because a query
        # with that name is still running.
        writeTweet.stop()

In [None]:
# Stop the SparkSession.
spark.stop()