### Create SparkSession

In [1]:
from pyspark.sql import SparkSession

# Build the notebook's SparkSession, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Predicting TweetWorld Emotion")
    .getOrCreate()
)

### UDF (Convert a single-quoted JSON string to a double-quoted JSON string)

In [2]:
import ast 
import json 
from pyspark.sql.functions import udf

def convert_json_double(json_single):
    """Convert a Python-literal string (single quotes) into a JSON string.

    The input is parsed with ``ast.literal_eval`` and re-serialised with
    ``json.dumps``, which produces double-quoted, standards-compliant JSON.

    Args:
        json_single: Text holding a Python literal, e.g. ``"{'a': 'b'}"``.
            May be ``None``.

    Returns:
        The JSON-encoded string, or ``None`` when the input is ``None``,
        blank, not a valid Python literal, or not JSON-serialisable.
        Returning ``None`` instead of raising keeps one bad record from
        aborting a whole job when this function runs as a Spark UDF.
    """
    # Null values and blank lines carry no record to convert.
    if json_single is None or not json_single.strip():
        return None
    try:
        json_dict = ast.literal_eval(json_single)
        return json.dumps(json_dict)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval rejects malformed input with these exception types;
        # json.dumps raises TypeError for literals such as sets or bytes.
        return None
    
# Spark UDF that applies convert_json_double to each value of a column.
convert_json_double_udf = udf(
    lambda raw_value: convert_json_double(raw_value)
)

### Get the schema from sample tweet.

In [3]:
import pyspark.sql.functions as F

# Read only the first line of the sample file; it is expected to hold one
# tweet serialised as a single-quoted Python-literal string.
with open("data/tweet.txt", 'r') as f:
    originSingleQuotes = f.readline()

# Fail early with a clear message instead of an obscure parser error.
if not originSingleQuotes.strip():
    raise ValueError("data/tweet.txt is empty; a sample tweet is needed to infer the schema")

originDoubleQuotes = convert_json_double(originSingleQuotes)
# Guard against a converter that signals failure by returning None.
if originDoubleQuotes is None:
    raise ValueError("data/tweet.txt does not start with a convertible tweet record")

# Let Spark infer the schema from the single sample record.
sc = spark.sparkContext
originRDD = sc.parallelize([originDoubleQuotes])
originDF = spark.read.json(originRDD)

# Top-level field names of the sample tweet.
columns = originDF.columns

### Send a connection request to Server Socket.

In [4]:
# Streaming DataFrame fed by the "socket" source at localhost:9999.
socketDF = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

### Transform the data and extract the fields to analyze

In [5]:
from pyspark.sql.functions import json_tuple

# Rewrite each incoming "value" as double-quoted JSON.
jsonDF = socketDF.select(
    convert_json_double_udf("value").alias("value")
)

# Extract one column per name in `columns` from the JSON string, then
# rename the extracted columns to those same names.
multiColDF = (
    jsonDF
    .select(json_tuple("value", *columns))
    .toDF(*columns)
)

# Keep only the two fields used by the rest of the notebook.
df = multiColDF.select("created_at", "text")

### Keep only English tweets before estimating emotion

In [6]:
from langdetect import detect

def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Args:
        text: The text to classify. May be ``None``.

    Returns:
        The value returned by ``langdetect.detect``, or ``None`` when
        ``text`` is ``None`` or langdetect cannot detect a language
        (it raises ``LangDetectException`` in that case). Returning
        ``None`` keeps one undetectable record from failing the whole
        job when this function runs as a Spark UDF.
    """
    # Local import keeps this function self-contained wherever it runs.
    from langdetect.lang_detect_exception import LangDetectException

    if text is None:
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


# Spark UDF that applies detect_language to each value of a column.
detect_language_udf = F.udf(lambda x: detect_language(x))

In [7]:
# Add a "lang" column holding the detected language of each tweet's text.
df = df.select(
    "created_at",
    "text",
    detect_language_udf("text").alias("lang"),
)

In [8]:
from pyspark.sql.functions import col
# Keep only the rows whose detected language is "en".
df = df.where(col("lang") == "en")

# Estimating emotion

In [9]:
from textblob import TextBlob
# Sentiment labels returned by get_sentiment. The identifiers keep their
# original spelling because other cells may refer to them by these names.
positive = 2
netural = 1
negotive = 0


def get_sentiment(text):
    """Map the TextBlob polarity of ``text`` to a sentiment label.

    Args:
        text: The text to score. May be ``None``.

    Returns:
        ``positive`` when the polarity is above zero, ``negotive`` when it
        is below zero, ``netural`` when it is exactly zero, and ``None``
        when ``text`` is ``None`` (so a null value does not raise when the
        function runs as a Spark UDF).
    """
    if text is None:
        return None

    sentiment = TextBlob(text).sentiment.polarity
    if sentiment > 0:
        return positive
    if sentiment < 0:
        return negotive
    return netural


# Spark UDF that applies get_sentiment to each value of a column.
# NOTE(review): no return type is passed, so Spark's default UDF return type
# applies — confirm that is the intended type for "sentiment_level".
get_sentiment_utf = F.udf(lambda x: get_sentiment(x))

In [10]:
# Attach the sentiment label of each tweet's text as "sentiment_level".
sentimentDF = df.select(
    "created_at",
    "text",
    get_sentiment_utf(col("text")).alias("sentiment_level"),
)

# run and debug

In [11]:
# Start the streaming query: append-mode output into the "memory" sink,
# registered under the query name "sentimentDF".
launch = (
    sentimentDF.writeStream
    .outputMode("append")
    .queryName("sentimentDF")
    .format("memory")
    .start()
)

In [16]:
# Debug view: print up to 20000 rows of the "sentimentDF" table.
(
    spark
    .sql("select * from sentimentDF")
    .show(20000)
)

+--------------------+--------------------+---------------+
|          created_at|                text|sentiment_level|
+--------------------+--------------------+---------------+
|Tue Oct 26 07:30:...|RT @Makgeo_lee: @...|              0|
|Tue Oct 26 07:30:...|RT @emilyIina: ya...|              0|
|Tue Oct 26 07:30:...|RT @ElegantLogic:...|              2|
+--------------------+--------------------+---------------+



In [17]:
# Stop every still-active streaming query before shutting the session down,
# so no query is left running against a stopped session.
for active_query in spark.streams.active:
    active_query.stop()

spark.stop()