# Create SparkSession

In [171]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession (an existing one is reused if present)
# under the application name below.
spark = (
    SparkSession.builder
    .appName("Predicting TweetWorld Emotion")
    .getOrCreate()
)

# UDF (transform a JSON string from single quotes to double quotes)

In [172]:
import ast 
import json 
from pyspark.sql.functions import udf

def convert_json_double(json_single):
    """Re-serialise a Python-literal string as JSON text.

    The input is evaluated with ``ast.literal_eval`` (literals only) and the
    resulting object is encoded with ``json.dumps``, which writes strings
    with double quotes.

    Args:
        json_single: Text holding a Python literal, for example a dict
            written with single-quoted keys and values.

    Returns:
        The JSON encoding of that literal as a ``str``.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            the text is not a valid literal.
    """
    return json.dumps(ast.literal_eval(json_single))
    
# Column-level wrapper around convert_json_double for use in DataFrame
# expressions; no explicit return type is passed to `udf`.
convert_json_double_udf = udf(
    lambda raw_value: convert_json_double(raw_value)
)

# Get the column names from a sample tweet.

In [173]:
import pyspark.sql.functions as F

# Only the first line of the sample file is needed: one tweet is enough to
# learn the field names.
with open("data/tweet.txt", "r") as f:
    originSingleQuotes = f.readline()
originDoubleQuotes = convert_json_double(originSingleQuotes)

# Load that single JSON record through Spark's JSON reader so the resulting
# DataFrame exposes the tweet's top-level field names.
sc = spark.sparkContext
originRDD = sc.parallelize([originDoubleQuotes])
originDF = spark.read.json(originRDD)

# Field names reused later to split streamed tweets into columns.
columns = originDF.columns

# Send a connection request to the server socket.

In [174]:
# Streaming source: the "socket" format pointed at localhost:9999.
socketDF = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Transform the data and extract the fields to analyze

In [175]:
from pyspark.sql.functions import json_tuple

# Step 1: rewrite each streamed record as double-quoted JSON text.
jsonDF = socketDF.select(
    convert_json_double_udf("value").alias("value")
)

# Step 2: pull out one column per field name learned from the sample tweet,
# then rename the generated columns to those field names.
multiColDF = (
    jsonDF
    .select(json_tuple("value", *columns))
    .toDF(*columns)
)

In [176]:
# Keep only the timestamp and the tweet text for the rest of the pipeline.
df = multiColDF.select(
    "created_at",
    "text",
)

# Keep only English tweets before estimating emotion

In [177]:
from langdetect import detect

def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    The function is safe to call on every streamed row: inputs the detector
    cannot handle produce ``None`` instead of an exception, so a single
    unusable tweet does not fail the whole query. A ``None`` result never
    equals ``"en"``, so such rows are dropped by an English-only filter.

    Args:
        text: Tweet text, or ``None`` when the field is missing.

    Returns:
        The detected language code (e.g. ``"en"``), or ``None`` when
        ``text`` is ``None``, empty, whitespace-only, or contains nothing
        langdetect can base a decision on.
    """
    # Nothing to analyse: skip the detector entirely.
    if text is None or not text.strip():
        return None

    # Imported locally so this block only depends on the langdetect package
    # the notebook already uses.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text has no usable features for detection.
        return None

# Column-level wrapper around detect_language; no explicit return type is
# passed to `F.udf`.
detect_language_udf = F.udf(
    lambda tweet_text: detect_language(tweet_text)
)

In [178]:
# Add the detected language of each tweet as a "lang" column.
df = df.select(
    "created_at",
    "text",
    detect_language_udf("text").alias("lang"),
)

In [179]:
from pyspark.sql.functions import col
# Retain only the rows whose detected language is English.
df = df.filter(
    col("lang") == "en"
)

# Estimating emotion

In [180]:
from textblob import TextBlob
# Integer labels returned by get_sentiment. The names are kept exactly as
# spelled so any existing reference to them keeps working.
positive = 2
netural = 1
negotive = 0


def get_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity.

    Args:
        text: Tweet text, or ``None`` when the field is missing.

    Returns:
        ``positive`` (2) when the polarity is above zero, ``negotive`` (0)
        when it is below zero, ``netural`` (1) when it is exactly zero, and
        ``None`` when ``text`` is ``None``.
    """
    # Spark does not promise that upstream filters run before a UDF, so a
    # null can reach this function; report "unknown" rather than raising.
    if text is None:
        return None

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return positive
    if polarity < 0:
        return negotive
    return netural

# Column-level wrapper around get_sentiment; no explicit return type is
# passed to `F.udf`.
get_sentiment_utf = F.udf(
    lambda tweet_text: get_sentiment(tweet_text)
)

In [181]:
# Attach the sentiment label of each tweet as "sentiment_level".
sentimentDF = df.select(
    "created_at",
    "text",
    get_sentiment_utf(col("text")).alias("sentiment_level"),
)

# Aggregate sentiment_level by time

In [182]:
from pyspark.sql.functions import to_date

In [183]:
# Three-letter English month abbreviation -> month number. Defined once at
# module level so it is not rebuilt for every row.
_MONTH_TO_NUMBER = {
    'Jan': 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    'Jul': 7,
    'Aug': 8,
    'Sep': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12,
}


def text_datetime(txt):
    """Convert a tweet ``created_at`` string to ``MM-dd-yyyy`` text.

    Example: ``'Thu Oct 21 07:02:44 +0000 2021'`` -> ``'10-21-2021'``.

    Month and day are always written with two digits, so the result matches
    a fixed-width ``MM-dd-yyyy`` date pattern for every month of the year,
    not only October through December.

    Args:
        txt: Timestamp whose whitespace-separated fields are, in order:
            weekday, month abbreviation, day, time, UTC offset, year.
            ``None`` is passed through unchanged.

    Returns:
        The date formatted as ``'MM-dd-yyyy'``, or ``None`` when ``txt`` is
        ``None``.

    Raises:
        IndexError: If ``txt`` has fewer than six fields.
        KeyError: If the month abbreviation is not a known key.
    """
    if txt is None:
        return None

    # split() with no argument tolerates repeated spaces between fields.
    fields = txt.split()
    month = _MONTH_TO_NUMBER[fields[1]]
    day = fields[2]
    year = fields[5]
    return f'{month:02d}-{day.zfill(2)}-{year}'

# Column-level wrapper around text_datetime; no explicit return type is
# passed to `udf`.
text_datetime_udf = udf(
    lambda created_at_text: text_datetime(created_at_text)
)

In [187]:
# Replace the raw timestamp with its date-only text form; every other
# column is dropped here.
new_df = sentimentDF.select(
    text_datetime_udf(col("created_at")).alias("created_at")
)

In [188]:
# Echo the DataFrame so the notebook prints its column names and types.
new_df

DataFrame[created_at: string]

In [189]:
# Keep the text date and add a parsed date column next to it. The parsed
# column is left unaliased, so it keeps its generated name.
new_df = new_df.select(
    "created_at",
    to_date(col("created_at"), 'MM-dd-yyyy'),
)

# run and debug

In [190]:
# Start the streaming query: rows are appended to a memory sink registered
# under the query name "sentimentDF".
launch = (
    new_df.writeStream
    .outputMode("append")
    .queryName("sentimentDF")
    .format("memory")
    .start()
)

In [194]:
# Read back what the stream has written so far under the query name and
# print up to 20000 rows.
spark.sql(
    "select * from sentimentDF"
).show(20000)

+----------+-------------------------------+
|created_at|to_date(created_at, MM-dd-yyyy)|
+----------+-------------------------------+
|10-27-2021|                     2021-10-27|
|10-27-2021|                     2021-10-27|
|10-27-2021|                     2021-10-27|
+----------+-------------------------------+



In [196]:
# Shut down the SparkSession held in `spark`.
spark.stop()