# How to load JSON-typed data from a socket?
There is pyspark.sql.streaming.DataStreamReader.json
<a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.streaming.DataStreamReader.json.html#pyspark.sql.streaming.DataStreamReader.json">(reference)</a>

However, it can only read from files stored in a directory, not from a socket — at least as far as I have found so far.

In [1]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Load Streaming Data")
    .getOrCreate()
)

## 1. Load static data from a sample file

In [4]:
from pyspark.sql.types import StructType

In [15]:
# Batch-read (non-streaming) the sample JSON file into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
static = spark.read.json("../data/sample.json")

In [17]:
# Print the rows of the static DataFrame as a text table.
static.show()

+----+------+
|Name|  Type|
+----+------+
| JIN|Person|
+----+------+



In [None]:
# NOTE(review): `sta` is an incomplete expression — no such name is defined in
# this notebook, so running this cell raises NameError. Presumably the start
# of a `static...` call; confirm the intent or remove the cell.
sta

## 2. Load data using a socket

In [None]:
# Streaming DataFrame backed by the socket source listening on localhost:9999.
socketStream = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

## 3. Transform the data from the socket into JSON form

# =======================================

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used for the tweet-processing cells.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Handle to the SparkContext that belongs to this session.
sc = spark.sparkContext

# Data format to load

{'data': {<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'id': '1442320619366850564',<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'text': 'RT @AyanamiMatoi: Un sexo se puede?\nShelly Starbucks. https://t.co/cCOe0pbw3j'<br>},<br>
&nbsp;&nbsp;'matching_rules': [{'id': '1442320624253161474', 'tag': ''}]<br>
    }<br>

# Deciding which columns I need and how to extract them
(1) The data has a nested structure.<br>
(2) I only need the text column.<br>

In [None]:
from pyspark.sql import functions as F

In [None]:
# Location of the collected tweets, read below with Spark's JSON reader.
# NOTE(review): this is an absolute path on one developer's machine — it will
# not resolve elsewhere; consider a relative or configurable path.
path = "/Users/seongjin/programming/tweet_to_dashboard/data/tweet.txt"

# Load the tweets into a DataFrame and print the schema Spark inferred.
originDF = spark.read.json(path)
originDF.printSchema()

# Extracting only the id and text columns

In [None]:
# Pull the nested `data.id` and `data.text` fields out of each record.
idTextDF = originDF.select('data.id', 'data.text')

In [None]:
# Print the schema of the selected columns.
idTextDF.printSchema()

In [None]:
# Preview the first three rows.
idTextDF.show(3)

# Filter to English tweets before estimating emotion

1. Define a function
2. Make a UDF from the function in (1)

In [None]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException


def detect_language(text):
    """Return the language code langdetect guesses for ``text``.

    Args:
        text: The tweet text to classify. May be ``None`` or blank when the
            source row has no text.

    Returns:
        The language code reported by ``langdetect.detect`` (for example
        ``"en"``), or ``None`` when no language can be determined.
    """
    # A null or blank cell cannot be classified. Returning None keeps the row
    # (with a null language) instead of failing the whole Spark job.
    if text is None or not text.strip():
        return None
    try:
        return detect(text)
    except LangDetectException:
        # langdetect raises this when the text has no usable features,
        # e.g. a tweet made up only of URLs, digits or emoji.
        return None


# Wrap the function directly as a Spark UDF; no return type is given, so
# F.udf's default applies.
detect_language_udf = F.udf(detect_language)

In [None]:
# Add a `lang` column holding the detected language of each tweet's text.
# Columns are referenced as `F.col` because this notebook imports
# `pyspark.sql.functions as F`; a bare `col` is not imported anywhere.
idTextLangDF = idTextDF.select(
    F.col("id"),
    F.col("text"),
    detect_language_udf(F.col("text")).alias("lang"),
)

In [None]:
# Keep only the rows whose detected language is English, then preview three.
# `F.col` is used because a bare `col` is not imported in this notebook.
enDF = idTextLangDF.filter(F.col("lang") == "en")
enDF.show(3)

# Estimating emotion

In [None]:
from textblob import TextBlob

# Sentiment labels returned by get_sentiment. The variable names keep their
# original spelling so existing references to them continue to work.
positive = 2
netural = 1
negotive = 0


def get_sentiment(text):
    """Classify ``text`` as positive, neutral or negative.

    Args:
        text: The tweet text to score. May be ``None`` when the source row
            has no text.

    Returns:
        ``positive`` (2) when TextBlob's polarity is above zero,
        ``negotive`` (0) when it is below zero, ``netural`` (1) when it is
        exactly zero, or ``None`` when ``text`` is ``None``.
    """
    # TextBlob rejects non-string input; pass nulls through instead of
    # failing the Spark job on a single empty row.
    if text is None:
        return None
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return positive
    if polarity < 0:
        return negotive
    return netural


# NOTE(review): no return type is passed, so F.udf's default (string) applies
# and the labels are expected to surface as strings; pass an integer type here
# if numeric labels are needed downstream.
get_sentiment_utf = F.udf(get_sentiment)

# Score every tweet. `F.col` is used because a bare `col` is not imported in
# this notebook.
sentimentDF = idTextDF.select(
    F.col("id"),
    F.col("text"),
    get_sentiment_utf(F.col("text")).alias("sentiment_level"),
)

In [None]:
# Score sentiment for the English-only tweets. This rebinds `sentimentDF`.
# `F.col` is used because a bare `col` is not imported in this notebook.
sentimentDF = enDF.select(
    F.col("text"),
    get_sentiment_utf(F.col("text")).alias("sentiment"),
)

In [None]:
# Print the scored rows as a text table.
sentimentDF.show()