Links: 

https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#programming-model

https://github.com/syalanuj/youtube/blob/main/spark_streaming_with_python_in_12_minutes/spark_st_run.ipynb

https://github.com/Krupique/cursos-datascience-conteudo/blob/main/DSA_DS-02-BigData%20Analytics%20com%20Python%20e%20Spark/05%20-%20Introducao-SparkStreaming.ipynb


## Spark commands

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc
from collections import namedtuple

In [None]:
# Imported explicitly: the notebook's import cell does not bring in SparkSession.
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise create it.
spark = SparkSession.builder.appName('flatmap() PySpark').getOrCreate()

# Sample lines of text; two of them contain a link.
sample_data = [
    "This is The English We Speak from BBC Learning English",
    "We’re learning https://t.co/8OS16pmODb about the phrase ‘put the brakes on’ something, ",
    "which means to slow down or stop an activity.",
    "And Neil, https://t.co/RgscZohkFB I think you’re buying too many bikes,",
    "so you need to put the brakes on your spending.",
]

# Distribute the sample lines as an RDD, one element per line.
rdd = spark.sparkContext.parallelize(sample_data)

In [None]:
# Bring the sample lines back to the driver and print them one per line.
collected_lines = rdd.collect()
for text_line in collected_lines:
    print(text_line)

In [None]:
fields = ("text", "count")

# Record type for one (link, occurrences) pair. The type name matches the
# variable it is bound to, so printed records read `Tweet(text=..., count=...)`.
Tweet = namedtuple('Tweet', fields)

# flatMap() transformation: split every line into words, keep the words that
# start with "http" (case-insensitively), count the occurrences of each
# lower-cased link, and wrap every (link, count) pair in a Tweet record.
rdd2 = (
    rdd.flatMap(lambda x: x.split(" "))
    .filter(lambda word: word.lower().startswith("http"))
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda a, b: a + b)
    .map(lambda rec: Tweet(rec[0], rec[1]))
)

for element in rdd2.collect():
    print(element)

In [None]:
# Number of input lines versus number of distinct links found in them.
line_total = rdd.count()
link_total = rdd2.count()
line_total, link_total

## Project

In [1]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc
from collections import namedtuple

# NOTE: run this cell only once per kernel; if it raises, restart the kernel
# before trying again.
# NOTE(review): `sc` is not created anywhere in this notebook - presumably it
# is supplied by the PySpark kernel/shell; confirm.
batch_seconds = 10      # batch interval handed to the StreamingContext
window_seconds = 1000   # length of the window applied to the socket stream

ssc = StreamingContext(sc, batch_seconds)
sqlContext = SQLContext(sc)

# Text lines are read from a local TCP socket.
socket_stream = ssc.socketTextStream("127.0.0.1", 5554)
lines = socket_stream.window(window_seconds)


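# Structured Streaming alternative (not used here): a DataFrame representing the
# stream of input lines from a socket connection. Note that the port in this
# example (9999) differs from the one opened above (5554).
#lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).load()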


fields = ("text", "count")

# Record type for one (link, occurrences) pair.
Tweet = namedtuple('Tweet', fields)


def _publish_top_tweets(rdd):
    """Expose the 100 most frequent links of one batch as the "tweets" view.

    Empty batches are skipped: ``toDF()`` infers the schema from the first
    record and raises ``ValueError`` when the RDD has none, which would
    otherwise fail the streaming job before any data has arrived.
    """
    if rdd.isEmpty():
        return
    (
        rdd.toDF()
        .sort(desc("count"))
        .limit(100)
        # createOrReplaceTempView replaces the deprecated registerTempTable.
        .createOrReplaceTempView("tweets")
    )


# The pipeline is wrapped in parentheses so the chained calls can span
# several lines without backslash continuations.
# The "http" prefix match is case-sensitive; lower-case the word first to
# match links regardless of case.
(
    lines.flatMap(lambda text: text.split(" "))
    .filter(lambda word: word.startswith("http"))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .map(lambda rec: Tweet(rec[0], rec[1]))
    .foreachRDD(_publish_top_tweets)
)

## Now run TweetListener.py

## Spark

In [2]:
# Start the streaming computation defined on `ssc`.
ssc.start()    

In [3]:
import time
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
# Only works for Jupyter Notebooks!
%matplotlib inline 

In [None]:
# Redraw the bar chart of link counts ten times, pausing one second before
# each refresh.
for _ in range(10):
    time.sleep(1)

    # The "tweets" table only exists once a batch has been registered;
    # querying it before that raises, so skip this refresh instead.
    if "tweets" not in sqlContext.tableNames():
        continue

    top_tweets_df = sqlContext.sql('Select * from tweets').toPandas()

    # Replace the previous chart rather than stacking a new one below it.
    display.clear_output(wait=True)
    plt.figure(figsize=(10, 8))
    sns.barplot(x="count", y="text", data=top_tweets_df)
    plt.show()

In [None]:
# Stop the streaming context.
ssc.stop()  