In [None]:
import re
#import pyspark.sql.functions as F
from pyspark.sql import Row #Converte RDDs em objetos do tipo Row
from pyspark.sql.functions import col, isnan, when, count # Encontra a contagem para valores None, Null, Nan, etc.
from pyspark.sql.types import IntegerType, FloatType

from nltk.corpus import stopwords

In [None]:
def removerStopwords(texto):
    """Remove Portuguese, English and Spanish stop words from ``texto``.

    The text is split on single spaces; every token that exactly matches
    (case-sensitively) an entry of the NLTK stop-word lists is dropped and
    the remaining tokens are re-joined with single spaces. Empty tokens
    produced by consecutive spaces are kept, so spacing is otherwise preserved.

    Args:
        texto (str): Text to filter.

    Returns:
        str: ``texto`` without the stop-word tokens.
    """
    # A set gives constant-time membership tests; with a concatenated list
    # every token would trigger a linear scan over all three languages.
    stop_words = set(stopwords.words('portuguese'))
    stop_words.update(stopwords.words('english'))
    stop_words.update(stopwords.words('spanish'))

    return ' '.join(palavra for palavra in texto.split(' ') if palavra not in stop_words)

In [None]:
def limparTexto(texto):
    """Normalise a raw text line (e.g. a tweet) before tokenising it.

    Steps, in order: strip emoji; drop every character outside the
    U+0000-U+05C0 and U+2100-U+214F ranges; lowercase; remove bracketed
    and angle-bracketed spans, URLs, @mentions/#hashtags and any word that
    contains a digit; turn tabs/newlines into spaces; drop stray ``%``,
    ``@`` and ``#``; collapse runs of spaces.

    Args:
        texto (str): Text to clean.

    Returns:
        str: The cleaned, lowercased text. Leading/trailing spaces are
        not stripped.
    """
    # Remove emojis
    emoji_pattern = re.compile("["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "]+", flags=re.UNICODE)
    texto = emoji_pattern.sub('', texto)

    # Remove characters from other scripts (Chinese, Arabic, Korean, ...)
    nonenglish_pattern = re.compile('[^\u0000-\u05C0\u2100-\u214F]+', flags=re.UNICODE)
    texto = nonenglish_pattern.sub('', texto)

    # Lowercase, then remove bracketed text, links, mentions/hashtags and
    # words containing digits. Raw strings keep the regex escapes valid
    # (non-raw '\[' / '\S' / '\w' / '\d' are invalid string escapes).
    texto = texto.lower()
    texto = re.sub(r'\[.*?\]', '', texto)                # text between square brackets
    texto = re.sub(r'<.*?>+', '', texto)                 # text between angle brackets
    texto = re.sub(r'https?://\S+|www\.\S+', '', texto)  # links
    texto = re.sub(r'[@#]\S+', '', texto)                # @mentions and #hashtags
    texto = re.sub(r'\w*\d\w*', '', texto)               # any word containing a digit

    # Tabs and line breaks separate words, so they become a space instead of
    # being deleted (deleting them would glue the neighbouring words together).
    texto = re.sub(r'[\t\n\r]+', ' ', texto)
    texto = re.sub(r'[%@#]+', '', texto)  # leftover special characters
    texto = re.sub(r'[ ]+', ' ', texto)   # collapse repeated spaces

    return texto

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Build a StreamingContext on top of the existing SparkContext `sc`,
# cutting the stream into batches every 1 second.
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# DStream fed by a TCP text socket on 127.0.0.1:5554 (one record per line).
lines = ssc.socketTextStream(hostname="127.0.0.1", port=5554)

In [None]:
# Optional pre-cleaning step, currently disabled:
#words_trans = lines.map(lambda x: limparTexto(x)).map(lambda x: removerStopwords(x))

# Break every incoming line into tokens on single spaces
words = lines.flatMap(lambda linha: linha.split(" "))

# Emit (token, 1) pairs and add them up per token within each batch
pairs = words.map(lambda token: (token, 1))
wordCounts = pairs.reduceByKey(lambda total, parcial: total + parcial)

# Show the first ten elements of each batch RDD on the console
wordCounts.pprint()

In [None]:
ssc.start()  # Start the computation
try:
    # Blocks this cell until the stream terminates or the kernel is interrupted.
    ssc.awaitTermination()
finally:
    # Always release the streaming context: only one StreamingContext can be
    # active per JVM. Keep the SparkContext `sc` alive so that later cells
    # can still use it.
    ssc.stop(stopSparkContext=False)

---

---

---

---

---

Links: 

https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#programming-model

https://github.com/syalanuj/youtube/blob/main/spark_streaming_with_python_in_12_minutes/spark_st_run.ipynb

https://github.com/Krupique/cursos-datascience-conteudo/blob/main/DSA_DS-02-BigData%20Analytics%20com%20Python%20e%20Spark/05%20-%20Introducao-SparkStreaming.ipynb


In [None]:
# Sample sentences/tweets (accented text, CJK characters, emoji, mentions,
# hashtags and URLs) used below to try the text helpers on a static RDD.
# Note: a trailing backslash inside a string literal continues it on the next
# line, so that line's leading spaces become part of the string.
lista = [
    'É necessário muita ação',
    'I plan nothing on my weekend ！#cute #可爱 #萌 #cat https://t.co/JNbftLiVOM',
         '@PAVGOD: Whenever someone asks if Pavlov is a cuddle dog, \
         I try to explain that this is his reaction 😂 https://t.co/KGrI4pHEXV',
         'Cat feed cat https://t.co/1Avv8qhT4J 🗣️"Tenim el deure de complir amb el mandat del 52 %  \
         davançar cap a la independència'
        ]

# Distribute the sample list as an RDD.
# NOTE(review): `sc` is not created in this notebook; presumably it is
# provided by the PySpark kernel/shell - confirm.
rdd = sc.parallelize(lista)

# Peek at up to five elements (the list holds four).
rdd.take(5)

In [None]:
# Tokenise each sample sentence on single spaces, flattening into one RDD of words
rdd1 = rdd.flatMap(lambda frase: frase.split(' '))

# Inspect the first 20 tokens
rdd1.take(20)

In [None]:
# Clean every sentence, then drop its stop words
# (the helpers are passed directly rather than through wrapper lambdas).
rdd3 = rdd.map(limparTexto).map(removerStopwords)

# Inspect up to 20 cleaned sentences
rdd3.take(20)

In [None]:
# Show the source RDD again; the map/flatMap calls above produced new RDDs
# (rdd1, rdd3) rather than reassigning `rdd`.
rdd.take(5)

## Project

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc
from collections import namedtuple

# Can only run this once: only one StreamingContext can be active per JVM.
# Restart your kernel (or stop the previous context) if this cell errors.

ssc = StreamingContext(sc, 10)  # 10-second batch interval
sqlContext = SQLContext(sc)
socket_stream = ssc.socketTextStream("127.0.0.1", 5554)

# Sliding window over the last 1000 seconds of received lines.
lines = socket_stream.window(1000)

# One field per element of the (word, count) records built below. The
# trailing "count" field is required: Tweet(...) is called with two values
# and the resulting DataFrame is sorted on a column named "count".
fields = ("text", "count")
Tweet = namedtuple('Tweet', fields)


def _register_top_words(rdd):
    """Publish the 100 most frequent words of ``rdd`` as the temp view "tweets".

    Empty batches are skipped because ``toDF()`` cannot infer a schema from
    an RDD with no rows.
    """
    if rdd.isEmpty():
        return
    (rdd.toDF()
        .sort(desc("count"))
        .limit(100)
        .createOrReplaceTempView("tweets"))  # replaces the deprecated registerTempTable


# Use parentheses for multi-line chains (or end each line with \).
(lines.flatMap(lambda text: text.split(" "))
      #.filter( lambda word: word.lower().startswith("http") )
      .map(lambda word: (word.lower(), 1))
      .reduceByKey(lambda a, b: a + b)
      .map(lambda rec: Tweet(rec[0], rec[1]))
      .foreachRDD(_register_top_words))  # Registers each batch result as a table.

## Now run TweetListener.py

## Spark

In [None]:
# Start the streaming computation defined above. There is no awaitTermination()
# here, so the cell returns and the following cells can query the results.
ssc.start()    

In [None]:
import time
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
# Only works for Jupyter Notebooks!
%matplotlib inline 

In [None]:
# Debugging aid: print the generated documentation of the Tweet namedtuple
# (its fields and methods).
help(Tweet)

In [None]:
# Debugging aid: display the class-level accessor for the `text` field.
Tweet.text

In [None]:
# Read the "tweets" table registered by the streaming job's foreachRDD step.
# NOTE(review): presumably this fails until at least one batch has been
# processed and the table exists - run it after the stream has received data.
tweets = sqlContext.sql( 'Select * from tweets' )
# Convert the Spark DataFrame to a pandas DataFrame.
df = tweets.toPandas()

# Show the first 15 rows.
df.head(15)

In [None]:
# Stop the streaming context. Called without arguments, stop() also stops the
# underlying SparkContext, so `sc` cannot be reused afterwards.
ssc.stop()  