# First Spark Streaming Example
_____

# Twitter Example
Set up the credentials for a twitter app at https://apps.twitter.com/
    
install python-twitter, a python library to connect your Python to the twitter dev account.

Begin by running the TweetRead.py file. Make sure to add your own IP Adress and your credential keys.

In [1]:
import findspark

In [2]:
# your path will likely not have 'matthew' in it. Change it to reflect your path.
findspark.init('/Users/kevinblum/Apache-Spark/spark-3.1.2-bin-hadoop3.2')

In [3]:
# May cause deprecation warnings, safe to ignore, they aren't errors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml import PipelineModel
from pyspark.ml.feature import CountVectorizer

In [4]:
spark = spark = SparkSession \
    .builder \
    .appName("Twitter Streaming App") \
    .getOrCreate()

tweet_df = spark \
    .readStream \
    .format("socket") \
    .option("host", "127.0.0.1") \
    .option("port", 5556) \
    .load()

tweet_df_string = tweet_df.selectExpr("CAST(value AS STRING)")

In [5]:
tweet_df_string = tweet_df_string.withColumnRenamed("value" , "tweet")

In [6]:
writeTweet = tweet_df_string.writeStream.format("csv").\
    option("format" , "append").\
    option("path", "/Users/kevinblum/BigDataProj/SparkTwitterStream/tweets.csv").\
    option("checkpointLocation" , "checkpoints").\
    queryName("tweetquery"). \
    start()


In [7]:
import time
time.sleep(30) 
writeTweet.stop()

In [8]:
import pandas as pd

In [9]:
import os
import glob
import pandas as pd
os.chdir("/Users/kevinblum/BigDataProj/SparkTwitterStream/tweets.csv")

In [10]:
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]


In [11]:
dataframe = pd.DataFrame()
for i in all_filenames:
    if (os.stat(f"/Users/kevinblum/BigDataProj/SparkTwitterStream/tweets.csv/{i}").st_size != 0):
        try:
            d = pd.read_csv(f"/Users/kevinblum/BigDataProj/SparkTwitterStream/tweets.csv/{i}" , engine="python")
            dt = d.T
            dt = dt.reset_index()
            dataframe = dataframe.append(dt)
        except:
            pass

dataframe = dataframe.rename(columns={"index" : "tweet"})
dataframe = dataframe.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
dataframe = dataframe.reset_index(drop=True)


In [12]:
dataframe

Unnamed: 0,tweet
0,lmfao hot damn@QuantumVigilan1 @PMasalsky But ...
1,I wonder if hRT @Breaking911: BREAKING: Presid...
2,The pRT @pilitobar46: Madam @VPRT @elefaantz:...
3,Tu NO has fet res. Com a President no has fPre...
4,"Its Joe Biden, and I'm Vice President, and my ..."
5,Kamala Har@DennyMcCombs6 @bluegirlsrule1 @madd...
6,Washington https://t.co/MlSbsxWpof@63aMarsh @Y...
7,1. Not becoming president
8,Els que hem plantat cara hem estat els ciutadans.
9,No. No. No. No. No. No. No.


In [13]:

for y in range(dataframe.size):
    if dataframe.iat[y,0] == "":
        dataframe = dataframe.drop([y,0])

In [14]:
dataframe


Unnamed: 0,tweet
0,lmfao hot damn@QuantumVigilan1 @PMasalsky But ...
1,I wonder if hRT @Breaking911: BREAKING: Presid...
2,The pRT @pilitobar46: Madam @VPRT @elefaantz:...
3,Tu NO has fet res. Com a President no has fPre...
4,"Its Joe Biden, and I'm Vice President, and my ..."
5,Kamala Har@DennyMcCombs6 @bluegirlsrule1 @madd...
6,Washington https://t.co/MlSbsxWpof@63aMarsh @Y...
7,1. Not becoming president
8,Els que hem plantat cara hem estat els ciutadans.
9,No. No. No. No. No. No. No.


In [15]:
dataframe.reset_index(drop=True)

Unnamed: 0,tweet
0,lmfao hot damn@QuantumVigilan1 @PMasalsky But ...
1,I wonder if hRT @Breaking911: BREAKING: Presid...
2,The pRT @pilitobar46: Madam @VPRT @elefaantz:...
3,Tu NO has fet res. Com a President no has fPre...
4,"Its Joe Biden, and I'm Vice President, and my ..."
5,Kamala Har@DennyMcCombs6 @bluegirlsrule1 @madd...
6,Washington https://t.co/MlSbsxWpof@63aMarsh @Y...
7,1. Not becoming president
8,Els que hem plantat cara hem estat els ciutadans.
9,No. No. No. No. No. No. No.


In [16]:
sparkDF = spark.createDataFrame(dataframe)

In [17]:
sparkDF.show()

+--------------------+
|               tweet|
+--------------------+
|lmfao hot damn@Qu...|
|I wonder if hRT @...|
|The pRT @pilitoba...|
|Tu NO has fet res...|
|Its Joe Biden, an...|
|Kamala Har@DennyM...|
|Washington https:...|
|1. Not becoming p...|
|Els que hem plant...|
|No. No. No. No. N...|
|#AnditoTayoParaSa...|
|Really shows you ...|
|The speaker,  nor...|
|Kamala Harris Aid...|
|Harris aide Symon...|
|Go Fuck Yourself!...|
|Raise your hand i...|
|#ABSCBNChristmasS...|
|Had no opinion on...|
|President Donald ...|
+--------------------+
only showing top 20 rows



In [18]:
import nltk

In [19]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [20]:
from nltk.corpus.reader.wordnet import *
from pyspark.sql.functions import udf
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.tokenize import word_tokenize
from pyspark.sql.types import StringType,DoubleType,IntegerType
import pyspark.sql.functions as F
wn = nltk.WordNetLemmatizer()
worddict = set(nltk.corpus.words.words())



def preprocessing(text):
    wordset_n = set(wn.lemmatize(w, NOUN) for w in word_tokenize(text.lower().strip()))
    wordset_v = set(wn.lemmatize(w, VERB) for w in wordset_n)
    wordset = set(wn.lemmatize(w, ADJ) for w in wordset_v)
    wordset = wordset & worddict
    return ' '.join(list(wordset))


brand_udf=udf(preprocessing,StringType())
sparkDF=sparkDF.withColumn('text',brand_udf(sparkDF['tweet']))
sparkDF=sparkDF.withColumn('clean_len',F.length('text'))

In [21]:
from pyspark.ml.feature import  Tokenizer

In [22]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")

In [23]:
sparkDF=tokenizer.transform(sparkDF)

In [24]:
from pyspark.ml.feature import CountVectorizer
count = CountVectorizer (inputCol="words", outputCol="rawFeatures")

In [25]:
model1=count.fit(sparkDF)

In [26]:
sparkDF=model1.transform(sparkDF)

In [27]:
from pyspark.ml.feature import  IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [28]:
idfModel = idf.fit(sparkDF)
sparkDF = idfModel.transform(sparkDF)

In [33]:
from pyspark.ml.classification import RandomForestClassifier
rf_classifier=RandomForestClassifier(labelCol='polarity', featuresCol="features")

In [34]:
sparkDF.show()

+--------------------+--------------------+---------+--------------------+--------------------+--------------------+
|               tweet|                text|clean_len|               words|         rawFeatures|            features|
+--------------------+--------------------+---------+--------------------+--------------------+--------------------+
|lmfao hot damn@Qu...|that for but damn...|      171|[that, for, but, ...|(331,[0,1,2,3,4,6...|(331,[0,1,2,3,4,6...|
|I wonder if hRT @...|pandemic new i to...|      158|[pandemic, new, i...|(331,[0,1,2,3,4,5...|(331,[0,1,2,3,4,5...|
|The pRT @pilitoba...|skit school only ...|       91|[skit, school, on...|(331,[0,1,2,3,4,8...|(331,[0,1,2,3,4,8...|
|Tu NO has fet res...|we he to consiste...|      238|[we, he, to, cons...|(331,[0,1,2,3,4,5...|(331,[0,1,2,3,4,5...|
|Its Joe Biden, an...|name kamala i my ...|       45|[name, kamala, i,...|(331,[0,3,4,8,9,1...|(331,[0,3,4,8,9,1...|
|Kamala Har@DennyM...|matter fact yes k...|       22|[matter, fa

In [None]:
# Can only run this once. restart your kernel for any errors.
sc = SparkContext()

In [None]:
ssc = StreamingContext(sc, 10 )
sqlContext = SQLContext(sc)

In [None]:
socket_stream = ssc.socketTextStream("127.0.0.1", 5556)

In [None]:
lines = socket_stream.window( 20 )

In [None]:
def process(rdd):
    #print("========= %s =========" % str(time))
    try:
        # Get the singleton instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())

        # Convert RDD[String] to RDD[Row] to DataFrame
        rowRdd = rdd.map(lambda w: Row(word=w))
        wordsDataFrame = spark.createDataFrame(rowRdd)

        # Creates a temporary view using the DataFrame
        wordsDataFrame.createOrReplaceTempView("words")

        # Do word count on table using SQL and print it
        wordCountsDataFrame = spark.sql("select word, count(*) as total from words group by word")
        return wordCountsDataFrame
       # wordCountsDataFrame.show()
    except:
        pass

lines.foreachRDD(process)

In [None]:
def process(rdd):
    rdd.toDF()
    
lines.foreachRDD(process)

In [None]:
print(lines)

In [None]:
from collections import namedtuple
fields = ["tweet", "count"]
Tweet = namedtuple( 'Tweet', fields )

In [None]:
# Use Parenthesis for multiple lines or use \.
( lines.flatMap( lambda text: text.split( " " ) ) #Splits to a list
     .filter( lambda word: word.lower().startswith("#") ) # Checks for hashtag calls
     .map( lambda word: ( word.lower(), 1 ) ) # Lower cases the word
     .reduceByKey( lambda a, b: a + b ) # Reduces
     .map( lambda rec: Tweet( rec[0], rec[1] ) ) # Stores in a Tweet Object
     .foreachRDD( lambda rdd: rdd.toDF().sort( desc("count") ) # Sorts Them in a DF
     .limit(10).registerTempTable("tweets") ) ) # Registers to a table. 

__________
### Run the TweetRead.py file at this point
__________

In [None]:
import time
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas
# Only works for Jupyter Notebooks!
%matplotlib inline 

In [None]:
ssc.start()

In [None]:
print(lines)

In [None]:
count = 0
while count < 10:
    
    time.sleep( 3 )
    top_10_tweets = sqlContext.sql( 'Select tag, count from tweets' )
    top_10_df = top_10_tweets.toPandas()
    display.clear_output(wait=True)
    plt.figure( figsize = ( 10, 8 ) )
    sns.barplot( x="count", y="tag", data=top_10_df)
    plt.show()
    count = count + 1

In [None]:
ssc.stop()

In [None]:
wordCountsDataFrame