# First Spark Streaming Example
_____

# Twitter Example
Set up the credentials for a twitter app at https://apps.twitter.com/
    
install python-twitter, a python library to connect your Python to the twitter dev account.

Begin by running the TweetRead.py file. Make sure to add your own IP Adress and your credential keys.

In [1]:
import findspark

In [2]:

findspark.init('/Users/kevinblum/Apache-Spark/spark-3.1.2-bin-hadoop3.2')

In [3]:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml import PipelineModel
from pyspark.ml.feature import CountVectorizer

In [4]:
spark = spark = SparkSession \
    .builder \
    .appName("Twitter Streaming App") \
    .getOrCreate()

tweet_df = spark \
    .readStream \
    .format("socket") \
    .option("host", "127.0.0.1") \
    .option("port", 5556) \
    .load()

tweet_df_string = tweet_df.selectExpr("CAST(value AS STRING)")

In [5]:
tweet_df_string = tweet_df_string.withColumnRenamed("value" , "tweet")

In [6]:
writeTweet = tweet_df_string.writeStream.format("csv").\
    option("format" , "append").\
    option("path", "/Users/kevinblum/BigDataProj/SparkTwitterStream/tweets").\
    option("checkpointLocation" , "checkpoints").\
    queryName("tweetquery"). \
    start()


In [7]:
import time
time.sleep(60) 
writeTweet.stop()

In [8]:
import pandas as pd

In [9]:
import os
import glob
import pandas as pd
os.chdir("/Users/kevinblum/BigDataProj/SparkTwitterStream/tweets")

In [10]:
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]


In [11]:
dataframe = pd.DataFrame()
for i in all_filenames:
    if (os.stat(f"/Users/kevinblum/BigDataProj/SparkTwitterStream/tweets/{i}").st_size != 0):
        try:
            d = pd.read_csv(f"/Users/kevinblum/BigDataProj/SparkTwitterStream/tweets/{i}" , engine="python")
            dt = d.T
            dt = dt.reset_index()
            dataframe = dataframe.append(dt)
        except:
            pass




In [12]:
dataframe = dataframe.rename(columns={"index" : "Tweet"})
dataframe = dataframe.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
dataframe = dataframe.reset_index(drop=True)

In [13]:

dataframe = dataframe[dataframe['Tweet'] != ""]
dataframe = dataframe[dataframe['Tweet'] != "."]
dataframe = dataframe[dataframe['Tweet'] != None]

dataframe = dataframe.reset_index(drop=True)


In [14]:
range1 = range(100, 100 + dataframe['Tweet'].size  )
list1 = list(range1)


In [15]:
dfg = pd.DataFrame(list1 , columns = ['tweet_id'])

In [16]:
dfg = dfg['tweet_id'].astype('str')

In [17]:
list_polarity =[]
for b in range(dataframe['Tweet'].size):
    list_polarity.append(0)
    

In [18]:
dfp = pd.DataFrame(list_polarity , columns = ['polarity'])

In [19]:
dataframe = dataframe.join(dfg)

In [20]:
dataframe = dataframe.join(dfp)

In [21]:
dataframe

Unnamed: 0,Tweet,tweet_id,polarity
0,#2 in jobs created,100,0
1,#1 in GDP growth,101,0
2,#2@joncoopertweets Prison. Even poor he can st...,102,0
3,IfRT @seungscience: WDYM CHILE'S ELECTED PRESI...,103,0
4,@POTUS first year ranks:,104,0
...,...,...,...
148,That should explainRT @KatiePavlich: A full va...,248,0
149,#1 in GDP growth,249,0
150,"If you hear nothing else I say tonight, hear t...",250,0
151,Real Estate &amp; https://t.co/uWbxV2v34tRT @d...,251,0


In [22]:
sparkDF = spark.createDataFrame(dataframe)

In [23]:
sparkDF.show(5)

+--------------------+--------+--------+
|               Tweet|tweet_id|polarity|
+--------------------+--------+--------+
|  #2 in jobs created|     100|       0|
|    #1 in GDP growth|     101|       0|
|#2@joncoopertweet...|     102|       0|
|IfRT @seungscienc...|     103|       0|
|@POTUS first year...|     104|       0|
+--------------------+--------+--------+
only showing top 5 rows



In [24]:
import nltk

In [25]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [26]:
from nltk.corpus.reader.wordnet import *
from pyspark.sql.functions import udf
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.tokenize import word_tokenize
from pyspark.sql.types import StringType,DoubleType,IntegerType
import pyspark.sql.functions as F
wn = nltk.WordNetLemmatizer()
worddict = set(nltk.corpus.words.words())


def preprocessing(text):
    wordset_n = set(wn.lemmatize(w, NOUN) for w in word_tokenize(text.lower().strip()))
    wordset_v = set(wn.lemmatize(w, VERB) for w in wordset_n)
    wordset = set(wn.lemmatize(w, ADJ) for w in wordset_v)
    wordset = wordset & worddict
    return ' '.join(list(wordset))


brand_udf=udf(preprocessing,StringType())
sparkDF=sparkDF.withColumn('text',brand_udf(sparkDF['Tweet']))


In [27]:
sparkDF.show(2)

+------------------+--------+--------+-------------+
|             Tweet|tweet_id|polarity|         text|
+------------------+--------+--------+-------------+
|#2 in jobs created|     100|       0|create job in|
|  #1 in GDP growth|     101|       0|    in growth|
+------------------+--------+--------+-------------+
only showing top 2 rows



In [28]:
from pyspark.ml import PipelineModel

model_1 = PipelineModel.load("/Users/kevinblum/BigDataProj/SparkTwitterStream/models/lr")


In [29]:
predictions = model_1.transform(sparkDF)

In [30]:
predictions1=predictions.toPandas()

In [34]:
predictions1.head(20)

Unnamed: 0,Tweet,tweet_id,polarity,text,words,rawFeatures,features,rawPrediction,probability,prediction
0,#2 in jobs created,100,0,create job in,"[create, job, in]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7.166744221386362, -6.300236564373614, -2.146...","[0.39802952979647266, 5.640009693514986e-07, 3...",4.0
1,#1 in GDP growth,101,0,in growth,"[in, growth]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7.0945542987474886, -6.298821976386088, -2.10...","[0.37430361626960834, 5.708928615935067e-07, 3...",4.0
2,#2@joncoopertweets Prison. Even poor he can st...,102,0,still he i for to ease leader the of oppositio...,"[still, he, i, for, to, ease, leader, the, of,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[10.516442958866426, -6.343465270928145, -5.55...","[0.9420404503732264, 4.486466629637303e-08, 9....",0.0
3,IfRT @seungscience: WDYM CHILE'S ELECTED PRESI...,103,0,this a our chile president elect be and,"[this, a, our, chile, president, elect, be, and]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5.585169699121602, -6.308799753058103, -0.256...","[0.15389210727097036, 1.0513106055084866e-06, ...",4.0
4,@POTUS first year ranks:,104,0,rank year first,"[rank, year, first]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7.099287161197415, -6.300362026685774, -2.098...","[0.37729276990292876, 5.718534968237224e-07, 3...",4.0
5,#1 in GDP growth,105,0,in growth,"[in, growth]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7.0945542987474886, -6.298821976386088, -2.10...","[0.37430361626960834, 5.708928615935067e-07, 3...",4.0
6,@POTUS first year ranks:,106,0,rank year first,"[rank, year, first]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7.099287161197415, -6.300362026685774, -2.098...","[0.37729276990292876, 5.718534968237224e-07, 3...",4.0
7,Shes really doing it all to give help.,107,0,really all give to do help it,"[really, all, give, to, do, help, it]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7.464546932274314, -6.305050750198819, -2.528...","[0.4479245423751756, 4.6896946584631055e-07, 2...",4.0
8,@POTUS first year ranks:,108,0,rank year first,"[rank, year, first]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7.099287161197415, -6.300362026685774, -2.098...","[0.37729276990292876, 5.718534968237224e-07, 3...",4.0
9,https://t.cRT @MGwin46: Some perspective from ...,109,0,we look year former progress a last back persp...,"[we, look, year, former, progress, a, last, ba...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[6.80454636988744, -6.314005874689925, -3.6294...","[0.06609252363012201, 1.326897832300735e-07, 1...",4.0


In [None]:
spark.stop()

In [None]:
import json
predictions1=predictions1[['tweet_id','Tweet','polarity','prediction']]

In [None]:
from pymongo import MongoClient
cluster = MongoClient("mongodb+srv://kevblum7:mongokev7@cluster0.d5h1w.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
mydb = cluster["TwitterStream"]
mycol = mydb["Tweet"]
records = json.loads(predictions1.T.to_json()).values()
mydb.mycol.insert_many(records)