In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Columbia EECS E6893 Big Data Analytics
"""
This module is the spark streaming analysis process.


Usage:
    If used with dataproc:
        gcloud dataproc jobs submit pyspark --cluster <Cluster Name> twitterHTTPClient.py

    Create a dataset in BigQuery first using
        bq mk bigdata_sparkStreaming

    Remember to replace the bucket with your own bucket name


Todo:
    1. hashtagCount: calculate accumulated hashtags count
    2. wordCount: calculate word count every 60 seconds
        the word you should track is listed below.
    3. save the result to google BigQuery

"""

from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row,SQLContext
import sys
import requests
import time
import subprocess
import re
from google.cloud import bigquery

# ---------------------------------------------------------------------------
# Google Cloud Storage staging area
# ---------------------------------------------------------------------------
bucket = "bucket-hw3"    # TODO : replace with your own bucket name
output_directory_hashtags = 'gs://{}/hadoop/tmp/bigquery/pyspark_output/hashtagsCount'.format(bucket)
output_directory_wordcount = 'gs://{}/hadoop/tmp/bigquery/pyspark_output/wordcount'.format(bucket)

# ---------------------------------------------------------------------------
# BigQuery destination: dataset, then (table name, column names) per result
# ---------------------------------------------------------------------------
output_dataset = ''                     # the name of your dataset in BigQuery
output_table_hashtags, columns_name_hashtags = 'hashtags', ['hashtags', 'count']
output_table_wordcount, columns_name_wordcount = 'wordcount', ['word', 'count', 'time']

# ---------------------------------------------------------------------------
# Streaming parameters
# ---------------------------------------------------------------------------
# Host and port of the socket the tweet stream is read from.
IP, PORT = 'localhost', 9001

# Number of seconds the streaming process is left running.
STREAMTIME = 600

# The words to filter on when doing the word count.
WORD = ['data', 'spark', 'ai', 'movie', 'good']

In [2]:
def aggregate_tags_count(new_values, total_sum):
    """
    Fold the counts seen for one key in the current batch into its running total.

    Args:
        new_values: iterable of counts for the key in the current batch
        total_sum: running total so far; a falsy value (None or 0) counts as 0
    Returns:
        The updated running total.
    """
    batch_total = sum(new_values)
    carried_over = total_sum if total_sum else 0
    return batch_total + carried_over
    
def hashtagCount(words):
    """
    Calculate the accumulated hashtag counts from the beginning of the stream,
    sorted by descending count.

    Hashtags are counted case-insensitively: "#Ab" and "#ab" are the same hashtag.
    A word is treated as a hashtag when it is "#" followed by one or more ASCII
    letters or digits and nothing else.

    Steps:
    1. keep only the words that are hashtags
    2. map (hashtag) to (lower-cased hashtag, 1)
    3. add the counts of the current batch to the previous state (updateStateByKey)
    4. sort every resulting RDD by count, descending (transform)

    Args:
        words(DStream): stream of individual words taken from the tweets
    Returns:
        DStream Object with inner structure (hashtag, count)
    """

    def is_hashtag(word):
        # fullmatch is used instead of '^...$' with re.match: '$' also matches just
        # before a trailing newline, which would let "#tag\n" through as a
        # separate key from "#tag".
        return word.startswith("#") and re.fullmatch('[a-zA-Z0-9]+', word[1:]) is not None

    tag_pairs = words.filter(is_hashtag).map(lambda word: (word.lower(), 1))
    # Running total per hashtag across all batches seen so far.
    running_totals = tag_pairs.updateStateByKey(aggregate_tags_count)
    return running_totals.transform(
        lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))

In [3]:
# Spark settings
conf = SparkConf()
conf.setMaster('local[2]')
conf.setAppName("TwitterStreamApp")

# Reuse the running SparkContext if the kernel already has one, otherwise create
# one. The configuration is passed in so that the master and app name set above
# are actually applied when a new context is created (calling getOrCreate()
# without it left `conf` unused).
sc = SparkContext.getOrCreate(conf)
sc.setLogLevel("ERROR")

# create sql context, used for saving rdd
sql_context = SQLContext(sc)

# create the Streaming Context from the above spark context with batch interval size 5 seconds
ssc = StreamingContext(sc, 5)
# setting a checkpoint to allow RDD recovery (required by updateStateByKey)
# NOTE(review): "~" is presumably not expanded to the home directory here and may
# end up as a directory literally named "~" -- confirm and adjust the path if needed.
ssc.checkpoint("~/checkpoint_TwitterApp")

# read data from the socket at IP:PORT
dataStream = ssc.socketTextStream(IP, PORT)
# dataStream.pprint()   # uncomment to print the raw incoming lines

# split every incoming line into words on single spaces
words = dataStream.flatMap(lambda line: line.split(" "))

# calculate the accumulated hashtags count sum from the beginning of the stream
topTags = hashtagCount(words)
topTags.pprint()

# Calculate the word count during each time period of 60s (not enabled yet):
#     wordCounts = wordCount(words)
#     wordCounts.pprint()

# save hashtags count and word count to google storage
# used to save to google BigQuery
# You should:
#   1. topTags: only save the latest rdd in DStream
#   2. wordCount: save each rdd in DStream
# Hints:
#   1. You can take a look at foreachRDD transformation
#   2. You may want to use helper function saveToStorage
#   3. You should save output to output_directory_hashtags, output_directory_wordcount,
#       and have output columns name columns_name_hashtags and columns_name_wordcount.
# TODO: insert your code here



In [None]:
  # Start the streaming process, let it run for STREAMTIME seconds, then stop it.
ssc.start()
try:
    time.sleep(STREAMTIME)
finally:
    # Stop the streaming context even when the wait is interrupted (for example
    # by interrupting the kernel), so it is not left running in the background.
    # The SparkContext itself is kept alive for later cells.
    ssc.stop(stopSparkContext=False, stopGraceFully=True)

# put the temp result in google storage to google BigQuery
# saveToBigQuery(sc, output_dataset, output_table_hashtags, output_directory_hashtags)
# saveToBigQuery(sc, output_dataset, output_table_wordcount, output_directory_wordcount)


-------------------------------------------
Time: 2019-10-30 14:53:30
-------------------------------------------

-------------------------------------------
Time: 2019-10-30 14:53:30
-------------------------------------------

-------------------------------------------
Time: 2019-10-30 14:53:35
-------------------------------------------
Rob Zombie really felt good about his Halloween 1&amp;2 work. Like, he actually directed the movie then watched it &amp; sa… https://t.co/WsGF68N18BHey Everyone! If you haven't already, make sure you go to https://t.co/j8kWvQ5009 and pre-order your DVD of my movi… https://t.co/zATRXkLaC4RT @soompi: Voices Of #RedVelvet’s #Irene And #NCT’s #Taeyong Added To AI Speakers Developed By SK Telecom And SM Entertainment
https://t.c…RT @SparkNotes: Cute date ideas:

- Seeing a movie
- Exploring the moors together
- Dying young
- Haunting your lover
- Walking hand-in-han…RT @Yousssqlf: 5k follow 0 vanne https://t.co/rNRPyCI0sLRT @chipro: Some people asked ab

-------------------------------------------
Time: 2019-10-30 14:54:00
-------------------------------------------
('#bigil', 8)
('#taeyeon', 5)
('#ai', 5)
('#jimininlove', 3)
('#spark', 3)
('#viswasam', 3)
('#ml', 2)
('#machinelearning', 2)
('#petta', 2)
('#irene', 1)
...

-------------------------------------------
Time: 2019-10-30 14:54:05
-------------------------------------------
@evankirstel @ipfconline1 @SpirosMargaris @Har…Hardest question of all time..@ENERGY has awarded a $6.9 million grant to Blue Wave AI Labs to develop predictive models of reactor components t… https://t.co/Fj0IIgLu0eRT @NisaLocally: Week 4 of @CadburyUK's 12 Weeks of Xmas has now begun! 🎅 This week we are giving away 9 @TheYankeeCandle Gift Sets! RT+FOL…RT @Dora_Winifred_: John Witherspoon is part of so much black film + tv culture it doesn’t make any sense. From...

Every Friday Movie
The…RT @MMM100_kemaco: # # # # # # # 
 Full Video : https://t.co/YVEMqM2fZL
# # # # # # # 
 More Videos : https://t.co/3h

-------------------------------------------
Time: 2019-10-30 14:54:25
-------------------------------------------
#Bigil #Viswasam #Petta or Ot…RT @sir_leksyd: Was shooting the BTS of a movie few weeks ago, this little boy came around and said "Boda, oya eya mi". I took a few shots,…RT @mybizon: A6: The obvious is all the brands engaging in voice-activated communications, however on another beat....I think @Sephora real…RT @sciencepolicy: Data trusts have emerged as a possible solution for managing the vast amount of data that smart cities will generate.  H…@jordanniee @cannawitchx My favorite Disney movie. This is a good meme. Saving it.RT @KirkDBorne: This 10-page (PDF) #DataScience Cheat Sheet covers concepts in Statistical Learning, #MachineLearning, #DeepLearning, Proba…RT @ShabihShakeel: (1/3) We are pleased to present the long-awaited structure of the Fanconi anaemia core complex, an E3 ligase involved in…RT @batmanjuuls: If cmbyn is ur favorite “lgbt movie” ur homophobic as hel

-------------------------------------------
Time: 2019-10-30 14:54:45
-------------------------------------------
joaquin: ok. https://t.co/vDnhNtT7EjRT @RathnaveluDop: Dear @aryasukku All the best for  #AA20 !! Expecting another Cult classic hit like Rangasthalam!! Both of us are gonna m…RT @darkman_g: Blood I wanna cry RIP TO A MF LEGEND https://t.co/CyOYjRVH8uLink3D raises $7M to fund international expansion of Additive MES platform: New York-based additive manufacturing w… https://t.co/EvrRUuDfH6Trynna have a spooky movie night with my spooky friends eating some spooky treatsRT @RonMFlores: IMO this is by far his best scene in Friday. "The Live To Fight Another Day" scene more famous, but Craig took the strap wi…RT @fchollet: A way to describe intelligence is that it is the power to produce abstraction. AI in the true sense would be Autonomous Abstr…Basically, the movie who invented pop cultureRT @FarhatullahB: The only line in it one may be able to understand “GHQ is at Rwp not is

-------------------------------------------
Time: 2019-10-30 14:55:05
-------------------------------------------
But I nee… https://t.co/r5rdXpgAuxshe said kai songs and didn’t even mention yixing’s honey ep and wbh 😭😭😭RT @JRubinBlogger: I look forward to House and Senate votes on impeachment. The tiny-tiny # of votes to impeach/remove will represent the o…RT @ultchanyeolpark: The way he made it into a whole ass movie for just a teaser....imagine how great the whole series is going to be 

#LO…RT @kiwicooI: @DexHinton when that video of that kid crying about being bullied went viral and celebrities invited his family to movie prem…RT @ThalaFC_: 1st movie released in Thursday. Movie Collected 140+ C worldwide said AMR.

#ThalaAJITH Always Trendsetter 

@vishnu_dir &amp; @t…A whole circusRT @ShawnRyanTV: Just a reminder to aspiring writers that the @WGAWest library has thousands of tv and movie scripts to provide you not onl…RT @ThalaAjith_FR: Blockbuster movie of This year...?

#Viswas

-------------------------------------------
Time: 2019-10-30 14:55:30
-------------------------------------------
The…RT @Kitcat75608567: Wanna see my wet tight pussy? Just retweet, like, &amp; make sure ur following me &amp; I’ll show you! DM’s are now open! #horn…RT @ellieaddi: lol so many people (literally accounts with 600k+) have been like "why should i give credit its a movie character" its reall…@CCLoos @pinkrocktopus @LilyMarsWrites I often imagined she was manipulated to stay ...with her ambition dangled li… https://t.co/rAphTFcsynRT @art_bing: I wish they made a metroid movie that's just an kid friendly version of aliens tha would be radi need a beautiful shorty for my friend, he’s 5’9, drives a Honda Civic Si, will spark you up if you throw 5, plays… https://t.co/WsT2vkSwRZthe movie sucked. the soundtracks were bombs.

https://t.co/PfwVkBilIZ@KMR31871 Have we not disbarred Matt Gaetz yet? 

Call the Florida Bar:  866.352.0707

File a disciplinary complain… https://t.co/wbYJ8

In [None]:
def wordCount(words, window_seconds=60):
    """
    Calculate the count of the tracked words (the module-level WORD list) over
    consecutive, non-overlapping windows.

    Steps:
    1. lower-case every word and keep only the tracked ones (case insensitive)
    2. count each word over a window of `window_seconds`, sliding by the same
       amount so that windows do not overlap
    3. attach the batch time supplied by Spark's transform callback to every
       (word, count) pair

    Args:
        words(DStream): stream of individual words taken from the tweets
        window_seconds(int): window length and slide interval in seconds;
            defaults to 60
    Returns:
        DStream Object with inner structure (word, count, time)
    """
    # Lower-cased lookup set so the comparison is case insensitive.
    tracked = frozenset(word.lower() for word in WORD)

    windowed_counts = (
        words.map(lambda word: word.lower())
        .filter(lambda word: word in tracked)
        .map(lambda word: (word, 1))
        # No inverse function is supplied (None), so every window is reduced
        # from its own data rather than updated incrementally.
        .reduceByKeyAndWindow(
            lambda left, right: left + right, None, window_seconds, window_seconds)
    )

    # The two-argument form of transform receives the batch time alongside the RDD.
    return windowed_counts.transform(
        lambda batch_time, rdd: rdd.map(
            lambda pair: (pair[0], pair[1], batch_time)))

In [2]:
# Helper functions
def saveToStorage(rdd, output_directory, columns_name, mode):
    """
    Write one RDD of the DStream to storage as JSON; empty RDDs are skipped.
    Args:
        rdd: input rdd
        output_directory: output directory in google storage
        columns_name: columns name of the dataframe built from the rdd
        mode: save mode passed to the writer, e.g.
              "overwrite" to overwrite the existing output,
              "append" to append data to the existing output
    """
    if rdd.isEmpty():
        # Nothing arrived in this batch, so there is nothing to write.
        return

    frame = rdd.toDF(columns_name)
    frame.write.save(output_directory, format="json", mode=mode)


def saveToBigQuery(sc, output_dataset, output_table, directory):
    """
    Load the temporary streaming JSON files from google storage into google
    BigQuery, then delete the temporary output directory in google storage.

    Args:
        sc: SparkContext; its JVM gateway is used to reach the Hadoop
            FileSystem API for the clean-up step
        output_dataset: name of the BigQuery dataset
        output_table: name of the BigQuery table (loaded with --replace)
        directory: storage directory that holds the part-* output files
    Raises:
        subprocess.CalledProcessError: if the `bq load` command exits with a
            non-zero status; in that case the directory is not deleted.
    """
    files = directory + '/part-*'
    # The argument list is built directly rather than by formatting one string
    # and splitting it on whitespace, so a dataset, table or path containing
    # whitespace stays a single argument.
    bq_command = [
        'bq', 'load',
        '--source_format', 'NEWLINE_DELIMITED_JSON',
        '--replace',
        '--autodetect',
        '{dataset}.{table}'.format(dataset=output_dataset, table=output_table),
        files,
    ]
    subprocess.check_call(bq_command)

    # Recursively delete the temporary output directory once the load succeeded.
    output_path = sc._jvm.org.apache.hadoop.fs.Path(directory)
    output_path.getFileSystem(sc._jsc.hadoopConfiguration()).delete(
        output_path, True)