In [1]:
%pip install google-cloud-translate google-cloud-language
%pip install apache-beam==2.22.0.dev0

import argparse, os, json, logging
from datetime import datetime, timedelta
import json
import pandas as pd

import apache_beam as beam
from apache_beam.transforms import trigger
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions, SetupOptions, StandardOptions


import google.auth
from google.cloud import language_v1
from google.cloud.language_v1 import enums
from google.cloud import translate_v2 as translate

# Google Cloud project that owns the Pub/Sub topic and the BigQuery dataset.
PROJECT_ID = 'maabel-testground'

# BigQuery destination: one dataset holding a per-tweet table and an aggregate table.
OUTPUT_DATASET = 'tweet_nlp_demo'
OUTPUT_TABLE_UNAGG = 'processed_tweet_data'
OUTPUT_TABLE_AGG = 'aggregated_tweet_data'

# Fully-qualified Pub/Sub topic, derived from PROJECT_ID so the two cannot drift apart.
INPUT_TOPIC = "projects/{}/topics/tweet-nlp-demo".format(PROJECT_ID)

You should consider upgrading via the 'pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
Collecting apache-beam==2.22.0.dev0
  ERROR: Could not find a version that satisfies the requirement apache-beam==2.22.0.dev0 (from versions: 0.6.0, 2.0.0, 2.1.0, 2.1.1, 2.2.0, 2.11.0, 2.12.0, 2.13.0, 2.14.0, 2.15.0, 2.16.0, 2.17.0, 2.18.0, 2.19.0, 2.20.0, 2.21.0)
ERROR: No matching distribution found for apache-beam==2.22.0.dev0
You should consider upgrading via the 'pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Comment out these two lines before running on Dataflow
from apache_beam.runners.interactive import interactive_runner
import apache_beam.runners.interactive.interactive_beam as ib

In [3]:
# Start from default Beam pipeline options.
options = PipelineOptions()

# Pub/Sub is read as a stream, so the pipeline has to run in streaming mode.
standard_options = options.view_as(StandardOptions)
standard_options.streaming = True

# Use the default project of the current Google Cloud environment; it is the
# project in which the subscription to the Pub/Sub topic gets created.
_, adc_project = google.auth.default()
options.view_as(GoogleCloudOptions).project = adc_project

In [4]:
# Interactive (notebook) execution. Comment out the two statements below when
# you are ready to run on Cloud Dataflow.
ib.options.capture_duration = timedelta(minutes=1)
p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(), options=options)

# Uncomment the lines below before submitting to Cloud Dataflow.
# from apache_beam.runners import DataflowRunner
# options.view_as(StandardOptions).runner = 'DataflowRunner'
# google_cloud_options = options.view_as(GoogleCloudOptions)
# google_cloud_options.job_name = 'tweet-nlp-pipeline'
# google_cloud_options.staging_location = 'gs://maabel-testground/binaries'
# google_cloud_options.temp_location = 'gs://maabel-testground/temp'
# google_cloud_options.region = 'us-central1'
# p = beam.Pipeline(DataflowRunner(), options=options)

In [5]:
# Keep pandas from truncating long cell values (tweet text) when frames are shown.
# `None` means "no limit"; the old `-1` spelling is deprecated since pandas 1.0
# and rejected by later releases, so it is only used as a fallback for pandas
# versions that do not accept `None` yet.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Read raw messages from the Pub/Sub topic and decode each payload as JSON.
tweets = (
    p
    | 'ReadTweet' >> beam.io.ReadFromPubSub(topic=INPUT_TOPIC)
    | beam.Map(json.loads)
)

In [6]:
# Preview step: hands `tweets` to Interactive Beam's `show`. `ib` is only
# defined while the interactive-runner imports are enabled.
ib.show(tweets)

In [7]:
def parse_fields(tweet):
    """Reduce a decoded tweet message to the three fields used downstream.

    Args:
        tweet: dict whose ``['messages'][0]['data']`` entry carries the
            ``text``, ``created_at`` and ``source`` of the tweet.

    Returns:
        dict with ``text``, ``created_at`` (parsed into a ``datetime`` using
        the ``%Y-%m-%dT%H:%M:%S`` format) and ``source``.

    Raises:
        KeyError / IndexError: if the expected structure is missing.
        ValueError: if ``created_at`` does not match the expected format.
    """
    payload = tweet['messages'][0]['data']
    return {
        'text': payload['text'],
        'created_at': datetime.strptime(payload['created_at'], "%Y-%m-%dT%H:%M:%S"),
        'source': payload['source'],
    }

# Trim every decoded message down to the fields the rest of the pipeline needs.
parsed_tweets = (
    tweets
    | "Parse_Tweet" >> beam.Map(parse_fields)
)

In [8]:
# Preview step: hands `parsed_tweets` to Interactive Beam's `show`
# (requires the interactive-runner imports to be enabled).
ib.show(parsed_tweets)

In [9]:
def detect_language(tweet):
    """Annotate a tweet with the language detected by the Translation API.

    Args:
        tweet: dict with at least a ``text`` entry.

    Returns:
        A new dict containing every entry of ``tweet`` plus ``language`` and
        ``lang_confidence`` taken from the detection result. The input dict is
        left untouched, because Beam elements should not be modified in place
        by the functions that process them.
    """
    translate_client = translate.Client()
    detection = translate_client.detect_language(tweet['text'])

    return dict(
        tweet,
        language=detection['language'],
        lang_confidence=detection['confidence'],
    )

# Tag each parsed tweet with its detected language and the detection confidence.
lang_tweets = (
    parsed_tweets
    | "Detect_Language" >> beam.Map(detect_language)
)

In [10]:
# Preview step: hands `lang_tweets` to Interactive Beam's `show`
# (requires the interactive-runner imports to be enabled).
ib.show(lang_tweets)

In [13]:
def analyze_sentiment(tweet):
    """Attach Natural Language API sentiment to a language-tagged tweet.

    Args:
        tweet: dict with at least ``text`` and ``language`` entries.

    Returns:
        A new dict containing every entry of ``tweet`` plus ``score`` and
        ``magnitude``. Both are ``None`` when the tweet's language is not one
        of the languages this function sends for analysis. The input dict is
        left untouched, because Beam elements should not be modified in place.
    """
    # Work on a shallow copy so the element handed in by Beam is never mutated.
    analyzed = dict(tweet)

    # Only these languages are sent for sentiment analysis.
    # For the list of languages supported by the API see:
    # https://cloud.google.com/natural-language/docs/languages
    if analyzed['language'] not in ('en', 'fr', 'de', 'it', 'pt', 'es'):
        # Skipped tweets never touch the API, so no client is created for them.
        analyzed['score'] = None
        analyzed['magnitude'] = None
        return analyzed

    client = language_v1.LanguageServiceClient()

    # The language is optional for the API (it is auto-detected when omitted),
    # but it is already known here, so pass it along.
    document = {
        "content": analyzed['text'],
        "type": enums.Document.Type.PLAIN_TEXT,
        "language": analyzed['language'],
    }

    response = client.analyze_sentiment(document, encoding_type=enums.EncodingType.UTF8)

    analyzed['score'] = response.document_sentiment.score
    analyzed['magnitude'] = response.document_sentiment.magnitude

    return analyzed
        
# Score every language-tagged tweet. The step label names what the step does and
# is distinct from the language-detection step, so the two stay unique and easy
# to tell apart in the pipeline graph.
analyzed_tweets = lang_tweets | "Analyze_Sentiment" >> beam.Map(analyze_sentiment)

In [14]:
# Preview step: hands `analyzed_tweets` to Interactive Beam's `show`
# (requires the interactive-runner imports to be enabled).
ib.show(analyzed_tweets)

In [15]:
# Assign analyzed tweets to sliding windows: each window spans 30 seconds and a
# new window starts every 10 seconds.
windowed_tweets = analyzed_tweets | "Window" >> beam.WindowInto(
    beam.window.SlidingWindows(size=30, period=10)
)

In [16]:
# Preview step: hands `windowed_tweets` to Interactive Beam's `show`, asking it
# to include window information (requires the interactive-runner imports).
ib.show(windowed_tweets, include_window_info=True)

In [17]:
def create_source_key(tweet):
    """Key a scored tweet by its source; drop tweets without a score.

    Written as a generator so it can be used with ``beam.FlatMap``: it yields
    one ``(source, {'score': ..., 'magnitude': ...})`` pair when the tweet has
    a sentiment score and nothing at all when ``score`` is ``None``.
    """
    # Identity check: a legitimate score of 0.0 must still be emitted.
    if tweet['score'] is not None:
        yield (tweet['source'], {'score': tweet['score'], 'magnitude': tweet['magnitude']})
        
# Turn each scored tweet into a (source, sentiment) pair; unscored tweets are dropped.
prepped_tweets = (
    windowed_tweets
    | "Create_Source_Key" >> beam.FlatMap(create_source_key)
)

In [18]:
# Preview step: hands `prepped_tweets` to Interactive Beam's `show`, asking it
# to include window information (requires the interactive-runner imports).
ib.show(prepped_tweets, include_window_info=True)

In [19]:
class WeightedAverageFn(beam.CombineFn):
    """Combine sentiment dicts into an average of ``score * magnitude``.

    The accumulator is a ``(total, count)`` pair where ``total`` is the running
    sum of ``score * magnitude`` and ``count`` is the number of inputs seen.
    Note that the final value divides by the number of inputs, not by the sum
    of the magnitudes.
    """

    def create_accumulator(self):
        """Return the empty accumulator: zero total, zero inputs."""
        return (0.0, 0)

    def add_input(self, sum_count, input):
        """Fold one ``{'score', 'magnitude'}`` dict into the accumulator."""
        running_total, n_seen = sum_count
        contribution = input['score'] * input['magnitude']
        return running_total + contribution, n_seen + 1

    def merge_accumulators(self, accumulators):
        """Add up the totals and the counts of several accumulators."""
        totals, counts = zip(*accumulators)
        return sum(totals), sum(counts)

    def extract_output(self, sum_count):
        """Return ``{'score', 'count'}``; the score is NaN when nothing was seen."""
        total, n_seen = sum_count
        if n_seen:
            return {'score': total / n_seen, 'count': n_seen}
        return {'score':float('NaN'), 'count': 0}
    
# Per source (and per window), combine the sentiment dicts with WeightedAverageFn.
aggregated_tweets = (
    prepped_tweets
    | "Aggregate_Weighted_Score" >> beam.CombinePerKey(WeightedAverageFn())
)

In [20]:
# Preview step: hands `aggregated_tweets` to Interactive Beam's `show`, asking it
# to include window information (requires the interactive-runner imports).
ib.show(aggregated_tweets, include_window_info=True)

In [21]:
def parse_aggregation(agg_tweets):
    """Flatten a ``(source, {'score', 'count'})`` pair into a single row dict.

    Args:
        agg_tweets: indexable pair whose first item is the source and whose
            second item is a dict with ``score`` and ``count``.

    Returns:
        dict with the keys ``source``, ``score`` and ``count``.
    """
    return {
        'source': agg_tweets[0],
        'score': agg_tweets[1]['score'],
        'count': agg_tweets[1]['count'],
    }


# Flatten each (source, aggregate) pair into one row dict.
parsed_aggregated_tweets = (
    aggregated_tweets
    | "Parse_Aggregated_Results" >> beam.Map(parse_aggregation)
)

In [22]:
# Preview step: hands `parsed_aggregated_tweets` to Interactive Beam's `show`,
# asking it to include window information (requires the interactive-runner imports).
ib.show(parsed_aggregated_tweets,include_window_info=True)

In [28]:
# Hands the pipeline `p` to Interactive Beam's `show_graph`
# (requires the interactive-runner imports to be enabled).
ib.show_graph(p)

In [27]:
## Do not run this cell until you are ready to execute the pipeline on Dataflow!

# Raise the log level BEFORE launching, so messages emitted while the job is
# being submitted are not lost.
logging.getLogger().setLevel(logging.INFO)


def _to_bq_row(tweet):
    """Return a row for BigQuery with `created_at` as an ISO-8601 string.

    `parse_fields` stores `created_at` as a `datetime`. The BigQuery sink may
    reject `datetime` objects as row values (NOTE(review): confirm against the
    Beam release in use), whereas an ISO-8601 string is accepted for TIMESTAMP
    columns. A new dict is returned; the input element is not modified.
    """
    created_at = tweet.get('created_at')
    if isinstance(created_at, datetime):
        return dict(tweet, created_at=created_at.isoformat())
    return tweet


# --- Per-tweet (unaggregated) output -----------------------------------------
table_spec_unagg = bigquery.TableReference(
      projectId = PROJECT_ID,
      datasetId = OUTPUT_DATASET,
      tableId= OUTPUT_TABLE_UNAGG)

table_schema_unagg ='text:STRING, created_at:TIMESTAMP, source:STRING, language:STRING, lang_confidence:FLOAT64, score:FLOAT64, magnitude:FLOAT64'

bq_output_unagg = (
    analyzed_tweets
    | 'Serialize_Timestamps' >> beam.Map(_to_bq_row)
    | 'WriteToBQ_Unagg' >> beam.io.WriteToBigQuery(
        table_spec_unagg,
        schema=table_schema_unagg,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
)

# --- Per-source aggregated output ---------------------------------------------
table_spec_agg = bigquery.TableReference(
      projectId = PROJECT_ID,
      datasetId = OUTPUT_DATASET,
      tableId= OUTPUT_TABLE_AGG)

table_schema_agg ='source:STRING, score:FLOAT64, count:INT64'

bq_output_agg = (
    parsed_aggregated_tweets
    | 'WriteToBQ_Agg' >> beam.io.WriteToBigQuery(
        table_spec_agg,
        schema=table_schema_agg,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
)

# Launch the pipeline and keep the result handle for later inspection.
pipeline_result = p.run()
