In [None]:
# URL processing (import the submodule explicitly so `urllib.parse` is guaranteed to be loaded)
import urllib
import urllib.parse

# pyspark functions
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType, DoubleType

In [None]:
%run /Users/joelcosta94i@gmail.com/data_cleaning

In [None]:
# Read the CSV file holding the AWS credentials into a Spark dataframe.
# NOTE(review): the keys are read from a plain CSV under /FileStore; consider
# moving them to a secret manager (e.g. a Databricks secret scope) instead.
aws_keys_df = spark.read.format("csv")\
.option("header", "true")\
.option("sep", ",")\
.load("/FileStore/tables/authentication_credentials.csv")

# Fetch both credential columns from the first row in a single Spark job
# (previously each key triggered its own full `.collect()`).
aws_keys_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
if aws_keys_row is None:
    # `.first()` returns None for an empty dataframe; fail with a clear message
    # instead of an opaque IndexError.
    raise ValueError(
        "No credentials row found in /FileStore/tables/authentication_credentials.csv"
    )

# Get the AWS access key and secret key from the first row
ACCESS_KEY = aws_keys_row['Access key ID']
SECRET_KEY = aws_keys_row['Secret access key']
# URL-encode the secret key (every reserved character is escaped because safe="").
# The Kinesis reader below uses the raw SECRET_KEY, not this encoded form.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
def read_kinesis_stream(stream_name: str, region: str = 'us-east-1', initial_position: str = 'earliest'):
    '''
    Opens a streaming read on a Kinesis stream using the notebook-level
    ACCESS_KEY and SECRET_KEY credentials.

    Parameters
    ----------
    stream_name: str
        The name of the stream to be read.
    region: str
        The AWS region passed to the reader. Defaults to 'us-east-1'.
    initial_position: str
        Value passed as the reader's `initialPosition` option.
        Defaults to 'earliest'.

    Returns
    -------
    df
        The streaming dataframe returned by the Kinesis reader's `load()`.
    '''
    reader = spark.readStream.format('kinesis')
    # Collect the options in one place so the defaults are easy to audit.
    reader_options = {
        'streamName': stream_name,
        'initialPosition': initial_position,
        'region': region,
        'awsAccessKey': ACCESS_KEY,
        # The raw (not URL-encoded) secret key is what gets passed here.
        'awsSecretKey': SECRET_KEY,
    }
    for option_name, option_value in reader_options.items():
        reader = reader.option(option_name, option_value)
    return reader.load()

# Annotations are written as strings so they are not evaluated when the function
# is defined: this notebook's import cell never binds the bare name `pyspark`,
# so `pyspark.sql.types.StructType` as a live annotation could raise NameError.
def convert_kinesis_stream_to_dataframe(stream_name: str, streaming_schema: "StructType") -> "pyspark.sql.DataFrame":
    '''
    Reads a kinesis data stream by calling `read_kinesis_stream` with a given stream name.
    Then parses the JSON payload of each record with the given schema and
    flattens it into one column per schema field.

    Parameters
    ----------
    stream_name: str
        The name of the stream to be read.
    streaming_schema: pyspark.sql.types.StructType
        The schema of the JSON payload carried in each record's `data` column.

    Returns
    -------
    df: pyspark.sql.dataframe.DataFrame
        A streaming dataframe whose columns are the fields of `streaming_schema`.
    '''
    raw_stream = read_kinesis_stream(stream_name)
    # Keep only the record payload, cast to a string so `from_json` can parse it.
    payload_as_text = raw_stream.selectExpr("CAST(data as STRING)")
    # Parse the JSON text into a single struct column named `data`.
    parsed = payload_as_text.select(from_json("data", streaming_schema).alias("data"))
    # Expand the struct into top-level columns.
    return parsed.select("data.*")

In [None]:
# Streaming schemas, one per dataset. Each is built from (field name, type)
# pairs; every field is declared nullable and keeps the order listed here.
streaming_schema_pin = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("category", StringType()),
        ("description", StringType()),
        ("downloaded", LongType()),
        ("follower_count", StringType()),
        ("image_src", StringType()),
        ("index", LongType()),
        ("is_image_or_video", StringType()),
        ("poster_name", StringType()),
        ("save_location", StringType()),
        ("tag_list", StringType()),
        ("title", StringType()),
        ("unique_id", StringType()),
    ]
])

streaming_schema_geo = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("country", StringType()),
        ("ind", LongType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("timestamp", StringType()),
    ]
])

streaming_schema_user = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("age", LongType()),
        ("date_joined", StringType()),
        ("first_name", StringType()),
        ("ind", LongType()),
        ("last_name", StringType()),
    ]
])


In [None]:
# Open one Kinesis stream per dataset and parse each with its matching schema
df_pin = convert_kinesis_stream_to_dataframe(
    stream_name="streaming-0a6a638f5991-pin",
    streaming_schema=streaming_schema_pin,
)
df_geo = convert_kinesis_stream_to_dataframe(
    stream_name="streaming-0a6a638f5991-geo",
    streaming_schema=streaming_schema_geo,
)
df_user = convert_kinesis_stream_to_dataframe(
    stream_name="streaming-0a6a638f5991-user",
    streaming_schema=streaming_schema_user,
)

In [None]:
# Clean the data
# Each raw streaming dataframe is passed through its dataset-specific cleaning
# function, producing a new `*_clean` dataframe and leaving the raw one intact.
# NOTE(review): the clean_* functions are not defined in this notebook —
# presumably they come from the `data_cleaning` notebook loaded via %run;
# confirm there for the exact transformations applied.
df_pin_clean = clean_pin_data(df_pin)
df_geo_clean = clean_geo_data(df_geo)
df_user_clean = clean_user_data(df_user)

In [None]:
# Helper used to write each of the three streams to its own Delta table

# The `df` annotation is a string so it is not evaluated at definition time:
# this notebook's import cell never binds the bare name `pyspark`.
def write_df_to_delta_table(df: "pyspark.sql.DataFrame", table_name: str):
    '''
    Starts writing a data stream to a delta table in append mode.

    Parameters
    ----------
    df : pyspark.sql.dataframe.DataFrame
        The streamed dataframe.
    table_name: str
        The string containing the name of the delta table to write the data into.
        It is also used to derive the checkpoint directory,
        `/tmp/kinesis/<table_name>_checkpoints/`.

    Returns
    -------
    query
        Whatever the stream writer's `table(...)` call returns (the handle of the
        started streaming query), so callers can monitor or stop the stream.
        Callers that ignore the return value behave exactly as before.
    '''
    # One checkpoint directory per table keeps the streams' progress separate.
    checkpoint_dir = f"/tmp/kinesis/{table_name}_checkpoints/"
    stream_writer = (
        df.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", checkpoint_dir)
    )
    # NOTE(review): open-source PySpark spells this call `toTable()`; `.table()`
    # is kept as originally written — confirm against the target runtime.
    return stream_writer.table(table_name)

In [None]:
# Start one streaming write per cleaned dataset, each into its own Delta table.
# Pin data
write_df_to_delta_table(df=df_pin_clean, table_name='0a6a638f5991_pin_table')
# Geo data
write_df_to_delta_table(df=df_geo_clean, table_name='0a6a638f5991_geo_table')
# User data
write_df_to_delta_table(df=df_user_clean, table_name='0a6a638f5991_user_table')