## imports

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("KafkaStreamReader")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    # Kafka source connector artifact (Scala 2.12 build, version 3.5.0).
    # NOTE(review): presumably this has to match the installed Spark version — confirm.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
    .config("spark.sql.shuffle.partitions", 4)
    .getOrCreate()
)

In [3]:
# Streaming source: subscribe to the `credit_card_trans` topic on the
# `kafka_v2:9092` broker, reading from the earliest available offsets.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka_v2:9092",
        "subscribe": "credit_card_trans",
        "startingOffsets": "earliest",
        "failOnDataLoss": "false",
    })
    .load()
)

In [4]:
# Show the columns of the raw Kafka source frame (key/value arrive as binary).
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
# The Kafka `value` column is binary; decode it to text and expose it as
# `json_str` so it can be parsed as JSON afterwards.
value_df = kafka_df.select(
    col("value").cast("string").alias("json_str")
)

In [23]:
# Schema of the JSON payload: every field is read as a nullable string and
# converted to its real type in a later cell.
# The first field has an empty name; it is renamed to "indx" after parsing.
# NOTE(review): there is a `merch_long` field but no `merch_lat` — confirm
# against the producer's payload whether that field should be listed too.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "",
        "trans_date_trans_time",
        "cc_num",
        "merchant",
        "category",
        "amt",
        "first",
        "last",
        "gender",
        "street",
        "city",
        "state",
        "zip",
        "lat",
        "long",
        "city_pop",
        "job",
        "dob",
        "trans_num",
        "unix_time",
        "merch_long",
        "is_fraud",
        "event_time",
    )
])

In [24]:
# Parse the JSON text with `schema` and flatten the resulting struct so each
# field becomes a top-level column.
parsed_df = (
    value_df
    .select(from_json("json_str", schema).alias("data"))
    .select("data.*")
)

In [29]:
# The first schema field has an empty name; give that column a usable name.
parsed_df = parsed_df.withColumnRenamed("", "indx")

In [32]:
# Convert the all-string columns of `parsed_df` into their real types.
#
# The converted frame is bound to `typed_df` so later cells can use it;
# without the assignment the cast result is only displayed and then lost.
# `parsed_df` itself is left unchanged.
#
# NOTE(review): `merch_long` appeared twice in this chain; the redundant
# second cast was dropped. It may have been intended for a `merch_lat`
# column, but the schema declares no such field — confirm with the producer.
typed_df = (
    parsed_df
    # Numeric fields.
    .withColumn("is_fraud", col("is_fraud").cast("int"))
    .withColumn("merch_long", col("merch_long").cast("double"))
    .withColumn("unix_time", col("unix_time").cast("long"))
    .withColumn("city_pop", col("city_pop").cast("int"))
    .withColumn("long", col("long").cast("double"))
    .withColumn("lat", col("lat").cast("double"))
    .withColumn("amt", col("amt").cast("double"))
    # Timestamp fields, parsed with the "yyyy-MM-dd HH:mm:ss" pattern.
    .withColumn("trans_date_trans_time", to_timestamp("trans_date_trans_time", "yyyy-MM-dd HH:mm:ss"))
    .withColumn("event_time", to_timestamp("event_time", "yyyy-MM-dd HH:mm:ss"))
)

# Keep the frame as the last expression so the cell still displays its schema.
typed_df

DataFrame[indx: string, trans_date_trans_time: timestamp, cc_num: string, merchant: string, category: string, amt: double, first: string, last: string, gender: string, street: string, city: string, state: string, zip: string, lat: double, long: double, city_pop: int, job: string, dob: string, trans_num: string, unix_time: bigint, merch_long: double, is_fraud: int, event_time: timestamp]

In [25]:
# Configure (but do not start) a console sink in append mode for `parsed_df`.
stream = (
    parsed_df.writeStream
    .outputMode("append")
    .format("console")
    # Optional settings, currently disabled:
    # .option("checkpointLocation", "checkpoint_dir_kafka")
    # .trigger(processingTime="20 seconds")
)

In [26]:
# Start the streaming query; `q` is the handle used to stop it later.
q = stream.start()

In [28]:
# Stop the running streaming query.
q.stop()

In [None]:
# TODO: nulls - valid values - trim (remove leading and trailing spaces) - age - distance - city/state of merchant using long and lat

In [None]:
from pyspark.sql.functions import sha2, col, hash

# Add hashed identifier columns to the parsed stream.
#
# The frame is built from `parsed_df` (no `df` variable is defined in this
# notebook), and both hash columns are kept on `hashed_df` instead of
# discarding the SHA-256 result.
# NOTE(review): the schema has no `user_id` field, so `cc_num` is used as the
# identifier to hash — confirm this is the intended column.
# `hash` here is pyspark.sql.functions.hash (imported above), not the builtin.
hashed_df = (
    parsed_df
    .withColumn("safe_hash", sha2(col("cc_num"), 256))
    .withColumn("user_hash", hash(col("cc_num")))
)