# This notebook simulates a CDC (MySQL) → Kafka topic → local CSV file → Spark Structured Streaming data flow and data aggregation process

## Getting Needed Modules and Initiation of Spark Session

### Adding the source folder to `sys.path` so the local `dat` module can be imported into the notebook

In [1]:
import sys

# Folder that is put at the front of the module search path so that the
# locally written modules imported in the next cell can be found.
LOCAL_MODULE_DIR = 'C:/Users/Lenovo/Desktop/exam_eti/containerized_tool/data_analysis_tool/src'
sys.path.insert(0, LOCAL_MODULE_DIR)

### Needed python modules

In [2]:
# mysql_analyzer is a module consisting of data transformation methods written by myself
import mysql_analyzer
import mysql.connector
import pandas as pd
from mysql.connector.errors import Error
# Dat is a module consisting of data transformation methods written by myself
import dat
from faker import Faker
import random
import json
from json import dumps
from json import loads
import time
from datetime import datetime
from kafka import KafkaProducer, TopicPartition, KafkaConsumer
import xlsxwriter
import openpyxl

working


## Creating Spark Session & Gathering Needed Pyspark Modules

In [3]:
# Spark bootstrap: version constants, submit arguments, then the PySpark imports.
import os
# Scala / Spark versions used to build the Kafka connector coordinate below.
SCALA_VERSION = '2.12'
SPARK_VERSION = '3.1.2'
import findspark
# Request the Spark-Kafka SQL connector package for the versions above.
# NOTE(review): presumably this must be set before the Spark JVM is launched — confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION} pyspark-shell'
# findspark.init() is called before `import pyspark`;
# presumably it makes the local Spark installation importable — TODO confirm.
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType, TimestampType, StructField
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window

### Creating the Spark session with the Kafka streaming package and a few session configs

In [4]:
# Build (or reuse) the local Spark session used by the streaming cells below.
#
# The Kafka connector coordinate is derived from SCALA_VERSION / SPARK_VERSION so
# that it agrees with the package requested through PYSPARK_SUBMIT_ARGS. It used
# to be hard-coded to 3.3.0 here, which contradicted SPARK_VERSION = '3.1.2'.
KAFKA_CONNECTOR_PACKAGE = f'org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION}'

spark = SparkSession.builder\
        .master('local[*]')\
        .config("spark.streaming.stopGracefullyOnShutdown", True) \
        .config('spark.jars.packages', KAFKA_CONNECTOR_PACKAGE) \
        .config("spark.sql.shuffle.partitions", 4) \
        .appName("myAppName")\
        .getOrCreate()

## Initiating mysql_analyzer class

In [5]:
# MySQL credentials are taken from the environment rather than written in the notebook.
mysql_user = os.environ['MYSQLSERVER_USER']
mysql_password = os.environ['MYSQLSERVER_PASS']

# Helper object from the local mysql_analyzer module; all SQL in the cells below
# goes through it. NOTE(review): 'sakila' is presumably the database name — confirm.
mysql_works = mysql_analyzer.mysql_profiler('localhost', mysql_user, mysql_password, 'sakila')

## Created CDC table for target mysql table (users_change_events)

In [6]:
# DDL for sakila.users_change_events; IF NOT EXISTS is part of the statement,
# so the table is only created when it is missing.
create_change_events_sql = (
    "CREATE TABLE IF NOT EXISTS "
    "sakila.users_change_events (log_id BIGINT AUTO_INCREMENT,"
    "  event_type      TEXT,"
    "  event_timestamp TIMESTAMP,"
    "  user_id         INT,"
    "  user_name       TEXT,"
    "  user_email      TEXT,"
    "  PRIMARY KEY (log_id))"
)
mysql_works.multiple_dataset_apply_mysql_insert(create_change_events_sql)

'INSERT INTO STATEMENT EXECUTED'

## Creating the trigger that inserts CDC transactions into the CDC table created in the previous step (for insert events on the customer table)

In [7]:
# Trigger that logs every INSERT on sakila.customer into sakila.users_change_events.
#
# Fix: the logged values are now read from the inserted row via NEW.<column>.
# The previous body used the bare names user_id / user_name / user_email, which are
# columns of the log table, not of sakila.customer (read elsewhere in this notebook
# as customer_id / first_name / email), so the new customer's data was not captured.
#
# NOTE: because of IF NOT EXISTS, a trigger left over from an earlier run keeps its
# old body; drop sakila.user_insert_capture once for this definition to take effect.
# NOTE(review): customer_id / first_name / email assume the standard sakila.customer
# layout — confirm against the actual table.
mysql_works.multiple_dataset_apply_mysql_insert("CREATE TRIGGER IF NOT EXISTS sakila.user_insert_capture AFTER INSERT ON sakila.customer FOR EACH ROW \
  BEGIN INSERT INTO sakila.users_change_events \
  (event_type, \
   event_timestamp, \
   user_id, \
   user_name, \
   user_email) \
 VALUES ( \
   'INSERT', \
   now(), \
   NEW.customer_id, \
   NEW.first_name, \
   NEW.email); \
  END;")

'INSERT INTO STATEMENT EXECUTED'

## Creating the table that holds the latest CDC timestamp
### The last CDC timestamp seen at the previous data write is read and saved into the table created below

In [8]:
# DDL for sakila.latest_cdc_timestamp; IF NOT EXISTS is part of the statement,
# so the table is only created when it is missing.
create_checkpoint_table_sql = (
    "CREATE TABLE IF NOT EXISTS "
    "sakila.latest_cdc_timestamp (log_id BIGINT AUTO_INCREMENT,"
    "  event_type      TEXT,"
    "  event_timestamp TIMESTAMP,"
    "  PRIMARY KEY (log_id))"
)
mysql_works.multiple_dataset_apply_mysql_insert(create_checkpoint_table_sql)

'INSERT INTO STATEMENT EXECUTED'

## Kafka script to write data from mysql to kafka topic by catching CDC on mysql table

### Producer Kafka

In [9]:
# Producer cell: checkpoint the newest CDC event, generate new customer rows, and
# publish the customers changed since the checkpoint to the "mysql_write" topic.
#
# Changes compared with the earlier version of this cell:
#   * Kafka SASL credentials are read from the environment variables
#     KAFKA_SASL_USERNAME / KAFKA_SASL_PASSWORD instead of being hard-coded.
#     The values that used to be written here should be rotated.
#   * The payload listed "first_name" twice, so record[2] was silently overwritten
#     by record[3]; "first_name" now carries record[2] only.
#   * The newest change event is fetched with a single query (type and timestamp
#     come from the same row) and an empty CDC table no longer raises IndexError.
#   * Empty-table (None) timestamps are handled explicitly before comparing.
#   * The producer is closed even when sending fails.

# Save the newest CDC event into latest_cdc_timestamp so that changes happening
# after this point can be detected.
latest_event_rows = mysql_works.multiple_dataset_apply_mysql_query(
    'SELECT event_type, event_timestamp FROM users_change_events '
    'ORDER BY event_timestamp DESC LIMIT 1')
if len(latest_event_rows) > 0:
    checkpoint_event_type = latest_event_rows[0][0]
    checkpoint_event_timestamp = latest_event_rows[0][1]
    mysql_works.multiple_dataset_apply_mysql_insert(
        "INSERT INTO sakila.latest_cdc_timestamp (event_type,event_timestamp) "
        f"VALUES ('{checkpoint_event_type}','{checkpoint_event_timestamp}')")

# Creating fake records to create data flow to the MySQL db
mysql_works.fake_record_creator_sakila()

# Getting CDC timestamps from the related tables to use in the check below
latest_saved_cdc_log = mysql_works.multiple_dataset_apply_mysql_query(
    'SELECT max(event_timestamp) FROM latest_cdc_timestamp')[0][0]

latest_real_cdc_log = mysql_works.multiple_dataset_apply_mysql_query(
    'SELECT max(event_timestamp) FROM users_change_events')[0][0]

# max() over an empty table is presumably returned as None by the helper, so the
# comparison is guarded: no saved checkpoint means every change counts as new.
cdc_occurred = latest_real_cdc_log is not None and (
    latest_saved_cdc_log is None or latest_saved_cdc_log < latest_real_cdc_log)

# If CDC occurred, instantiate KafkaProducer and save the changed rows into the Kafka topic
if cdc_occurred:
    if latest_saved_cdc_log is None:
        customer_query = 'SELECT * FROM customer'
    else:
        customer_query = f'SELECT * FROM customer WHERE last_update > "{latest_saved_cdc_log}"'

    producer = KafkaProducer(
        bootstrap_servers='settled-terrapin-12518-eu2-kafka.upstash.io:9092',
        sasl_mechanism='SCRAM-SHA-256',
        security_protocol='SASL_SSL',
        sasl_plain_username=os.environ['KAFKA_SASL_USERNAME'],
        sasl_plain_password=os.environ['KAFKA_SASL_PASSWORD'],
        # default=str lets json serialise values such as timestamps
        value_serializer = lambda m : dumps(m, default=str).encode("utf-8")
    )
    try:
        for record in mysql_works.multiple_dataset_apply_mysql_query(customer_query):
            # Columns are picked by position from SELECT * on customer.
            data_dict = {
                "customer_id": record[0],
                "store_id": record[1],
                "first_name": record[2],
                "email": record[4],
                "address_id": record[5],
                "last_update": record[8],
            }
            producer.send("mysql_write", data_dict)
    finally:
        producer.close()

1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479


### Saving data from the Kafka topic into a local CSV file (messages are appended to a DataFrame up to the latest offset)

In [10]:
# Consumer cell: read partition 0 of the topic from its first offset up to the end
# offset observed at start-up, and write the messages to stream_df.csv.
#
# Changes compared with the earlier version of this cell:
#   * Kafka SASL credentials are read from the environment variables
#     KAFKA_SASL_USERNAME / KAFKA_SASL_PASSWORD instead of being hard-coded.
#   * The message at offset lastOffset - 1 is now stored before the loop stops;
#     previously the loop broke first and the last message was dropped.
#   * An empty partition no longer blocks forever waiting for a message.
#   * Records are collected in a list and turned into a DataFrame once, instead
#     of calling pd.concat for every message.
#   * The consumer is closed even when reading fails.
topic = 'mysql_write'
tp = TopicPartition(topic,0)

consumer = KafkaConsumer(
    bootstrap_servers='settled-terrapin-12518-eu2-kafka.upstash.io:9092',
    sasl_mechanism='SCRAM-SHA-256',
    security_protocol='SASL_SSL',
    sasl_plain_username=os.environ['KAFKA_SASL_USERNAME'],
    sasl_plain_password=os.environ['KAFKA_SASL_PASSWORD'],
    auto_offset_reset='earliest',
    value_deserializer=lambda x: loads(x.decode('utf-8'))
)

records = []
try:
    consumer.assign([tp])

    # Offset one past the last message currently in the partition
    consumer.seek_to_end(tp)
    lastOffset = consumer.position(tp)

    # Rewind and remember where reading starts
    consumer.seek_to_beginning(tp)
    firstOffset = consumer.position(tp)

    # Only iterate when there is something to read; iterating an empty partition
    # would wait indefinitely for a message.
    if lastOffset > firstOffset:
        for message in consumer:
            records.append(message.value)
            # Stop after the last message that existed when the cell started.
            if message.offset >= lastOffset - 1:
                break
finally:
    consumer.close()

# One row per message; max_level=0 keeps nested values unflattened.
df = pd.json_normalize(records, max_level=0)

df.to_csv('stream_df.csv', index=False)

### Read saved csv as spark stream source

In [11]:
## Source Schema
# Five nullable string columns followed by one nullable timestamp column.
string_column_names = ["customer_id", "store_id", "first_name", "email", "address_id"]

schema_fields = [
    StructField(column_name, StringType(), True)
    for column_name in string_column_names
]
schema_fields.append(StructField("latest_update", TimestampType(), True))

df_schema = StructType(schema_fields)

In [12]:
# Folder handed to the CSV stream source.
csv_source_dir = "C:/Users/Lenovo/Desktop/exam_eti/containerized_tool/data_analysis_tool/src/playground_notebooks"

# Comma-separated input parsed with the explicit schema defined above.
csv_stream_reader = spark.readStream.option("sep", ",").schema(df_schema)
csvDF = csv_stream_reader.csv(csv_source_dir)

In [13]:
# Print the column names and types of the streaming DataFrame as a sanity check.
csvDF.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- address_id: string (nullable = true)
 |-- latest_update: timestamp (nullable = true)



### Aggregation on stream data (counting customers per store_id within 30-second tumbling windows)

In [14]:
# 30-second window over the latest_update timestamp.
update_window = F.window(csvDF.latest_update, "30 seconds")

# Watermark of 1 minute on the same timestamp column.
watermarked_stream = csvDF.withWatermark("latest_update", "1 minutes")

# Row count per (window, store_id) pair.
windowedCounts = watermarked_stream.groupBy(update_window, csvDF.store_id).count()

In [15]:
# Print the schema of the aggregated stream (window struct, store_id, count).
windowedCounts.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- store_id: string (nullable = true)
 |-- count: long (nullable = false)



### Creating a writeStream query so its result can be queried with Spark SQL in the next step

In [16]:
# Writer for the aggregated stream: memory sink, complete output mode, registered
# under the query name "store_id_agg".
memory_sink_writer = (
    windowedCounts.writeStream
    .queryName("store_id_agg")
    .outputMode("complete")
    .format("memory")
)

# Starting the writer returns the StreamingQuery handle, shown as the cell output.
memory_sink_writer.start()

<pyspark.sql.streaming.query.StreamingQuery at 0x2b51d3bad50>

In [18]:
# Query the table named after the streaming query ("store_id_agg") with Spark SQL;
# show() prints only the top 20 rows.
spark.sql("select * from store_id_agg").show()

+--------------------+--------+-----+
|              window|store_id|count|
+--------------------+--------+-----+
|{2024-06-28 15:11...|       1|    5|
|{2024-06-28 15:15...|       1|    4|
|{2024-06-28 15:38...|       1|    4|
|{2024-06-28 15:39...|       1|    4|
|{2024-06-28 15:19...|       1|    5|
|{2024-06-28 16:10...|       2|    2|
|{2024-06-28 15:43...|       1|    5|
|{2024-07-01 13:35...|       2|    2|
|{2024-06-28 14:34...|       1|    2|
|{2024-07-01 12:53...|       1|    2|
|{2024-07-01 12:48...|       1|    1|
|{2024-07-01 13:24...|       2|    1|
|{2024-07-01 12:13...|       1|    2|
|{2024-07-01 12:59...|       2|    3|
|{2024-07-01 13:38...|       1|    2|
|{2024-07-01 13:51...|       2|    2|
|{2024-06-28 15:20...|       1|    4|
|{2024-06-28 15:19...|       1|    5|
|{2024-07-01 12:50...|       2|    2|
|{2024-06-28 15:36...|       1|    4|
+--------------------+--------+-----+
only showing top 20 rows

