In [0]:
%run "../Configuration/config file"

In [0]:
from pyspark.sql.types import StructField, StructType, LongType, IntegerType, StringType, DoubleType, TimestampType
from pyspark.sql import functions as f

class Bronze():
    """Start the streaming ingestions that load raw data into bronze Delta tables.

    Three streams are defined: two Auto Loader (``cloudFiles``) CSV streams and
    one Kafka stream subscribed to several topics. Every stream appends to a
    ``{catalog}.{schema}.bz_*`` table and keeps its own checkpoint directory
    under ``self.checkpoint_dir``.

    Relies on two names that are not defined in this cell: the notebook-global
    ``spark`` session and ``ConfigModule`` (presumably provided by the
    ``%run`` of the configuration notebook).
    """

    def __init__(self):
        """Copy paths, target namespace and Kafka credentials from ConfigModule."""
        obj_conf = ConfigModule()
        self.checkpoint_dir = f"{obj_conf.base_checkpoint_path}/checkpoint"
        self.raw_file_dir = f"{obj_conf.base_file_path}/raw"
        self.catalog = obj_conf.environment
        self.schema = obj_conf.db_name
        self.bootstrap_server = obj_conf.kafka_bootstrap_servers
        self.username = obj_conf.kafka_username
        self.password = obj_conf.kafka_password

    def _read_csv_stream(self, schema, sub_dir):
        """Return an Auto Loader CSV stream over ``{raw_file_dir}/{sub_dir}``."""
        # 'cloudFiles.maxFilesPerTrigger' is the documented Auto Loader key for
        # capping files per micro-batch; the previously used key
        # 'maxFilePerTrigger' is not a documented option, so the limit of one
        # file per batch was not being applied.
        return (
            spark.readStream.format('cloudFiles')
            .option('cloudFiles.format', 'csv')
            .option('cloudFiles.maxFilesPerTrigger', 1)
            .option('header', True)
            .schema(schema)
            .load(f'{self.raw_file_dir}/{sub_dir}')
        )

    @staticmethod
    def _with_audit_columns(df):
        """Append the ``load_time`` and ``source_file`` audit columns to ``df``."""
        return (
            df.withColumn('load_time', f.current_timestamp())
            .withColumn('source_file', f.input_file_name())
        )

    def _start_query(self, df, query_name, checkpoint_name, table_name,
                     once, processingTime):
        """Start an append-mode Delta stream into ``{catalog}.{schema}.{table_name}``.

        Args:
            df: Streaming DataFrame to write.
            query_name: Name given to the streaming query.
            checkpoint_name: Directory name created under ``self.checkpoint_dir``.
            table_name: Unqualified name of the target table.
            once: If true, use the ``availableNow`` trigger; otherwise use a
                ``processingTime`` trigger.
            processingTime: Trigger interval used when ``once`` is false.

        Returns:
            The object returned by ``toTable`` (the started streaming query).
        """
        writer = (
            df.writeStream.queryName(query_name)
            .format('delta')
            .option('checkpointLocation', f'{self.checkpoint_dir}/{checkpoint_name}')
            .outputMode('append')
        )
        if once:
            writer = writer.trigger(availableNow=True)
        else:
            writer = writer.trigger(processingTime=processingTime)
        return writer.toTable(f'{self.catalog}.{self.schema}.{table_name}')

    def get_registered_user(self, once=True, processingTime='10 seconds'):
        """Start the stream loading ``reg_user`` CSV files into ``bz_registered_users``.

        Args:
            once: If true, use the ``availableNow`` trigger; otherwise run with
                a ``processingTime`` trigger.
            processingTime: Trigger interval used when ``once`` is false.

        Returns:
            The started streaming query. Callers that ignore the return value
            keep working as before.
        """
        print('Consuming registered user details...', end='')
        self.reg_user_schema = StructType([
            StructField('user_id', LongType()),
            StructField('device_id', LongType()),
            StructField('mac_address', StringType()),
            StructField('registration_timestamp', StringType()),
        ])
        df = self._read_csv_stream(self.reg_user_schema, 'reg_user')
        query = self._start_query(
            self._with_audit_columns(df),
            'bz_registered_user_stream',
            'cp_registered_user',
            'bz_registered_users',
            once,
            processingTime,
        )
        # Printed once the query has been started; nothing here waits for it.
        print('Done.')
        return query

    def get_gym_logins(self, once=True, processingTime='10 seconds'):
        """Start the stream loading ``gym_login`` CSV files into ``bz_gym_logins``.

        Args:
            once: If true, use the ``availableNow`` trigger; otherwise run with
                a ``processingTime`` trigger.
            processingTime: Trigger interval used when ``once`` is false.

        Returns:
            The started streaming query.
        """
        print('Consuming gym login details...', end='')
        self.gym_login_schema = StructType([
            StructField('mac_address', StringType()),
            StructField('gym', LongType()),
            StructField('login', StringType()),
            StructField('logout', StringType()),
        ])
        df = self._read_csv_stream(self.gym_login_schema, 'gym_login')
        query = self._start_query(
            self._with_audit_columns(df),
            'bz_gym_logins_stream',
            'cp_gym_logins',
            'bz_gym_logins',
            once,
            processingTime,
        )
        print('Done.')
        return query

    def get_kafka_multiplex(self, once=True, processingTime='10 seconds'):
        """Start the stream loading the Kafka topics into ``bz_kafka_multiplex``.

        Records are read from the ``user_info``, ``bpm`` and ``workout`` topics,
        key and value are cast to strings, and each record is left-joined to
        the ``date_lookup`` table on the calendar date of its Kafka timestamp
        to pick up ``date`` and ``week_part``.

        Args:
            once: If true, use the ``availableNow`` trigger; otherwise run with
                a ``processingTime`` trigger.
            processingTime: Trigger interval used when ``once`` is false.

        Returns:
            The started streaming query.
        """
        print('Consuming kafka multiplex details...', end='')
        topic = "user_info, bpm, workout"
        # 'maxOffsetsPerTrigger' is the Kafka source's documented rate-limit
        # key; the previously used 'maxOffsetPerTrigger' is not a documented
        # option, so the limit was not being applied.
        kafka_df = (
            spark.readStream.format('kafka')
            .option('kafka.bootstrap.servers', self.bootstrap_server)
            .option('kafka.security.protocol', 'SASL_SSL')
            .option('kafka.sasl.mechanism', 'PLAIN')
            .option('kafka.sasl.jaas.config', f"""org.apache.kafka.common.security.plain.PlainLoginModule required username="{self.username}" password="{self.password}";""")
            .option('maxOffsetsPerTrigger', 1)
            .option('subscribe', topic)
            .option('startingOffsets', 'earliest')
            .load()
        )
        # The timestamp is cast to bigint so that from_unixtime() can be
        # applied to it in the join condition below.
        kafka_df = kafka_df.select(
            f.col('key').cast('string'),
            f.col('value').cast('string'),
            'topic',
            'partition',
            'offset',
            f.col('timestamp').cast('bigint'),
        )
        date_df = spark.table(f'{self.catalog}.{self.schema}.date_lookup').select('date', 'week_part')
        joined_df = kafka_df.join(
            date_df,
            f.to_date(f.from_unixtime(kafka_df.timestamp)) == date_df.date,
            'left',
        )
        # NOTE(review): input_file_name() is applied to a Kafka-sourced stream
        # here; kept so the target table schema is unchanged, but confirm the
        # 'source_file' column carries anything useful for this table.
        query = self._start_query(
            self._with_audit_columns(joined_df),
            'bl_kafka_multiplex_stream',
            'cp_kafka_multiplex_logins',
            'bz_kafka_multiplex',
            once,
            processingTime,
        )
        print('Done.')
        return query

    def launcher(self, once=True, processingtime='5 seconds'):
        """Start all three bronze streams and block until each one terminates.

        Args:
            once: Forwarded to every ``get_*`` method.
            processingtime: Forwarded to every ``get_*`` method as its
                ``processingTime`` argument.
        """
        started = [
            self.get_registered_user(once, processingtime),
            self.get_gym_logins(once, processingtime),
            self.get_kafka_multiplex(once, processingtime),
        ]
        # Wait on the queries started above rather than on
        # spark.streams.active: a query that has already stopped is no longer
        # listed there, so its failure would go unnoticed, and unrelated
        # streams running in the same session would be waited on as well.
        for query in started:
            query.awaitTermination()

#obj = Bronze()
#obj.launcher()


In [0]:
class BronzeTestSuite():
    """Row-count checks against the bronze tables.

    Uses the notebook-global ``spark`` session and ``ConfigModule``, neither
    of which is defined in this cell.
    """

    def __init__(self):
        """Read the target catalog and schema names from ConfigModule."""
        config = ConfigModule()
        self.catalog = config.environment
        self.schema = config.db_name

    def assert_fn(self, table_name, filter, expected_count):
        """Assert that ``table_name`` holds ``expected_count`` rows matching ``filter``.

        Args:
            table_name: Unqualified table name inside ``{catalog}.{schema}``.
            filter: SQL predicate placed verbatim after ``where``.
            expected_count: Row count the query must return.

        Raises:
            AssertionError: If the counted rows differ from ``expected_count``.
        """
        print(f'Testing bronze layer - {self.catalog}.{self.schema}.{table_name} table...', end='')
        count_sql = f"select count(*) from {self.catalog}.{self.schema}.{table_name} where {filter}"
        result_rows = spark.sql(count_sql).collect()
        # count(*) yields a single row with a single column.
        actual_count = result_rows[0][0]
        assert actual_count==expected_count, f"Test case failed, actual count is {actual_count}"
        print('Test Passed.')

    def testcases(self):
        """Run every bronze row-count check in a fixed order."""
        # (table name, SQL predicate, expected row count)
        expectations = (
            ('bz_registered_users', 'true', 5),
            ('bz_gym_logins', 'true', 8),
            ('bz_kafka_multiplex', "topic='user_info'", 4),
            ('bz_kafka_multiplex', "topic='bpm'", 5),
            ('bz_kafka_multiplex', "topic='workout'", 2),
        )
        for table_name, predicate, expected_count in expectations:
            self.assert_fn(table_name, predicate, expected_count)

#obj = BronzeTestSuite()
#obj.testcases()