In [1]:
# Execution environment selector; the values checked by the rest of the
# notebook are 'on_cloud' and 'on_premise'.
run_env = "on_cloud"

In [2]:
#Importing spark session and spark configurations
from pyspark.sql import SparkSession
def _get_spark():
  """Return the SparkSession used by this notebook with its SQL settings applied.

  Gets (or creates) the session named "project_customer_360_ops_report" and
  sets the session-level options listed below on it.

  Returns:
    The configured SparkSession.
  """
  session = SparkSession.builder.appName("project_customer_360_ops_report").getOrCreate()
  session_settings = {
      # Needed by the final partitioned write, which uses mode("overwrite").
      "spark.sql.sources.partitionOverwriteMode": "dynamic",
      "spark.sql.parquet.binaryAsString": "true",
      "spark.sql.shuffle.partitions": 200,
      # 256 MiB expressed in bytes.
      "spark.sql.files.maxPartitionBytes": 1024*1024*256,
  }
  for setting_name, setting_value in session_settings.items():
    session.conf.set(setting_name, setting_value)
  return session

# partitionOverwriteMode is already set inside _get_spark(), so it is not
# repeated here.
spark = _get_spark()

In [3]:
#importing required spark functions
from pyspark.sql import functions as f
from pyspark.sql.functions import expr, col

In [4]:
#Reading the latest metadata table and filtering the unwanted datasets /layers
# Only 'on_cloud' and 'on_premise' are valid run_env values (the write step at
# the end of the notebook recognises just these two). Reject anything else
# here, before any data is read, instead of silently falling back to the
# on-premise location.
if run_env == 'on_cloud':
    mtdt_tbl = spark.read.parquet('/mnt/customer360-blob-output/C360/UTILITIES/metadata_table')
elif run_env == 'on_premise':
    mtdt_tbl = spark.read.parquet('/projects/prod/c360/data/UTILITIES/metadata_table')
else:
    raise ValueError(
        "Please provide valid run_env value ('on_cloud' or 'on_premise'), got: {!r}".format(run_env)
    )
# Exclude datasets that must not appear in the ops report.
ops_report = mtdt_tbl.where("table_name not in ('int_l0_streaming_vimmi_table')")

In [5]:
#Keep a single metadata row per dataset: the one ranked first when rows are
#ordered by updated_on (newest first), then target_max_data_load_date (newest first).
latest_first_rank = f.expr("row_number() over (partition by table_name order by updated_on desc,target_max_data_load_date desc)")
ranked_report = ops_report.withColumn("rn", latest_first_rank)
# The helper rank column is dropped once the top-ranked row has been selected.
ops_report = ranked_report.where("rn = 1").drop("rn")

In [6]:
#Derive the domain name and feature layer from the '/'-separated dataset path,
#and give the name / refresh-timestamp columns their report-facing names.
# NOTE(review): positions 4 and 5 assume every table_path has the same number
# of leading directories; the cloud and on-premise roots used elsewhere in
# this notebook differ in depth, so confirm this holds for on-premise paths.
path_segments = f.split(f.col("table_path"), '/')
ops_report = (
    ops_report
    .withColumn("Domain_Name", path_segments.getItem(4))
    .withColumn("Feature_Layer", path_segments.getItem(5))
    .withColumnRenamed("table_name", "Dataset_Name")
    .withColumnRenamed("updated_on", "Last_Data_Refresh_Date")
)

In [7]:
#Calculate all the relevant columns of ops report.
# Each derived column is built as a named expression first and attached to the
# frame in the chain at the bottom.
report_date = f.current_date()
feature_layer_lc = f.lower(f.col("Feature_Layer"))
is_l4_layer = feature_layer_lc == 'l4_features'
latest_partition = f.col("Latest_Data_Partition_Available")
refresh_freq = f.col("Data_Refresh_Freq")
days_since_refresh = f.col("Days_Since_Last_Refresh")
not_refreshed_today = f.col("Is_Data_Refresh_Today") == 'N'

# Refresh frequency: fixed per layer for l1/l2/l3; for l4 it is taken from the
# target_layer name. Anything else is labelled "No_frequency_defined".
refresh_freq_expr = (
    f.when(feature_layer_lc == 'l1_features', "daily")
     .when(feature_layer_lc == 'l2_features', "weekly")
     .when(feature_layer_lc == 'l3_features', "monthly")
     .when(is_l4_layer & f.col("target_layer").like("%l4_daily%"), "daily")
     .when(is_l4_layer & f.col("target_layer").like("%l4_weekly%"), "weekly")
     .when(is_l4_layer & f.col("target_layer").like("%l4_monthly%"), "monthly")
     .otherwise("No_frequency_defined")
)

# Latency in days. For weekly/monthly data it is measured from the partition
# date plus one period minus one day. There is no otherwise(), so datasets
# without a defined frequency get NULL.
data_latency_expr = (
    f.when(refresh_freq == 'daily', f.datediff(report_date, latest_partition))
     .when(refresh_freq == 'weekly', f.datediff(report_date, f.date_sub(f.date_add(latest_partition, 7), 1)))
     .when(refresh_freq == 'monthly', f.datediff(report_date, f.date_sub(f.add_months(latest_partition, 1), 1)))
)

# Compare at date granularity. Without to_date(), a Last_Data_Refresh_Date that
# carries a time-of-day would not equal current_date() even for a refresh done
# today, which would disagree with Days_Since_Last_Refresh (datediff already
# works on dates).
refreshed_today_expr = f.when(f.to_date(f.col("Last_Data_Refresh_Date")) == report_date, "Y").otherwise("N")

# A dataset needs supervision when it has gone longer than its refresh period
# (1 / 7 / 31 days) without a refresh.
need_supervision_expr = (
    f.when((refresh_freq == 'daily') & (days_since_refresh > 1) & not_refreshed_today, "Y")
     .when((refresh_freq == 'weekly') & (days_since_refresh > 7) & not_refreshed_today, "Y")
     .when((refresh_freq == 'monthly') & (days_since_refresh > 31) & not_refreshed_today, "Y")
     .otherwise("N")
)

# Order matters: each expression refers to columns created or renamed by an
# earlier step of this chain.
ops_report = (
    ops_report
    .withColumn("Days_Since_Last_Refresh", f.datediff(report_date, f.col("Last_Data_Refresh_Date")))
    .withColumnRenamed("target_max_data_load_date", "Latest_Data_Partition_Available")
    .withColumn("Data_Refresh_Freq", refresh_freq_expr)
    .withColumn("Data_Latency", data_latency_expr)
    .withColumn("Is_Data_Refresh_Today", refreshed_today_expr)
    .withColumnRenamed("table_path", "Dataset_Path")
    .withColumn("Need_Supervision", need_supervision_expr)
    .withColumn("ops_report_updated_date", report_date)
)



In [8]:
# Keep only the report columns, in their published order.
report_columns = [
    "Domain_Name",
    "Feature_Layer",
    "Dataset_Name",
    "Last_Data_Refresh_Date",
    "Days_Since_Last_Refresh",
    "Latest_Data_Partition_Available",
    "Data_Latency",
    "Data_Refresh_Freq",
    "Is_Data_Refresh_Today",
    "Need_Supervision",
    "Dataset_Path",
    "ops_report_updated_date",
]
ops_report = ops_report.select(*report_columns)

# Output location per environment.
report_output_paths = {
    'on_cloud': "/mnt/customer360-blob-output/C360/UTILITIES/c360_ops_report/",
    'on_premise': "/projects/prod/c360/data/UTILITIES/c360_ops_report/",
}

# Fail loudly on an unknown environment: printing a message and finishing
# normally would leave the report unwritten while the run still looks successful.
if run_env not in report_output_paths:
  raise ValueError(
      "Please provide valid run_env value ('on_cloud' or 'on_premise'), got: {!r}".format(run_env)
  )

# Written as parquet, partitioned by the report date, in overwrite mode.
ops_report.write.partitionBy("ops_report_updated_date").format("parquet").mode("overwrite").save(report_output_paths[run_env])