# This notebook creates cleaned and normalized Silver-layer tables from the Bronze-layer tables

## Importing Packages and Config Files

In [0]:
import logging
import time
import datetime
import json
import re
import requests
from pyspark.sql import SQLContext
from collections import defaultdict
from pyspark.sql.functions import countDistinct
from pyspark.sql.types import IntegerType,BooleanType,DateType,StringType,VarcharType,LongType,MapType,ArrayType,StructType,TimestampType,TimestampNTZType,DataType
from pyspark.sql.functions import *
from pyspark.sql import functions as F, Window,udf
from pyspark.sql.functions import concat,when,md5
from pyspark.sql.functions import explode, col
from pyspark.sql import Row
import pyspark.sql.types as T
from pyspark.sql import DataFrame

In [0]:
# Getting Variables and Dataframes from config notebook
%run /Workspace/Users/cihangiray.oner@gmail.com/congif.py

### Reading Tables from Bronze Layer (Only Getting Needed Columns)

In [0]:
def check_if_table_exists(read_table_name,read_schema_name):
    """
    Report whether a table is registered under the hive_metastore catalog.

    Args:
        read_table_name (str): Name of the table to look up.
        read_schema_name (str): Name of the schema the table is expected in.

    Returns:
        bool: Result of ``spark.catalog.tableExists`` for
            ``hive_metastore.<read_schema_name>.<read_table_name>``.
    """
    # NOTE(review): a later cell defines another function with this same name
    # but a (schema, table) argument order; once that cell has run, this
    # definition is shadowed.
    qualified_name = f"hive_metastore.{read_schema_name}.{read_table_name}"
    return spark.catalog.tableExists(qualified_name)


def read_table(dataframe_name,schema_name,table_name,columns_to_read,filter_value = "N/A",fill_value = "N/A"):
    """
    Load selected columns of a hive_metastore table and apply a null fill.

    Args:
        dataframe_name: Value handed back unchanged when the table is missing.
        schema_name (str): Schema to read the table from.
        table_name (str): Table to read data from.
        columns_to_read (list): Column names passed to ``select``.
        filter_value (str): Accepted for interface compatibility; this
            function does not use it.
        fill_value (str): Formatted as a string and passed to ``na.fill``.

    Returns:
        The selected and filled DataFrame when the table exists; otherwise
        ``dataframe_name`` exactly as it was passed in (after a not-found
        message has been printed).
    """
    # Guard clause: nothing to read, so report it and hand the caller's value back.
    if not check_if_table_exists(table_name,schema_name):
        print(f"Read table not found hive_metastore.{schema_name}.{table_name}")
        return dataframe_name

    source_table = spark.table(f"hive_metastore.{schema_name}.{table_name}")
    selected_columns = source_table.select(columns_to_read)
    return selected_columns.na.fill(f'{fill_value}')

In [0]:
# Load every configured bronze table into a global variable named after its key.
# No temporary DataFrame variable is used here on purpose: any extra global
# DataFrame would be picked up by the later scan over globals().
for key,value in table_names_list.items():
    print(key)
    # Read only the columns configured for this key
    globals()[key] = read_table(key,first_layer_schema_name,f"{key}_bronze_layer_managed_table",silver_layer_col_names[key])
    # new_user and login tables are additionally de-duplicated on USER_ID right after the read
    if key in ("new_user_df", "login_df"):
        globals()[key] = globals()[key].dropDuplicates(['USER_ID'])

in_app_purchase_df
login_df
multiplayer_battle_df
new_user_df
session_started_df
ship_transaction_df


### Creating NEW_USER Dim Table (Deduplicating USER_ID)

### Creating USER_ID Dim Table (Collecting Unique User IDs from the Session Started Table for Users That Do Not Exist in the New User Table)

In [0]:
# silver_layer_col_names_1 (from the config notebook) maps a target dataframe name
# to the list of Session Started columns it should be built from.
for key,value in silver_layer_col_names_1.items():
    # Step 1: project the needed session columns and prefix the join columns with
    # SESSION_ so they cannot clash with new_user_df's column names during the join.
    globals()[key] = (
        session_started_df.select(value)
        .withColumnRenamed('USER_ID','SESSION_USER_ID')
        .withColumnRenamed('Join_Key','SESSION_Join_Key')
    )
    # Step 2: keep only users absent from new_user_df (left anti join), reduce to one
    # row per user, then restore the original column names for the write step.
    globals()[key] = (
        globals()[key]
        .join(new_user_df,globals()[key].SESSION_USER_ID == new_user_df.USER_ID,"left_anti")
        .select('SESSION_USER_ID','USER_IS_SPENDER','USER_GEO_LOCATION','SESSION_Join_Key')
        .dropDuplicates(['SESSION_USER_ID'])
        .withColumnRenamed('SESSION_USER_ID','USER_ID')
        .withColumnRenamed('SESSION_Join_Key','Join_Key')
    )
    


## Writing Dataframes into Hive Metastore Silver Layer Schema

In [0]:
def create_schema(second_layer_schema_name,location_name):
    """
    Create the schema inside the given catalog unless it is already there.

    Args:
        second_layer_schema_name (str): Name of the schema to create.
        location_name (str): Catalog name the schema is qualified with.

    Returns:
        DataFrame: Whatever ``spark.sql`` returns for the
            ``CREATE SCHEMA IF NOT EXISTS`` statement.
    """
    ddl_statement = f"CREATE SCHEMA IF NOT EXISTS  {location_name}.{second_layer_schema_name}"
    return spark.sql(ddl_statement)


def check_if_table_exists(second_layer_schema_name, table_name):
    """
    Report whether the silver-layer managed table for ``table_name`` exists.

    The lookup name is built by appending ``_silver_layer_managed_table`` to
    ``table_name`` under ``hive_metastore.<second_layer_schema_name>``.

    Args:
        second_layer_schema_name (str): Schema expected to hold the table.
        table_name (str): Base table name, without the silver-layer suffix.

    Returns:
        bool: Result of ``spark.catalog.tableExists`` for the full name.
    """
    # NOTE(review): this redefines the check_if_table_exists declared in an
    # earlier cell, with the argument order swapped to (schema, table).
    silver_table = f"hive_metastore.{second_layer_schema_name}.{table_name}_silver_layer_managed_table"
    return spark.catalog.tableExists(silver_table)

def write_to_managed_table(df, table_name, second_layer_schema_name, location_name, mode = "overwrite"):
    """
    Save a DataFrame as a Delta managed table in the silver-layer schema.

    The schema is created first if needed. When the target table already
    exists the write uses ``mode``; when it does not exist yet, the table is
    created without an explicit mode, so ``mode`` has no effect on that path.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to write.
        table_name (str): Base name of the target table; the suffix
            ``_silver_layer_managed_table`` is appended.
        second_layer_schema_name (str): Schema of the target table.
        location_name (str): Catalog name passed to ``create_schema``.
        mode (str, optional): Write mode used when the table already exists.
            Any value other than "overwrite" is reported as an append.
    """
    # Make sure the schema is there before touching the table
    create_schema(second_layer_schema_name,location_name)

    # NOTE(review): the table path is always under hive_metastore, while the
    # schema is created under location_name; confirm the two are meant to match.
    target = f"hive_metastore.{second_layer_schema_name}.{table_name}_silver_layer_managed_table"

    if not check_if_table_exists(second_layer_schema_name, table_name):
        # First write: create the table using the writer's default save mode
        print(f"Writing to managed table {target}")
        df.write.format("delta").option("delta.columnMapping.mode", "name").saveAsTable(target)
        return

    print(f"Table exists on {target}")
    if mode == "overwrite":
        print(f"Overwriting all transactions on managed table {target}")
    else:
        print(f"Appending all transactions on managed table {target}")
    # Both cases issue the same write; only the logged message differs
    df.write.format("delta").option("delta.columnMapping.mode", "name").mode(mode).saveAsTable(target)



In [0]:
# Getting active dataframe names from ongoing session (One-time cache)
def list_dataframes():
    """
    Collect the names of all global variables currently bound to a DataFrame.

    Returns:
        list: Variable names, in the iteration order of ``globals()``.
    """
    dataframe_names = []
    for name, candidate in globals().items():
        if isinstance(candidate, DataFrame):
            dataframe_names.append(name)
    return dataframe_names


# Write every DataFrame found in the global namespace to the 2nd (silver) layer,
# using the variable name as the base table name.
for df_name in list_dataframes():
    print(df_name)
    write_to_managed_table(
        df=globals()[df_name],
        table_name=df_name,
        second_layer_schema_name=second_layer_schema_name,
        location_name=location_name,
    )

