# This Notebook Creates the Bronze Layer Tables (Ingestion Layer)

## Importing Packages and Config Files

In [0]:
pip install faker

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import pandas as pd
import re
import os
import glob
import logging
import requests
import time
import datetime
import numpy as np
# import pyspark as spark
from pyspark.sql import DataFrame
from pyspark.sql import SQLContext
from pyspark.sql.functions import countDistinct,md5,concat,when,col
from pyspark.sql.types import IntegerType,BooleanType,DateType,StringType,VarcharType,LongType
from pyspark.sql.functions import *
import pandas as pd
import dlt
from pyspark.sql import functions as F
import csv 
from pyspark import SparkContext,SQLContext,SparkConf,StorageLevel

In [0]:
# Getting Variables and Dataframes from config notebook
%run /Workspace/Users/cihangiray.oner@gmail.com/congif.py

## Creating the DataFrames That Hold the Raw Data

In [0]:
from faker import Faker
from faker.providers import DynamicProvider
def create_dataframe(table_name,size=1000):
    '''
    Build a Spark DataFrame filled with fake values for one table definition.

    Every column is populated by the first rule that matches its name or its
    declared data type (see the if/elif chain below). All name and type checks
    are substring matches, so e.g. any column whose name contains "ID" is
    treated as an ID column.

    Args:
        table_name (dict): Mapping of column name -> data type string.
            Despite the parameter name, this is the column definition of the
            table to replicate, not the table's name.
        size (int, optional): Number of rows to generate. Defaults to 1000.

    Returns:
        pyspark.sql.DataFrame: One column per key of ``table_name`` and
        ``size`` rows; the column types are inferred by Spark from the values.
    '''
    fake = Faker()

    # SESSION_ID values are drawn through fake.unique, which can only succeed
    # while unused values remain in the pool. The original 1..1099 pool is kept
    # for sizes up to the default; larger requests get a pool of twice the row
    # count so the unique draws cannot run out of candidates.
    uid_upper_bound = 1100 if size <= 1000 else 2 * size + 1

    # Unique ID dataset provider
    UID_provider = DynamicProvider(
        provider_name="UID_provider",
        elements=list(range(1, uid_upper_bound)),
    )
    fake.add_provider(UID_provider)

    # Random ID dataset provider (values 0..9, repetitions allowed)
    ID_provider = DynamicProvider(
        provider_name="ID_provider",
        elements=list(range(0, 10)),
    )
    fake.add_provider(ID_provider)

    # ship_name dataset provider
    ship_name_provider = DynamicProvider(
        provider_name="ship_name_provider",
        elements=['frigates','frigates_1','frigates_2','frigates_3','frigates_4']
    )
    fake.add_provider(ship_name_provider)

    # Append values into a dict by conditions over column names or data types.
    # The order of the checks matters: the first matching rule wins.
    col_dataframe_names = {}

    for col_name, data_type in table_name.items():
        if "TIMESTAMP" in data_type:
            # NOTE(review): TIMESTAMP columns are filled with dates (no time
            # part) -- confirm this is what the downstream layers expect.
            col_dataframe_names[col_name] = [fake.date_this_month() for _ in range(size)]
        elif "SESSION_ID" in col_name:
            col_dataframe_names[col_name] = [fake.unique.UID_provider() for _ in range(size)]
        elif "USER_ID" in col_name:
            col_dataframe_names[col_name] = [fake.ID_provider() for _ in range(size)]
        elif "SHIP_NAME" in col_name:
            col_dataframe_names[col_name] = [fake.ship_name_provider() for _ in range(size)]
        elif "BOOLEAN" in data_type or "IS" in col_name:
            col_dataframe_names[col_name] = [fake.boolean() for _ in range(size)]
        elif "NUMBER" in data_type or "ID" in col_name:
            col_dataframe_names[col_name] = [fake.ID_provider() for _ in range(size)]
        elif "COUNTRY" in col_name:
            col_dataframe_names[col_name] = [fake.country() for _ in range(size)]
        else:
            # Fallback for every other column: a random color name
            col_dataframe_names[col_name] = [fake.color_name() for _ in range(size)]

    # Getting Column Names
    columns = list(col_dataframe_names)
    # Transposing the per-column value lists into one list per row
    data = [list(row) for row in zip(*col_dataframe_names.values())]
    # Putting Into a DataFrame
    return spark.createDataFrame(data, columns)

## Concat Columns Function

In [0]:
def concat_columns(dataframe_name,concat_columns,col_name):
    """
    Add a column holding the concatenation of the given columns.

    Args:
        dataframe_name (pyspark.sql.DataFrame): The DataFrame to extend.
        concat_columns (list): Names of the columns to concatenate, in order.
        col_name (str): Name of the column that receives the concatenated value.

    Returns:
        pyspark.sql.DataFrame: The input DataFrame with ``col_name`` added
        (or replaced, if a column of that name already exists).

    Example:
        df
        +---+------+
        | id| label|
        +---+------+
        |  1|Label1|
        |  2|Label2|
        +---+------+
        concat_columns = ["id","label"], col_name = "Join_key"

        Result:
        +---+------+--------+
        | id| label|Join_key|
        +---+------+--------+
        |  1|Label1| 1Label1|
        |  2|Label2| 2Label2|
        +---+------+--------+
    """
    # Build the combined column expression first, then attach it under col_name
    combined_column = concat(*concat_columns)
    return dataframe_name.withColumn(col_name, combined_column)



## Writing Dataframes into Hive Metastore Bronze Layer Schema

In [0]:
def create_schema(first_layer_schema_name,location_name):
    """
    Create the schema in the given catalog unless it already exists.

    Args:
        first_layer_schema_name (str): Name of the schema to create.
        location_name (str): Name of the catalog the schema is created in.

    Returns:
        DataFrame: The result of the ``CREATE SCHEMA IF NOT EXISTS`` statement
        as returned by ``spark.sql``.
    """
    create_statement = f"CREATE SCHEMA IF NOT EXISTS  {location_name}.{first_layer_schema_name}"
    return spark.sql(create_statement)


def _bronze_table_identifier(first_layer_schema_name, table_name, location_name="hive_metastore"):
    """Return the fully qualified name of the bronze-layer managed table."""
    return f"{location_name}.{first_layer_schema_name}.{table_name}_bronze_layer_managed_table"


def check_if_table_exists(first_layer_schema_name, table_name, location_name="hive_metastore"):
    """
    Checks if the bronze-layer managed table exists in the spark catalog.

    Args:
        first_layer_schema_name (str): The schema the table belongs to.
        table_name (str): The base name of the table to check (the
            ``_bronze_layer_managed_table`` suffix is added here).
        location_name (str, optional): Catalog that holds the schema.
            Defaults to "hive_metastore", the catalog this check used before
            the parameter was added.

    Returns:
        bool: True if the table exists, False otherwise.
    """
    return spark.catalog.tableExists(
        _bronze_table_identifier(first_layer_schema_name, table_name, location_name)
    )


def write_to_managed_table(df, table_name, first_layer_schema_name, location_name, mode = "overwrite"):
    """
    Writes a DataFrame to a managed Delta table in the bronze layer.

    The schema is created in ``location_name`` if needed, and the table is
    looked up and written in that same catalog (previously the table was
    always written to ``hive_metastore`` regardless of ``location_name``).

    If the table already exists, the write uses ``mode``. If it does not
    exist, the table is created with the writer's default mode.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to write to the table.
        table_name (str): The base name of the target table.
        first_layer_schema_name (str): The schema name of target table.
        location_name (str): Catalog name the schema and table are saved on.
        mode (str, optional): The write mode applied when the table already
            exists. Defaults to "overwrite".
    """
    # create schema if not exists
    create_schema(first_layer_schema_name, location_name)

    # Build the target name once so the check, the log lines and the write
    # can never point at different tables.
    full_table_name = _bronze_table_identifier(first_layer_schema_name, table_name, location_name)
    writer = df.write.format("delta").option("delta.columnMapping.mode", "name")

    # check if the table exists
    if check_if_table_exists(first_layer_schema_name, table_name, location_name):
        print(f"Table exists on {full_table_name}")
        if mode == "overwrite":
            print(f"Overwriting all transactions on managed table {full_table_name}")
        elif mode == "append":
            print(f"Appending all transactions on managed table {full_table_name}")
        else:
            # Any other mode is passed through to the writer unchanged
            print(f"Writing with mode '{mode}' on managed table {full_table_name}")
        writer = writer.mode(mode)
    else:
        print(f"Writing to managed table {full_table_name}")

    writer.saveAsTable(full_table_name)



In [0]:
# Build, enrich and persist one fake DataFrame per entry of table_names_list
# (table_names_list, first_layer_schema_name and location_name are not defined
# in this notebook -- presumably they come from the config notebook run above).
for key, value in table_names_list.items():
    print(key)
    # Generate the fake rows and add the SESSION_ID + USER_ID join key; the
    # result is bound to a notebook-level variable named after the table key.
    vars()[key] = concat_columns(
        create_dataframe(value), ["SESSION_ID","USER_ID"], "Join_key"
    )
    # Writing dataframes to 1st layer on defined schema and table
    write_to_managed_table(vars()[key], key, first_layer_schema_name, location_name)

