In [0]:
#Return Spark session
from pyspark.sql.session import SparkSession
def get_spark_session(app_name="Some Anonymous Data Engineering Project"):
    """Return the active Spark session, creating one if none is available.

    Parameters:
    app_name : Application name passed to the builder when a session has to
               be created (or fetched through ``getOrCreate``).

    Returns:
    The session reported by ``SparkSession.getActiveSession()`` when there is
    one, otherwise the result of ``SparkSession.builder...getOrCreate()``.
    """
    try:
        active = SparkSession.getActiveSession()
    except Exception:
        # Best effort: if the lookup itself fails we fall back to the builder.
        # Only Exception is caught so KeyboardInterrupt / SystemExit still
        # propagate instead of being silently swallowed.
        active = None

    if active is not None:
        return active

    return SparkSession.builder.appName(app_name).getOrCreate()



In [0]:
def read_file(spark,filetype,path,header=True,infer_schema=True,mline=True):
    """Load ``path`` into a DataFrame using the reader matching ``filetype``.

    Parameters:
    spark        : Spark session used for reading
    filetype     : One of "csv", "json", "delta", "orc", "parquet"
    path         : Location of the data to read
    header       : Forwarded to the CSV reader as ``header``
    infer_schema : Forwarded to the CSV reader as ``inferSchema``
    mline        : Accepted but not used by this function

    Returns:
    Whatever the selected reader returns.

    Raises:
    Exception : "File type not supported" for any other ``filetype``.

    NOTE(review): the json and delta branches call ``read_json_df`` and
    ``read_delta_df``; they are expected to be defined elsewhere - confirm.
    """
    # Loaders are lazy so nothing is read until the file type has matched.
    loaders = (
        ("csv", lambda: spark.read.csv(path, header=header, inferSchema=infer_schema)),
        ("json", lambda: read_json_df(spark, path)),
        ("delta", lambda: read_delta_df(spark, path)),
        ("orc", lambda: spark.read.orc(path)),
        ("parquet", lambda: spark.read.parquet(path)),
    )

    for kind, load in loaders:
        if filetype == kind:
            return load()

    raise Exception("File type not supported")

In [0]:
def write_file(df, path, format="delta", mode="overwrite"):
    """Save ``df`` to ``path`` through its DataFrame writer.

    Parameters:
    df     : DataFrame to write
    path   : Destination passed to ``save``
    format : Output format name given to the writer (default: "delta")
    mode   : Save mode given to the writer (default: "overwrite")

    Returns:
    The value returned by the writer's ``save`` call.
    """
    writer = df.write.mode(mode)
    writer = writer.format(format)
    return writer.save(path)

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window


def enrich_df(df):
    """Return ``df`` with an ``ingestion_time`` column holding ``current_timestamp()``."""
    ingestion_time = current_timestamp()
    return df.withColumn("ingestion_time", ingestion_time)

def f_cast_col(df,schema_map):
    """Cast columns of ``df`` according to ``schema_map``.

    Parameters:
    df         : Input DataFrame
    schema_map : Mapping of column name -> target data type; each listed
                 column is replaced by its cast version, in mapping order

    Returns:
    DataFrame with the listed columns cast; ``df`` itself when the mapping
    is empty.
    """
    result = df
    for column_name, target_type in schema_map.items():
        casted = col(column_name).cast(target_type)
        result = result.withColumn(column_name, casted)
    return result

# -----------------------------------------------------------------------------------------------------
def deduplicate_with_window(df, partition_cols, order_col=None, order_type="asc"):

    """
    Removes duplicates by keeping the first row of each window partition.

    Parameters:
    df              : Input DataFrame
    partition_cols  : Column name, or list of column names, identifying duplicates
    order_col       : Column to decide which row to keep. When omitted the
                      window is ordered by a constant, so which row of each
                      group survives is arbitrary.
    order_type      : 'asc' or 'desc', case-insensitive (default: asc).
                      Any value other than 'desc' sorts ascending.

    Returns:
    Deduplicated DataFrame with the same columns as the input
    """

    # A bare string would otherwise be unpacked character by character below.
    if isinstance(partition_cols, str):
        partition_cols = [partition_cols]

    # Build window specification
    if order_col:
        ordering = col(order_col)
        if isinstance(order_type, str) and order_type.lower() == "desc":
            ordering = ordering.desc()
    else:
        ordering = lit(1)
    window_spec = Window.partitionBy(*partition_cols).orderBy(ordering)

    # Choose a helper column name that is not already present in the input,
    # so an existing "rn" column is neither overwritten nor dropped.
    # Compared in lower case because Spark resolves column names
    # case-insensitively by default.
    taken = {name.lower() for name in df.columns}
    rn_col = "rn"
    while rn_col in taken:
        rn_col = "_" + rn_col

    # Apply row_number
    df_rn = df.withColumn(rn_col, row_number().over(window_spec))

    # Keep only first record
    df_dedup = df_rn.filter(col(rn_col) == 1).drop(rn_col)

    return df_dedup

# -----------------------------------------------------------------------------------------------------
def deduplicate(df, keys):
    """Drop duplicate rows of ``df`` via ``dropDuplicates``.

    Parameters:
    df   : Input DataFrame
    keys : List of column names forwarded to ``dropDuplicates``

    Returns:
    The DataFrame returned by ``dropDuplicates``.
    """
    unique_rows = df.dropDuplicates(keys)
    return unique_rows

# -----------------------------------------------------------------------------------------------------
def drop_required_nulls(df, columns):
    """Drop rows of ``df`` via ``dropna`` restricted to the given columns.

    Parameters:
    df      : Input DataFrame
    columns : List of column names forwarded to ``dropna`` as ``subset``

    Returns:
    The DataFrame returned by ``dropna``.
    """
    without_nulls = df.dropna(subset=columns)
    return without_nulls