# Main ETL for Loan Level Data

## Initial setup

In [None]:
import glob
import logging
import sys
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.sql.types import DateType, StringType, DoubleType, BooleanType
import csv
from functools import reduce
import os
from lxml import objectify
import pandas as pd

# Spark session shared by every job below; master "local[*]" runs Spark locally.
SPARK = SparkSession.builder.master("local[*]").getOrCreate()
# Folder scanned by the bronze jobs for their raw input files.
BRONZE_SOURCE_DIR = "../data/SMES_IT_ES_FR"
# Source folder handed to the silver jobs through their job parameters.
SILVER_SOURCE_DIR = "../data/output"

### ASSET BRONZE

In [None]:
def set_job_params():
    """
    Build the configuration of the asset bronze job.

    The column numbers are grouped by their target Spark type and expanded
    into an ``AS<number>`` -> type mapping in ascending numeric order.

    :return config: dictionary with the source folder, the file name key and
        the data type of every asset column.
    """
    date_ids = [
        1, 19, 20, 31, 50, 51, 67, 70, 71, 87, 91, 112, 124, 127, 130, 133, 134, 137,
    ]
    bool_ids = [23, 29, 53, 121, 122, 131]
    double_ids = [
        27, 28, 30, *range(37, 42), 44, 54, 55, 56, 60, 61, 63, 64, 66, 69,
        80, 81, 82, 85, 86, 88, 90, 93, *range(100, 111), *range(115, 121),
        125, 126, 128, 132, 135, 136, 138,
    ]
    string_ids = [
        *range(2, 9), *range(15, 19), 21, 22, 24, 25, 26, *range(32, 37),
        42, 43, 45, 52, 57, 58, 59, 62, 65, 68, 83, 84, 89, 92, 94, 111, 123, 129,
    ]

    # Map every column number to the constructor of its type.
    type_by_id = {}
    for ids, type_factory in (
        (date_ids, DateType),
        (bool_ids, BooleanType),
        (double_ids, DoubleType),
        (string_ids, StringType),
    ):
        for column_id in ids:
            type_by_id[column_id] = type_factory

    return {
        "SOURCE_DIR": BRONZE_SOURCE_DIR,
        "FILE_KEY": "Loan_Data",
        "ASSET_COLUMNS": {
            f"AS{column_id}": type_by_id[column_id]()
            for column_id in sorted(type_by_id)
        },
    }


def get_raw_files(source_dir, file_key):
    """
    Collect the CSV files whose name contains file_key.

    Looks one folder level below source_dir and leaves out every path that
    contains "Labeled0M". Works only on local machine so far.

    :param source_dir: folder path where files are stored.
    :param file_key: label in the file name used to pick the files.
    :return all_files: list of matching files from source_dir. The process
        exits with status 1 when nothing matches.
    """
    candidates = glob.glob(f"{source_dir}/*/*{file_key}*.csv")
    all_files = list(filter(lambda path: "Labeled0M" not in path, candidates))
    if all_files:
        return all_files
    print(
        f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
    )
    sys.exit(1)


def create_dataframe(spark, all_files):
    """
    Read files and generate one PySpark DataFrame from them.

    The first line of every file provides the column names, the second line
    is skipped and all remaining lines become rows. The name of the folder
    holding the file is stored in an additional "ID" column. Files without
    data rows are skipped. The per-file dataframes are combined with a
    positional union.

    :param spark: SparkSession object.
    :param all_files: list of files to be read to generate the dataframe.
    :return df: PySpark dataframe for loan asset data. The process exits with
        status 1 when no file produced a dataframe.
    """
    list_dfs = []
    for csv_f in all_files:
        # Parent folder name, resolved with os.path so it does not depend on
        # "/" being the path separator.
        portfolio_id = os.path.basename(os.path.dirname(csv_f))
        # The csv module asks for newline="" so that line breaks inside
        # quoted fields are parsed correctly.
        with open(csv_f, "r", newline="") as f:
            rows = list(csv.reader(f))
        if len(rows) < 3:
            # Spark cannot infer a schema from an empty dataset.
            print(f"No data rows found in {csv_f}. Skip file.")
            continue
        col_names, content = rows[0], rows[2:]
        list_dfs.append(
            spark.createDataFrame(content, col_names).withColumn(
                "ID", F.lit(portfolio_id)
            )
        )
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    return reduce(DataFrame.union, list_dfs)


def replace_no_data(df):
    """
    Replace ND values inside the dataframe with nulls.
    TODO: ND are associated with labels that explain why the value is missing.
          Should handle this information better in future releases.

    :param df: Spark dataframe with loan asset data.
    :return df: Spark dataframe where every value starting with "ND" is null;
        column names and order are unchanged.
    """
    # A single select over all columns keeps the query plan flat; one
    # withColumn per column nests a new projection for every column.
    cleaned_columns = [
        F.when(F.col(col_name).startswith("ND"), None)
        .otherwise(F.col(col_name))
        .alias(col_name)
        for col_name in df.columns
    ]
    return df.select(*cleaned_columns)


def replace_bool_data(df):
    """
    Replace Y/N with the strings "True"/"False" in every column.

    The values stay text here; cast_to_datatype turns them into real booleans.

    :param df: Spark dataframe with loan asset data.
    :return df: Spark dataframe without Y/N values; column names and order
        are unchanged.
    """
    # A single select over all columns keeps the query plan flat; one
    # withColumn per column nests a new projection for every column.
    flagged_columns = [
        F.when(F.col(col_name) == "Y", "True")
        .when(F.col(col_name) == "N", "False")
        .otherwise(F.col(col_name))
        .alias(col_name)
        for col_name in df.columns
    ]
    return df.select(*flagged_columns)


def cast_to_datatype(df, columns):
    """
    Convert the text columns to their configured data types.

    Boolean columns become the result of a "contains 'True'" check, date
    columns go through to_date and double columns are cast and rounded to two
    decimals. Every converted column is re-added under its own name, which
    moves it to the end of the dataframe. Finally year, month and day are
    derived from AS1.

    :param df: Spark dataframe with loan asset data.
    :param columns: collection of column names and respective data types.
    :return df: Spark dataframe with typed values and year/month/day columns.
    """
    for col_name, data_type in columns.items():
        raw = F.col(col_name)
        if data_type == BooleanType():
            typed = raw.contains("True")
        elif data_type == DateType():
            typed = F.to_date(raw)
        elif data_type == DoubleType():
            typed = F.round(raw.cast(DoubleType()), 2)
        else:
            # Other types (e.g. strings) are left untouched.
            continue
        df = df.withColumn("tmp_col_name", typed)
        df = df.drop(col_name)
        df = df.withColumnRenamed("tmp_col_name", col_name)

    partition_source = F.col("AS1")
    for part_name, extract in (
        ("year", F.year),
        ("month", F.month),
        ("day", F.dayofmonth),
    ):
        df = df.withColumn(part_name, extract(partition_source))
    return df


# ASSET BRONZE job: load the raw loan files, clean and type them, then append
# the result to the bronze assets parquet dataset.
print("Start ASSETS BRONZE job.")
run_props = set_job_params()
all_asset_files = get_raw_files(run_props["SOURCE_DIR"], run_props["FILE_KEY"])
print(f"Retrieved {len(all_asset_files)} asset data files.")
raw_asset_df = create_dataframe(SPARK, all_asset_files)
print("Remove ND values.")
tmp_df1 = replace_no_data(raw_asset_df)
print("Replace Y/N with boolean flags.")
tmp_df2 = replace_bool_data(tmp_df1)
print("Cast data to correct types.")
final_df = cast_to_datatype(tmp_df2, run_props["ASSET_COLUMNS"])
# A DataFrame has no format(); the writer API is reached through .write.
# NOTE(review): the target starts with "../dataoutput" while SILVER_SOURCE_DIR
# is "../data/output" - confirm which location is intended.
(
    final_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("append")
    .save("../dataoutput/bronze/assets.parquet")
)

### COLLATERAL BRONZE

In [None]:
def set_job_params():
    """
    Build the configuration of the collateral bronze job.

    :return config: dictionary with the source folder, the file name key and
        the data type of every collateral column.
    """
    text, number, flag, date = StringType, DoubleType, BooleanType, DateType
    column_spec = [
        ("CS1", text), ("CS2", text), ("CS3", text), ("CS4", number),
        ("CS5", number), ("CS6", text), ("CS7", flag), ("CS8", flag),
        ("CS9", flag), ("CS10", number), ("CS11", date), ("CS12", date),
        ("CS13", text), ("CS14", text), ("CS15", number), ("CS16", text),
        ("CS17", text), ("CS18", number), ("CS19", number), ("CS20", text),
        ("CS21", number), ("CS22", date), ("CS23", text), ("CS24", text),
        ("CS25", text), ("CS26", text), ("CS27", text), ("CS28", number),
    ]
    return {
        "SOURCE_DIR": BRONZE_SOURCE_DIR,
        "FILE_KEY": "Collateral",
        "COLLATERAL_COLUMNS": {name: make_type() for name, make_type in column_spec},
    }


def get_raw_files(source_dir, file_key):
    """
    Collect the CSV files whose name contains file_key.

    Looks one folder level below source_dir. Works only on local machine so far.

    :param source_dir: folder path where files are stored.
    :param file_key: label in the file name used to pick the files.
    :return all_files: list of matching files from source_dir. The process
        exits with status 1 when nothing matches.
    """
    pattern = f"{source_dir}/*/*{file_key}*.csv"
    all_files = list(glob.glob(pattern))
    if not all_files:
        print(
            f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
        )
        sys.exit(1)
    return all_files


def create_dataframe(spark, all_files):
    """
    Read files and generate one PySpark DataFrame from them.

    The first line of every file provides the column names, the second line
    is skipped and all remaining lines become rows. The name of the folder
    holding the file is stored in an additional "ID" column. Files without
    data rows are skipped. The per-file dataframes are combined with a
    positional union.

    :param spark: SparkSession object.
    :param all_files: list of files to be read to generate the dataframe.
    :return df: PySpark dataframe for collateral data. The process exits with
        status 1 when no file produced a dataframe.
    """
    list_dfs = []
    for csv_f in all_files:
        # Parent folder name, resolved with os.path so it does not depend on
        # "/" being the path separator.
        portfolio_id = os.path.basename(os.path.dirname(csv_f))
        # The csv module asks for newline="" so that line breaks inside
        # quoted fields are parsed correctly.
        with open(csv_f, "r", newline="") as f:
            rows = list(csv.reader(f))
        if len(rows) < 3:
            # Spark cannot infer a schema from an empty dataset.
            print(f"No data rows found in {csv_f}. Skip file.")
            continue
        col_names, content = rows[0], rows[2:]
        list_dfs.append(
            spark.createDataFrame(content, col_names).withColumn(
                "ID", F.lit(portfolio_id)
            )
        )
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    return reduce(DataFrame.union, list_dfs)


def replace_no_data(df):
    """
    Replace ND values inside the dataframe with nulls.
    TODO: ND are associated with labels that explain why the value is missing.
          Should handle this information better in future releases.

    :param df: Spark dataframe with collateral data.
    :return df: Spark dataframe where every value starting with "ND" is null;
        column names and order are unchanged.
    """
    # Build all replacement expressions first and apply them in one select,
    # so the plan does not grow by one projection per column.
    nd_free = []
    for col_name in df.columns:
        value = F.col(col_name)
        nd_free.append(
            F.when(value.startswith("ND"), None).otherwise(value).alias(col_name)
        )
    return df.select(*nd_free)


def replace_bool_data(df):
    """
    Replace Y/N with the strings "True"/"False" in every column.

    The values stay text here; cast_to_datatype turns them into real booleans.

    :param df: Spark dataframe with collateral data.
    :return df: Spark dataframe without Y/N values; column names and order
        are unchanged.
    """
    # Build all replacement expressions first and apply them in one select,
    # so the plan does not grow by one projection per column.
    yn_free = []
    for col_name in df.columns:
        value = F.col(col_name)
        yn_free.append(
            F.when(value == "Y", "True")
            .when(value == "N", "False")
            .otherwise(value)
            .alias(col_name)
        )
    return df.select(*yn_free)


def cast_to_datatype(df, columns):
    """
    Convert the text columns to their configured data types.

    Boolean columns become the result of a "contains 'True'" check, date
    columns go through to_date and double columns are cast and rounded to two
    decimals. Every converted column is re-added under its own name, which
    moves it to the end of the dataframe. Finally year, month and day are
    derived from AS1, and AS1 itself is dropped.

    :param df: Spark dataframe with collateral data and an AS1 column.
    :param columns: collection of column names and respective data types.
    :return df: Spark dataframe with typed values and year/month/day columns.
    """

    def _typed_expression(name, data_type):
        """Return the conversion expression for a column, or None to keep it."""
        if data_type == BooleanType():
            return F.col(name).contains("True")
        if data_type == DateType():
            return F.to_date(F.col(name))
        if data_type == DoubleType():
            return F.round(F.col(name).cast(DoubleType()), 2)
        return None

    for col_name, data_type in columns.items():
        expression = _typed_expression(col_name, data_type)
        if expression is None:
            continue
        df = (
            df.withColumn("tmp_col_name", expression)
            .drop(col_name)
            .withColumnRenamed("tmp_col_name", col_name)
        )

    df = df.withColumn("year", F.year(F.col("AS1")))
    df = df.withColumn("month", F.month(F.col("AS1")))
    df = df.withColumn("day", F.dayofmonth(F.col("AS1")))
    return df.drop("AS1")


# COLLATERAL BRONZE job: load the raw collateral files, attach the AS1 date of
# the matching bronze asset rows, clean and type the data, then append the
# result to the bronze collaterals parquet dataset.
print("Start COLLATERAL BRONZE job.")
run_props = set_job_params()
all_collateral_files = get_raw_files(run_props["SOURCE_DIR"], run_props["FILE_KEY"])
print(f"Retrieved {len(all_collateral_files)} collateral data files.")
tmp_raw_collateral_df = create_dataframe(SPARK, all_collateral_files)
try:
    # Read from the location the ASSET BRONZE job saves to; the raw source
    # folder does not receive the bronze output.
    assets_bronze_df = (
        SPARK.read.parquet("../dataoutput/bronze/assets.parquet")
        .select("AS1", "AS3")
        .withColumnRenamed("AS3", "CS2")
    )
except Exception as e:
    print("No bronze asset dataframe found. Exit process!")
    # Show the underlying error so a wrong path or schema can be diagnosed.
    print(f"Reason: {e}")
    sys.exit(1)
# Only the read is guarded above, so a join failure is not reported as a
# missing bronze dataset.
raw_collateral_df = tmp_raw_collateral_df.join(
    assets_bronze_df, on="CS2", how="inner"
)
print("Remove ND values.")
tmp_df1 = replace_no_data(raw_collateral_df)
print("Replace Y/N with boolean flags.")
tmp_df2 = replace_bool_data(tmp_df1)
print("Cast data to correct types.")
final_df = cast_to_datatype(tmp_df2, run_props["COLLATERAL_COLUMNS"])
# A DataFrame has no format(); the writer API is reached through .write.
(
    final_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("append")
    .save("../dataoutput/bronze/collaterals.parquet")
)

### BOND INFO BRONZE

In [None]:
def set_job_params():
    """
    Build the configuration of the bond info bronze job.

    :return config: dictionary with the source folder, the file name key and
        the data type of every bond info column.
    """
    bond_columns = dict(
        BS1=DateType(),
        BS2=StringType(),
        BS3=DoubleType(),
        BS4=DoubleType(),
        BS5=BooleanType(),
        BS6=StringType(),
        BS11=DoubleType(),
        BS12=BooleanType(),
        BS13=DoubleType(),
        BS19=StringType(),
        BS20=StringType(),
    )
    return {
        "SOURCE_DIR": BRONZE_SOURCE_DIR,
        "FILE_KEY": "Bond_Info",
        "BOND_COLUMNS": bond_columns,
    }


def get_raw_files(source_dir, file_key):
    """
    Collect the CSV files whose name contains file_key.

    Looks one folder level below source_dir. Works only on local machine so far.

    :param source_dir: folder path where files are stored.
    :param file_key: label in the file name used to pick the files.
    :return all_files: list of matching files from source_dir. The process
        exits with status 1 when nothing matches.
    """
    all_files = []
    all_files.extend(glob.glob(f"{source_dir}/*/*{file_key}*.csv"))
    if all_files:
        return all_files
    print(
        f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
    )
    sys.exit(1)


def create_dataframe(spark, all_files):
    """
    Read files and generate one PySpark DataFrame from them.

    The first line of every file provides the column names, the second line
    is skipped and all remaining lines become rows. The name of the folder
    holding the file is stored in an additional "ID" column. Files without
    data rows are skipped. The per-file dataframes are combined with a
    positional union.

    :param spark: SparkSession object.
    :param all_files: list of files to be read to generate the dataframe.
    :return df: PySpark dataframe for bond info data. The process exits with
        status 1 when no file produced a dataframe.
    """
    list_dfs = []
    for csv_f in all_files:
        # Parent folder name, resolved with os.path so it does not depend on
        # "/" being the path separator.
        portfolio_id = os.path.basename(os.path.dirname(csv_f))
        # The csv module asks for newline="" so that line breaks inside
        # quoted fields are parsed correctly.
        with open(csv_f, "r", newline="") as f:
            rows = list(csv.reader(f))
        if len(rows) < 3:
            # Spark cannot infer a schema from an empty dataset.
            print(f"No data rows found in {csv_f}. Skip file.")
            continue
        col_names, content = rows[0], rows[2:]
        list_dfs.append(
            spark.createDataFrame(content, col_names).withColumn(
                "ID", F.lit(portfolio_id)
            )
        )
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    return reduce(DataFrame.union, list_dfs)


def replace_no_data(df):
    """
    Replace ND values inside the dataframe with nulls.
    TODO: ND are associated with labels that explain why the value is missing.
          Should handle this information better in future releases.

    :param df: Spark dataframe with bond info data.
    :return df: Spark dataframe where every value starting with "ND" is null;
        column names and order are unchanged.
    """

    def _without_nd(col_name):
        """Expression that nulls the values of one column starting with ND."""
        value = F.col(col_name)
        return F.when(value.startswith("ND"), None).otherwise(value).alias(col_name)

    # One projection for all columns instead of a withColumn call per column,
    # which would nest one projection per column in the plan.
    return df.select(*[_without_nd(col_name) for col_name in df.columns])


def replace_bool_data(df):
    """
    Replace Y/N with the strings "True"/"False" in every column.

    The values stay text here; cast_to_datatype turns them into real booleans.

    :param df: Spark dataframe with bond info data.
    :return df: Spark dataframe without Y/N values; column names and order
        are unchanged.
    """

    def _without_yn(col_name):
        """Expression that maps Y/N of one column to "True"/"False"."""
        value = F.col(col_name)
        return (
            F.when(value == "Y", "True")
            .when(value == "N", "False")
            .otherwise(value)
            .alias(col_name)
        )

    # One projection for all columns instead of a withColumn call per column,
    # which would nest one projection per column in the plan.
    return df.select(*[_without_yn(col_name) for col_name in df.columns])


def cast_to_datatype(df, columns):
    """
    Convert the text columns to their configured data types.

    Boolean columns become the result of a "contains 'True'" check, date
    columns go through to_date and double columns are cast and rounded to two
    decimals. Every converted column is re-added under its own name, which
    moves it to the end of the dataframe. Finally year, month and day are
    derived from BS1.

    :param df: Spark dataframe with bond info data.
    :param columns: collection of column names and respective data types.
    :return df: Spark dataframe with typed values and year/month/day columns.
    """
    boolean_type, date_type, double_type = BooleanType(), DateType(), DoubleType()
    for col_name, data_type in columns.items():
        source = F.col(col_name)
        if data_type == boolean_type:
            converted = source.contains("True")
        elif data_type == date_type:
            converted = F.to_date(source)
        elif data_type == double_type:
            converted = F.round(source.cast(DoubleType()), 2)
        else:
            # Nothing to convert for the remaining types.
            continue
        with_tmp = df.withColumn("tmp_col_name", converted)
        df = with_tmp.drop(col_name).withColumnRenamed("tmp_col_name", col_name)

    reference = F.col("BS1")
    return (
        df.withColumn("year", F.year(reference))
        .withColumn("month", F.month(reference))
        .withColumn("day", F.dayofmonth(reference))
    )


# BOND INFO BRONZE job: load the raw bond info files, clean and type them,
# then append the result to the bronze bond info parquet dataset.
print("Start BOND INFO BRONZE job.")
run_props = set_job_params()
all_bond_info_files = get_raw_files(run_props["SOURCE_DIR"], run_props["FILE_KEY"])
print(f"Retrieved {len(all_bond_info_files)} bond info data files.")
raw_bond_info_df = create_dataframe(SPARK, all_bond_info_files)
print("Remove ND values.")
tmp_df1 = replace_no_data(raw_bond_info_df)
print("Replace Y/N with boolean flags.")
tmp_df2 = replace_bool_data(tmp_df1)
print("Cast data to correct types.")
final_df = cast_to_datatype(tmp_df2, run_props["BOND_COLUMNS"])
# A DataFrame has no format(); the writer API is reached through .write.
(
    final_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("append")
    .save("../dataoutput/bronze/bond_info.parquet")
)

### AMORTISATION BRONZE

In [None]:
def set_job_params():
    """
    Setup parameters used for the amortisation bronze job.

    AS150 to AS1349 alternate between a double column (even number) and a
    date column (odd number); AS3 is kept as a string.

    :return config: dictionary with properties used in this job.
    """
    config = {}
    config["SOURCE_DIR"] = BRONZE_SOURCE_DIR
    config["FILE_KEY"] = "Amortization"
    amortisation_columns = {
        "AS3": StringType(),
        "AS150": DoubleType(),
        "AS151": DateType(),
        "AS1348": DoubleType(),
        "AS1349": DateType(),
    }
    # Fill the columns between the explicit entries. They belong in the
    # amortisation mapping; there is no "BOND_COLUMNS" entry in this config.
    for i in range(152, 1348):
        amortisation_columns[f"AS{i}"] = DoubleType() if i % 2 == 0 else DateType()
    config["AMORTISATION_COLUMNS"] = amortisation_columns
    return config


def get_raw_files(source_dir, file_key):
    """
    Collect the CSV files whose name contains file_key.

    Looks one folder level below source_dir. Works only on local machine so far.

    :param source_dir: folder path where files are stored.
    :param file_key: label in the file name used to pick the files.
    :return all_files: list of matching files from source_dir. The process
        exits with status 1 when nothing matches.
    """
    all_files = [*glob.glob(f"{source_dir}/*/*{file_key}*.csv")]
    nothing_found = not all_files
    if nothing_found:
        print(
            f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
        )
        sys.exit(1)
    return all_files


def create_dataframe(spark, all_files):
    """
    Read the amortisation files and combine them into one PySpark DataFrame.

    The first line of every file provides the column names, the second line
    is skipped and all remaining lines become rows with their empty cells
    removed. The name of the folder holding the file is stored in an "ID"
    column.

    :param spark: SparkSession object.
    :param all_files: list of files to be read to generate the dataframe.
    :return df: PySpark dataframe with the rows of all files (positional union).
    """
    list_dfs = []
    for csv_f in all_files:
        with open(csv_f, "r") as f:
            portfolio_id = csv_f.split("/")[-2]
            reader = csv.reader(f)
            col_names = next(reader, [])
            next(reader, None)  # the second line is not used
            # NOTE(review): removing empty cells shifts the remaining values
            # to the left, so an empty cell in the middle of a row would
            # misalign values and column names - confirm only trailing cells
            # are ever empty.
            content = [[cell for cell in line if cell != ""] for line in reader]
            frame = spark.createDataFrame(content, col_names)
            list_dfs.append(frame.withColumn("ID", F.lit(portfolio_id)))
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    return reduce(DataFrame.union, list_dfs)


def _melt(df, id_vars, value_vars, var_name="FEATURE_NAME", value_name="FEATURE_VALUE"):
    """
    Convert DataFrame from wide to long format.

    Every column in value_vars becomes one output row per input row, holding
    the column name (var_name) and its value (value_name), both cast to string.

    :param df: Spark dataframe in wide format.
    :param id_vars: list of columns carried over unchanged.
    :param value_vars: columns turned into name/value rows.
    :param var_name: name of the output column holding the column names.
    :param value_name: name of the output column holding the values.
    :return: Spark dataframe with id_vars, var_name and value_name columns.
    """
    # Ref:https://stackoverflow.com/a/41673644
    # One struct<name, value> per melted column, collected into an array.
    name_value_pairs = [
        F.struct(F.lit(column).alias(var_name), F.col(column).alias(value_name))
        for column in value_vars
    ]
    # Exploding the array yields one row per struct.
    exploded = df.withColumn("_vars_and_vals", F.explode(F.array(*name_value_pairs)))
    pair = F.col("_vars_and_vals")
    melted_columns = [
        pair[var_name].cast("string").alias(var_name),
        pair[value_name].cast("string").alias(value_name),
    ]
    return exploded.select(*(id_vars + melted_columns))


def unpivot_dataframe(df, columns):
    """
    Convert dataframe from wide to long table.

    The date columns and the double columns present in df are melted
    separately (keyed by AS3) and the two long tables are inner-joined on AS3.

    :param df: raw Spark dataframe.
    :param columns: data columns with respective datatype.
    :return new_df: unpivot Spark dataframe with AS3, DATE_COLUMNS, DATE_VALUE,
        DOUBLE_COLUMNS and DOUBLE_VALUE.
    """

    def _present_columns_of(wanted_type):
        """Names of the configured columns of one type that exist in df."""
        return [
            name
            for name, data_type in columns.items()
            if data_type == wanted_type and name in df.columns
        ]

    date_df = _melt(
        df,
        id_vars=["AS3"],
        value_vars=_present_columns_of(DateType()),
        var_name="DATE_COLUMNS",
        value_name="DATE_VALUE",
    )
    double_df = _melt(
        df,
        id_vars=["AS3"],
        value_vars=_present_columns_of(DoubleType()),
        var_name="DOUBLE_COLUMNS",
        value_name="DOUBLE_VALUE",
    )
    # NOTE(review): joining on AS3 alone combines every date row with every
    # double row of the same AS3 - confirm this pairing is intended.
    return date_df.join(double_df, on="AS3", how="inner")


# AMORTISATION BRONZE job: load the raw amortisation files, unpivot them to a
# long table, type the values and append the result to the bronze
# amortisation parquet dataset.
print("Start AMORTISATION BRONZE job.")
run_props = set_job_params()
all_amortisation_files = get_raw_files(
    run_props["SOURCE_DIR"], run_props["FILE_KEY"]
)
print(f"Retrieved {len(all_amortisation_files)} amortisation data files.")
raw_amortisation_df = create_dataframe(SPARK, all_amortisation_files)
print("Unpivot amortisation dataframe.")
unpivot_df = unpivot_dataframe(
    raw_amortisation_df, run_props["AMORTISATION_COLUMNS"]
)
print("Cast data to correct types.")
final_df = (
    unpivot_df.withColumn("tmp_col_name", F.to_date(F.col("DATE_VALUE")))
    .drop("DATE_VALUE")
    .withColumnRenamed("tmp_col_name", "DATE_VALUE")
    .withColumn(
        "tmp_col_name", F.round(F.col("DOUBLE_VALUE").cast(DoubleType()), 2)
    )
    .drop("DOUBLE_VALUE")
    .withColumnRenamed("tmp_col_name", "DOUBLE_VALUE")
    # Partition columns are taken from the typed DATE_VALUE.
    .withColumn("year", F.year(F.col("DATE_VALUE")))
    .withColumn("month", F.month(F.col("DATE_VALUE")))
    .withColumn("day", F.dayofmonth(F.col("DATE_VALUE")))
)
# A DataFrame has no format(); the writer API is reached through .write.
(
    final_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("append")
    .save("../dataoutput/bronze/amortisation.parquet")
)

### DEAL DETAILS BRONZE

In [None]:
def set_job_params():
    """
    Build the configuration of the deal details bronze job.

    :return config: dictionary with the source folder and the name of the
        folder that holds the deal details files.
    """
    return {
        "SOURCE_DIR": BRONZE_SOURCE_DIR,
        "FILE_KEY": "Deal_Details",
    }


def get_raw_files(source_dir, file_key):
    """
    Return the first XML file found in the file_key folders.

    Searches ``<source_dir>/*/<file_key>/*.xml``. The matches are sorted so
    the same file is picked on every run. Works only on local machine so far.

    :param source_dir: folder path where files are stored.
    :param file_key: name of the sub-folder that holds the XML files.
    :return: path of the first matching file in sorted order. The process
        exits with status 1 when nothing matches.
    """
    # glob returns matches in arbitrary order; sort before taking the first.
    all_files = sorted(glob.glob(f"{source_dir}/*/{file_key}/*.xml"))
    if not all_files:
        print(
            f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
        )
        sys.exit(1)
    return all_files[0]


def create_dataframe(deal_detail_file):
    """
    Read the deal details XML file into a pandas DataFrame.

    The fields are taken from a fixed position in the document (second child
    of the root, then first, second and first child). The text of every field
    becomes a value and its tag, without the EDServices namespace, the column
    name.

    :param deal_detail_file: file to be read to generate the dataframe.
    :return df: pandas DataFrame with the field values transposed into
        columns named after the field tags.
    """
    namespace = "{http://edwin.eurodw.eu/EDServices/2.3}"
    root = objectify.parse(deal_detail_file).getroot()

    # Walk down to the record once instead of on every loop iteration.
    # NOTE: keep getchildren(); indexing an lxml.objectify element (root[1])
    # selects same-tag siblings, not children.
    record = (
        root.getchildren()[1]
        .getchildren()[0]
        .getchildren()[1]
        .getchildren()[0]
    )
    fields = record.getchildren()
    data = [field.text for field in fields]
    cols = [field.tag.replace(namespace, "") for field in fields]

    df = pd.DataFrame(data).T  # Values as columns instead of rows
    df.columns = cols
    return df


# DEAL DETAILS BRONZE job: parse the deal details XML file and append its
# content to the bronze deal details parquet dataset.
print("Start DEAL DETAILS BRONZE job.")
run_props = set_job_params()
xml_file = get_raw_files(run_props["SOURCE_DIR"], run_props["FILE_KEY"])
final_df = create_dataframe(xml_file)
# create_dataframe returns a pandas DataFrame, which has no Spark writer API,
# so it is converted first. All fields are XML text; declaring every column
# as string also avoids schema inference on fields without text.
string_schema = ", ".join(f"`{col_name}` string" for col_name in final_df.columns)
(
    SPARK.createDataFrame(final_df, schema=string_schema)
    .write.format("parquet")
    .mode("append")
    .save("../dataoutput/bronze/deal_details/info.parquet")
)

### ASSET SILVER

In [None]:
def set_job_params():
    """
    Build the configuration of the asset silver job.

    :return config: dictionary with the source folder and the names of the
        asset date columns.
    """
    date_column_ids = (
        1, 19, 20, 31, 50, 51, 67, 70, 71, 87, 91, 112, 124, 127, 130, 133, 134, 137,
    )
    return {
        "SOURCE_DIR": SILVER_SOURCE_DIR,
        "DATE_COLUMNS": [f"AS{column_id}" for column_id in date_column_ids],
    }


def get_columns_collection(df):
    """
    Group the asset columns of the dataframe by topic.

    Every topic covers a range of AS column numbers; only columns that exist
    in df are listed. The "general" topic always starts with ID, year, month
    and day.

    :param df: Asset bronze Spark dataframe.
    :return cols_dict: collection of columns labelled by topic.
    """

    def _present(first, stop):
        """AS columns numbered first..stop-1 that exist in df."""
        return [f"AS{i}" for i in range(first, stop) if f"AS{i}" in df.columns]

    topic_ranges = (
        ("obligor_info", 15, 50),
        ("loan_info", 50, 80),
        ("interest_rate", 80, 100),
        ("financial_info", 100, 115),
        ("performance_info", 115, 146),
    )
    cols_dict = {"general": ["ID", "year", "month", "day"] + _present(1, 15)}
    for topic, first, stop in topic_ranges:
        cols_dict[topic] = _present(first, stop)
    return cols_dict


def process_dates(df, date_cols_list):
    """
    Extract dates dimension from bronze Spark dataframe.

    All values of the date columns present in df are collected into a single
    "date_col" column, de-duplicated and extended with calendar attributes.

    :param df: Spark bronze dataframe.
    :param date_cols_list: list of date columns.
    :return new_df: silver type Spark dataframe with date_col, unix_date,
        year, month, quarter, WoY and day.
    """
    available = [name for name in date_cols_list if name in df.columns]
    new_df = df.select(F.explode(F.array(available)).alias("date_col"))
    new_df = new_df.dropDuplicates()

    date_col = F.col("date_col")
    derived_attributes = (
        ("unix_date", F.unix_timestamp),
        ("year", F.year),
        ("month", F.month),
        ("quarter", F.quarter),
        ("WoY", F.weekofyear),
        ("day", F.dayofmonth),
    )
    for attribute, derive in derived_attributes:
        new_df = new_df.withColumn(attribute, derive(date_col))
    return new_df


def process_obligor_info(df, cols_dict):
    """
    Extract obligor info dimension from bronze Spark dataframe.

    Selects the general and obligor info columns and replaces the date
    columns AS1, AS19, AS20 and AS31 with their unix timestamps. Each
    converted column is re-added under its own name, which moves it to the
    end of the dataframe.

    :param df: Spark bronze dataframe.
    :param cols_dict: collection of columns labelled by their topic.
    :return new_df: silver type Spark dataframe.
    """
    new_df = df.select(cols_dict["general"] + cols_dict["obligor_info"])
    for date_col in ("AS1", "AS19", "AS20", "AS31"):
        # cols_dict only lists columns that exist, so a date column may be
        # absent from the selection; skip it instead of failing.
        if date_col not in new_df.columns:
            continue
        new_df = (
            new_df.withColumn(f"tmp_{date_col}", F.unix_timestamp(F.col(date_col)))
            .drop(date_col)
            .withColumnRenamed(f"tmp_{date_col}", date_col)
        )
    return new_df


def process_loan_info(df, cols_dict):
    """
    Extract loan info dimension from bronze Spark dataframe.

    Selects the general and loan info columns and replaces the date columns
    AS1, AS50, AS51, AS67, AS70 and AS71 with their unix timestamps. Each
    converted column is re-added under its own name, which moves it to the
    end of the dataframe.

    :param df: Spark bronze dataframe.
    :param cols_dict: collection of columns labelled by their topic.
    :return new_df: silver type Spark dataframe.
    """
    new_df = df.select(cols_dict["general"] + cols_dict["loan_info"])
    for date_col in ("AS1", "AS50", "AS51", "AS67", "AS70", "AS71"):
        # cols_dict only lists columns that exist, so a date column may be
        # absent from the selection; skip it instead of failing.
        if date_col not in new_df.columns:
            continue
        new_df = (
            new_df.withColumn(f"tmp_{date_col}", F.unix_timestamp(F.col(date_col)))
            .drop(date_col)
            .withColumnRenamed(f"tmp_{date_col}", date_col)
        )
    return new_df


def process_interest_rate(df, cols_dict):
    """
    Build the interest rate silver table from the bronze Spark dataframe.

    Keeps the "general" and "interest_rate" column groups, then swaps AS1,
    AS87 and AS91 for the result of ``F.unix_timestamp`` applied to them.

    :param df: Spark bronze dataframe.
    :param cols_dict: collection of columns labelled by their topic.
    :return new_df: silver type Spark dataframe.
    """
    wanted_cols = cols_dict["general"] + cols_dict["interest_rate"]
    new_df = df.select(wanted_cols)
    # Same three steps per column: temporary column, drop source, rename back.
    for col_name in ("AS1", "AS87", "AS91"):
        tmp_name = f"tmp_{col_name}"
        new_df = (
            new_df.withColumn(tmp_name, F.unix_timestamp(F.col(col_name)))
            .drop(col_name)
            .withColumnRenamed(tmp_name, col_name)
        )
    return new_df


def process_financial_info(df, cols_dict):
    """
    Build the financial info silver table from the bronze Spark dataframe.

    Keeps the "general" and "financial_info" column groups, then swaps AS1
    and AS112 for the result of ``F.unix_timestamp`` applied to them.

    :param df: Spark bronze dataframe.
    :param cols_dict: collection of columns labelled by their topic.
    :return new_df: silver type Spark dataframe.
    """
    wanted_cols = cols_dict["general"] + cols_dict["financial_info"]
    new_df = df.select(wanted_cols)
    # Same three steps per column: temporary column, drop source, rename back.
    for col_name in ("AS1", "AS112"):
        tmp_name = f"tmp_{col_name}"
        new_df = (
            new_df.withColumn(tmp_name, F.unix_timestamp(F.col(col_name)))
            .drop(col_name)
            .withColumnRenamed(tmp_name, col_name)
        )
    return new_df


def process_performance_info(df, cols_dict):
    """
    Build the performance info silver table from the bronze Spark dataframe.

    Keeps the "general" and "performance_info" column groups, then swaps each
    of AS1, AS124, AS127, AS130, AS133, AS134 and AS137 for the result of
    ``F.unix_timestamp`` applied to that column.

    :param df: Spark bronze dataframe.
    :param cols_dict: collection of columns labelled by their topic.
    :return new_df: silver type Spark dataframe.
    """
    converted_cols = (
        "AS1",
        "AS124",
        "AS127",
        "AS130",
        "AS133",
        "AS134",
        "AS137",
    )
    wanted_cols = cols_dict["general"] + cols_dict["performance_info"]
    new_df = df.select(wanted_cols)
    # Same three steps per column: temporary column, drop source, rename back.
    for col_name in converted_cols:
        tmp_name = f"tmp_{col_name}"
        new_df = (
            new_df.withColumn(tmp_name, F.unix_timestamp(F.col(col_name)))
            .drop(col_name)
            .withColumnRenamed(tmp_name, col_name)
        )
    return new_df


# Read the bronze assets table, derive the silver tables from it and write
# each of them as parquet under silver/assets.
run_props = set_job_params()
bronze_df = SPARK.read.parquet(
    f'{run_props["SOURCE_DIR"]}/bronze/assets.parquet'
)
assets_columns = get_columns_collection(bronze_df)
print("Generate time dataframe")
date_df = process_dates(bronze_df, run_props["DATE_COLUMNS"])
print("Generate obligor info dataframe")
obligor_info_df = process_obligor_info(bronze_df, assets_columns)
print("Generate loan info dataframe")
loan_info_df = process_loan_info(bronze_df, assets_columns)
print("Generate interest rate dataframe")
interest_rate_df = process_interest_rate(bronze_df, assets_columns)
print("Generate financial info dataframe")
financial_info_df = process_financial_info(bronze_df, assets_columns)
print("Generate performace info dataframe")
performance_info_df = process_performance_info(bronze_df, assets_columns)

print("Write dataframe")

# format()/partitionBy()/mode()/save() belong to the DataFrameWriter returned
# by DataFrame.write; a DataFrame has no format() method of its own.
# NOTE(review): outputs go to the literal "../dataoutput/..." rather than
# under run_props["SOURCE_DIR"] -- confirm this location is intended.
(
    date_df.write.format("parquet")
    .mode("overwrite")
    .save("../dataoutput/silver/assets/date_table.parquet")
)
(
    loan_info_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/assets/loan_info_table.parquet")
)
(
    obligor_info_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/assets/obligor_info_table.parquet")
)
(
    financial_info_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/assets/financial_info_table.parquet")
)
(
    interest_rate_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/assets/interest_rate_table.parquet")
)
(
    performance_info_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/assets/performance_info_table.parquet")
)

### AMORTISATION SILVER

In [None]:
def set_job_params():
    """
    Build the settings dictionary used by this job.

    The only entry is "SOURCE_DIR", taken from the module-level
    SILVER_SOURCE_DIR constant.

    :return config: dictionary with properties used in this job.
    """
    return {"SOURCE_DIR": SILVER_SOURCE_DIR}


def process_dates(df):
    """
    Extract dates dimension from bronze Spark dataframe.

    Takes the distinct values of the DATE_VALUE column, exposes them as
    "date_col" and adds the derived columns "unix_date", "year", "month",
    "quarter", "WoY" and "day".

    :param df: Spark bronze dataframe.
    :return new_df: silver type Spark dataframe.
    """
    new_df = (
        # Alias the column, not the dataframe: DataFrame.alias() only names
        # the dataframe, which leaves no "date_col" column for the
        # expressions below to resolve.
        df.select(F.col("DATE_VALUE").alias("date_col"))
        .dropDuplicates()
        .withColumn("unix_date", F.unix_timestamp(F.col("date_col")))
        .withColumn("year", F.year(F.col("date_col")))
        .withColumn("month", F.month(F.col("date_col")))
        .withColumn("quarter", F.quarter(F.col("date_col")))
        .withColumn("WoY", F.weekofyear(F.col("date_col")))
        .withColumn("day", F.dayofmonth(F.col("date_col")))
    )
    return new_df


def process_info(df):
    """
    Extract amortisation values dimension from bronze Spark dataframe.

    DATE_VALUE is replaced by the result of ``F.unix_timestamp`` applied to
    it; all other columns are passed through unchanged.

    :param df: Spark bronze dataframe.
    :return new_df: silver type Spark dataframe.
    """
    new_df = (
        df.withColumn("tmp_DATE_VALUE", F.unix_timestamp(F.col("DATE_VALUE")))
        .drop("DATE_VALUE")
        # Hand the original name back to the converted column, as the other
        # silver transformations in this file do, so that no "tmp_" column
        # ends up in the written table.
        .withColumnRenamed("tmp_DATE_VALUE", "DATE_VALUE")
    )
    return new_df


# Read the bronze amortisation table, derive the date and info silver tables
# and write both as parquet under silver/amortisation.
print("Start AMORTISATION SILVER job.")
run_props = set_job_params()
bronze_df = SPARK.read.parquet(
    f'{run_props["SOURCE_DIR"]}/bronze/amortisation.parquet'
)
print("Generate time dataframe")
# This job's process_dates() takes only the dataframe, and its config holds
# no "DATE_COLUMNS" entry, so nothing else is passed.
date_df = process_dates(bronze_df)
print("Generate info dataframe")
info_df = process_info(bronze_df)
print("Write dataframe")

# format()/partitionBy()/mode()/save() belong to the DataFrameWriter returned
# by DataFrame.write; a DataFrame has no format() method of its own.
# NOTE(review): outputs go to the literal "../dataoutput/..." rather than
# under run_props["SOURCE_DIR"] -- confirm this location is intended.
(
    date_df.write.format("parquet")
    .mode("overwrite")
    .save("../dataoutput/silver/amortisation/date_table.parquet")
)
(
    info_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/amortisation/info_table.parquet")
)


### BOND INFO SILVER

In [None]:
def set_job_params():
    """
    Build the settings dictionary used by this job.

    "SOURCE_DIR" comes from the module-level SILVER_SOURCE_DIR constant and
    "DATE_COLUMNS" lists the single column BS1.

    :return config: dictionary with properties used in this job.
    """
    return {
        "SOURCE_DIR": SILVER_SOURCE_DIR,
        "DATE_COLUMNS": ["BS1"],
    }


def get_columns_collection(df):
    """
    Group the dataframe's BS-numbered columns by topic.

    Every topic starts with the shared columns ID, year, month and day,
    followed by the BS columns of its number range that exist in ``df``:
    BS1-BS10 for "bond_info", BS11-BS18 for "collateral_info" and
    BS19-BS24 for "contact_info".

    :param df: bronze Spark dataframe (only its ``columns`` are read).
    :return cols_dict: collection of columns labelled by topic.
    """
    shared_cols = ["ID", "year", "month", "day"]
    topic_ranges = (
        ("bond_info", range(1, 11)),
        ("collateral_info", range(11, 19)),
        ("contact_info", range(19, 25)),
    )
    cols_dict = {}
    for topic, numbers in topic_ranges:
        present = [f"BS{i}" for i in numbers if f"BS{i}" in df.columns]
        cols_dict[topic] = shared_cols + present
    return cols_dict


def process_dates(df, col_types_dict):
    """
    Extract dates dimension from bronze Spark dataframe.

    The distinct values of the selected date columns are exposed as
    "date_col", with the derived columns "unix_date", "year", "month",
    "quarter", "WoY" and "day" added.

    :param df: Spark bronze dataframe.
    :param col_types_dict: either a plain list of date column names or a
        dictionary holding that list under the "date" key.
    :return new_df: silver type Spark dataframe.
    """
    # A plain list cannot be indexed with "date", so only dictionaries are
    # unwrapped; anything else is taken as the list of column names itself.
    if isinstance(col_types_dict, dict):
        candidates = col_types_dict["date"]
    else:
        candidates = col_types_dict
    date_cols = [c for c in candidates if c in df.columns]

    new_df = (
        df.select(F.explode(F.array(date_cols)).alias("date_col"))
        .dropDuplicates()
        .withColumn("unix_date", F.unix_timestamp(F.col("date_col")))
        .withColumn("year", F.year(F.col("date_col")))
        .withColumn("month", F.month(F.col("date_col")))
        .withColumn("quarter", F.quarter(F.col("date_col")))
        .withColumn("WoY", F.weekofyear(F.col("date_col")))
        .withColumn("day", F.dayofmonth(F.col("date_col")))
    )
    return new_df


def process_bond_info(df, cols_dict):
    """
    Build the bond info silver table from the bronze Spark dataframe.

    Keeps the "bond_info" column group and swaps BS1 for the Unix timestamp
    of BS1 parsed with the "yyyy-MM-dd" pattern.

    :param df: Spark bronze dataframe.
    :param cols_dict: collection of columns labelled by their topic.
    :return new_df: silver type Spark dataframe.
    """
    selected_df = df.select(cols_dict["bond_info"])
    bs1_unix = F.unix_timestamp(F.to_timestamp(F.col("BS1"), "yyyy-MM-dd"))
    # Compute under a temporary name, drop the source, then rename back.
    converted_df = selected_df.withColumn("tmp_BS1", bs1_unix)
    new_df = converted_df.drop("BS1").withColumnRenamed("tmp_BS1", "BS1")
    return new_df


def process_collateral_info(df, cols_dict):
    """
    Build the collateral info silver table from the bronze Spark dataframe.

    Keeps BS1, BS2 and the "collateral_info" column group, and swaps BS1 for
    the Unix timestamp of BS1 parsed with the "yyyy-MM-dd" pattern.

    :param df: Spark bronze dataframe.
    :param cols_dict: collection of columns labelled by their topic.
    :return new_df: silver type Spark dataframe.
    """
    wanted_cols = ["BS1", "BS2"] + cols_dict["collateral_info"]
    selected_df = df.select(wanted_cols)
    bs1_unix = F.unix_timestamp(F.to_timestamp(F.col("BS1"), "yyyy-MM-dd"))
    # Compute under a temporary name, drop the source, then rename back.
    converted_df = selected_df.withColumn("tmp_BS1", bs1_unix)
    new_df = converted_df.drop("BS1").withColumnRenamed("tmp_BS1", "BS1")
    return new_df


def process_contact_info(df, cols_dict):
    """
    Build the contact info silver table from the bronze Spark dataframe.

    Keeps BS1, BS2 and the "contact_info" column group, and swaps BS1 for
    the Unix timestamp of BS1 parsed with the "yyyy-MM-dd" pattern.

    :param df: Spark bronze dataframe.
    :param cols_dict: collection of columns labelled by their topic.
    :return new_df: silver type Spark dataframe.
    """
    wanted_cols = ["BS1", "BS2"] + cols_dict["contact_info"]
    selected_df = df.select(wanted_cols)
    bs1_unix = F.unix_timestamp(F.to_timestamp(F.col("BS1"), "yyyy-MM-dd"))
    # Compute under a temporary name, drop the source, then rename back.
    converted_df = selected_df.withColumn("tmp_BS1", bs1_unix)
    new_df = converted_df.drop("BS1").withColumnRenamed("tmp_BS1", "BS1")
    return new_df


# Read the bronze bond info table, derive the date, bond info, collateral
# and contact silver tables and write them as parquet under silver/bond_info.
print("Start BOND INFO SILVER job.")
run_props = set_job_params()
bronze_df = SPARK.read.parquet(
    f'{run_props["SOURCE_DIR"]}/bronze/bond_info.parquet'
)
bond_info_columns = get_columns_collection(bronze_df)
print("Generate time dataframe")
date_df = process_dates(bronze_df, run_props["DATE_COLUMNS"])
print("Generate bond info dataframe")
info_df = process_bond_info(bronze_df, bond_info_columns)
print("Generate collateral info dataframe")
collateral_df = process_collateral_info(bronze_df, bond_info_columns)
print("Generate contact info dataframe")
contact_df = process_contact_info(bronze_df, bond_info_columns)

print("Write dataframe")

# format()/partitionBy()/mode()/save() belong to the DataFrameWriter returned
# by DataFrame.write; a DataFrame has no format() method of its own.
# NOTE(review): outputs go to the literal "../dataoutput/..." rather than
# under run_props["SOURCE_DIR"] -- confirm this location is intended.
(
    date_df.write.format("parquet")
    .mode("overwrite")
    .save("../dataoutput/silver/bond_info/date_table.parquet")
)
(
    info_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/bond_info/info_table.parquet")
)
(
    collateral_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/bond_info/collaterals_table.parquet")
)
(
    contact_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/bond_info/contacts_table.parquet")
)


### COLLATERAL SILVER

In [None]:
def set_job_params():
    """
    Build the settings dictionary used by this job.

    "SOURCE_DIR" comes from the module-level SILVER_SOURCE_DIR constant and
    "DATE_COLUMNS" lists the columns CS11, CS12 and CS22.

    :return config: dictionary with properties used in this job.
    """
    return {
        "SOURCE_DIR": SILVER_SOURCE_DIR,
        "DATE_COLUMNS": ["CS11", "CS12", "CS22"],
    }


def process_dates(df, date_cols_list):
    """
    Build the dates dimension from the bronze Spark dataframe.

    The values of the listed columns that exist in ``df`` are gathered into
    a single de-duplicated "date_col" column, to which "unix_date", "year",
    "month", "quarter", "WoY" and "day" are added.

    :param df: Spark bronze dataframe.
    :param date_cols_list: list of date columns.
    :return new_df: silver type Spark dataframe.
    """
    available = [name for name in date_cols_list if name in df.columns]

    # One row per value across all available columns, then unique values only.
    exploded = F.explode(F.array(available)).alias("date_col")
    new_df = df.select(exploded).dropDuplicates()

    derived_columns = (
        ("unix_date", F.unix_timestamp),
        ("year", F.year),
        ("month", F.month),
        ("quarter", F.quarter),
        ("WoY", F.weekofyear),
        ("day", F.dayofmonth),
    )
    for out_name, spark_fn in derived_columns:
        new_df = new_df.withColumn(out_name, spark_fn(F.col("date_col")))
    return new_df


def process_collateral_info(df):
    """
    Build the collateral info silver table from the bronze Spark dataframe.

    All columns are kept; CS11, CS12 and CS22 are each swapped for the Unix
    timestamp of the column parsed with the "yyyy-MM" pattern.

    :param df: Spark bronze dataframe.
    :return new_df: silver type Spark dataframe.
    """
    new_df = df
    # For every converted column: compute the value under a temporary name,
    # drop the source column, then hand the original name to the new column.
    for col_name in ("CS11", "CS12", "CS22"):
        tmp_name = f"tmp_{col_name}"
        parsed = F.to_timestamp(F.col(col_name), "yyyy-MM")
        new_df = (
            new_df.withColumn(tmp_name, F.unix_timestamp(parsed))
            .drop(col_name)
            .withColumnRenamed(tmp_name, col_name)
        )
    return new_df


# Read the bronze collaterals table, derive the info and date silver tables
# and write both as parquet under silver/collaterals.
print("Start COLLATERAL SILVER job.")
run_props = set_job_params()
bronze_df = SPARK.read.parquet(
    f'{run_props["SOURCE_DIR"]}/bronze/collaterals.parquet'
)
print("Generate collateral info dataframe")
info_df = process_collateral_info(bronze_df)
print("Generate time dataframe")
date_df = process_dates(bronze_df, run_props["DATE_COLUMNS"])

print("Write dataframe")

# format()/partitionBy()/mode()/save() belong to the DataFrameWriter returned
# by DataFrame.write; a DataFrame has no format() method of its own.
# NOTE(review): outputs go to the literal "../dataoutput/..." rather than
# under run_props["SOURCE_DIR"] -- confirm this location is intended.
(
    info_df.write.format("parquet")
    .partitionBy("year", "month", "day")
    .mode("overwrite")
    .save("../dataoutput/silver/collaterals/info_table.parquet")
)
(
    date_df.write.format("parquet")
    .mode("overwrite")
    .save("../dataoutput/silver/collaterals/date_table.parquet")
)