# Main ETL for Loan Level Data

## Initial set up

In [1]:
import glob
import sys
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.sql.types import DateType, StringType, DoubleType, BooleanType, TimestampType
import csv
from functools import reduce
from lxml import objectify
import pandas as pd
import os

# Spark session shared by every job below; "local[*]" runs Spark on the
# local machine.
SPARK = SparkSession.builder.master("local[*]").getOrCreate()
# Root folder handed to each job as config["SOURCE_DIR"].
BRONZE_SOURCE_DIR = "../data/mini_source"

### ASSET BRONZE

In [None]:
def set_job_params():
    """
    Build the settings used by the ASSET BRONZE job.

    :return config: dictionary with the source folder ("SOURCE_DIR") and the
        file name label ("FILE_KEY") of the files to ingest.
    """
    return {
        "SOURCE_DIR": BRONZE_SOURCE_DIR,
        "FILE_KEY": "Loan_Data",
    }


def get_raw_files(source_dir, file_key):
    """
    Return the sorted list of CSV files that satisfy the file_key parameter.
    Works only on local machine so far.

    Files are looked up exactly one folder level below source_dir. Any path
    containing "Labeled0M" is discarded.

    :param source_dir: folder path where files are stored.
    :param file_key: label for file name that helps with the cherry picking.
    :return all_files: list of desired files from source_dir, sorted by path.
    :raises SystemExit: with exit code 1 when no file matches.
    """
    # glob gives no ordering guarantee; sorting keeps runs reproducible.
    all_files = sorted(
        f for f in glob.glob(f"{source_dir}/*/*{file_key}*.csv") if "Labeled0M" not in f
    )
    if not all_files:
        print(
            f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
        )
        sys.exit(1)
    return all_files


def create_dataframe(spark, all_files):
    """
    Read files and generate one PySpark DataFrame from them.

    Each file is parsed on the driver with the csv module: the first row
    gives the column names (the first one is forced to "AS1"), the second row
    is skipped and every following row is loaded as data. File names are
    expected to look like <ed_code>_<yyyy>_<mm>_<dd>_*.csv: the leading token
    is stored in "ed_code" and the date tokens provide the "year" and "month"
    columns. The bookkeeping columns "valid_from", "valid_to", "iscurrent"
    and "checksum" are appended.

    :param spark: SparkSession object.
    :param all_files: list of files to be read to generate the dataframe.
    :return df: PySpark dataframe for loan asset data.
    :raises SystemExit: with exit code 1 when no file yields any data row.
    """
    list_dfs = []
    for csv_f in all_files:
        # os.path.basename also copes with Windows path separators.
        name_parts = os.path.basename(csv_f).split("_")
        csv_id = name_parts[0]
        csv_date = "-".join(name_parts[1:4])
        # newline="" is what the csv module asks for, so that quoted fields
        # with embedded line breaks are parsed correctly.
        with open(csv_f, "r", newline="") as f:
            rows = list(csv.reader(f))
        # Row 0 is the header and row 1 is skipped. Without any data row
        # Spark cannot infer a schema, so such files are ignored.
        if len(rows) < 3:
            print(f"No data rows found in {csv_f}. Skip file.")
            continue
        col_names = rows[0]
        # NOTE(review): presumably the raw first header is unreliable, hence
        # the fixed name - confirm.
        col_names[0] = "AS1"
        content = rows[2:]
        df = (
            spark.createDataFrame(content, col_names)
            .withColumn("ed_code", F.lit(csv_id))
            .replace("", None)
            .withColumn("ImportDate", F.lit(csv_date))
            .withColumn("year", F.year(F.col("ImportDate")))
            .withColumn("month", F.month(F.col("ImportDate")))
            .withColumn("valid_from", F.current_timestamp())
            # Typed NULL: casting "" to a timestamp fails under ANSI mode.
            .withColumn("valid_to", F.lit(None).cast(TimestampType()))
            .withColumn("iscurrent", F.lit(1).cast("int"))
            # NOTE(review): F.concat yields NULL as soon as one input is NULL
            # and empty strings were turned into NULL above, so rows missing
            # any of AS1..AS7 get a NULL checksum - confirm this is intended.
            .withColumn(
                "checksum",
                F.md5(
                    F.concat(
                        F.col("AS1"),
                        F.col("AS2"),
                        F.col("AS3"),
                        F.col("AS4"),
                        F.col("AS5"),
                        F.col("AS6"),
                        F.col("AS7"),
                    )
                ),
            )
            .drop("ImportDate")
        )
        list_dfs.append(df)
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    # DataFrame.union matches columns by position, not by name.
    return reduce(DataFrame.union, list_dfs)



# Run the ASSETS BRONZE job: collect the loan data files, load them into one
# Spark dataframe and append it to the bronze parquet dataset.
print("Start ASSETS BRONZE job.")
run_props = set_job_params()
all_asset_files = get_raw_files(run_props["SOURCE_DIR"], run_props["FILE_KEY"])
print(f"Retrieved {len(all_asset_files)} asset data files.")
raw_asset_df = create_dataframe(SPARK, all_asset_files)
# Append mode: rows are added to whatever the dataset already contains. The
# output is partitioned by the "year" and "month" columns.
(
    raw_asset_df.write
    .partitionBy("year", "month")
    .mode("append")
    .parquet("../data/output/bronze/assets.parquet")
)


### COLLATERAL BRONZE

In [None]:
def set_job_params():
    """
    Build the settings used by the COLLATERAL BRONZE job.

    :return config: dictionary with the source folder ("SOURCE_DIR") and the
        file name label ("FILE_KEY") of the files to ingest.
    """
    config = {
        "SOURCE_DIR": BRONZE_SOURCE_DIR,
        "FILE_KEY": "Collateral",
    }
    # Column types kept for reference; not part of the returned config.
    # config["COLLATERAL_COLUMNS"] = {
    #     "CS1": StringType(),
    #     "CS2": StringType(),
    #     "CS3": StringType(),
    #     "CS4": DoubleType(),
    #     "CS5": DoubleType(),
    #     "CS6": StringType(),
    #     "CS7": BooleanType(),
    #     "CS8": BooleanType(),
    #     "CS9": BooleanType(),
    #     "CS10": DoubleType(),
    #     "CS11": DateType(),
    #     "CS12": DateType(),
    #     "CS13": StringType(),
    #     "CS14": StringType(),
    #     "CS15": DoubleType(),
    #     "CS16": StringType(),
    #     "CS17": StringType(),
    #     "CS18": DoubleType(),
    #     "CS19": DoubleType(),
    #     "CS20": StringType(),
    #     "CS21": DoubleType(),
    #     "CS22": DateType(),
    #     "CS23": StringType(),
    #     "CS24": StringType(),
    #     "CS25": StringType(),
    #     "CS26": StringType(),
    #     "CS27": StringType(),
    #     "CS28": DoubleType(),
    # }
    return config


def get_raw_files(source_dir, file_key):
    """
    Return the sorted list of CSV files that satisfy the file_key parameter.
    Works only on local machine so far.

    Files are looked up exactly one folder level below source_dir.

    :param source_dir: folder path where files are stored.
    :param file_key: label for file name that helps with the cherry picking.
    :return all_files: list of desired files from source_dir, sorted by path.
    :raises SystemExit: with exit code 1 when no file matches.
    """
    # glob gives no ordering guarantee; sorting keeps runs reproducible.
    all_files = sorted(glob.glob(f"{source_dir}/*/*{file_key}*.csv"))
    if not all_files:
        print(
            f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
        )
        sys.exit(1)
    return all_files


def create_dataframe(spark, all_files):
    """
    Read files and generate one PySpark DataFrame from them.

    Each file is parsed on the driver with the csv module: the first row
    gives the column names (the first one is forced to "CS1"), the second row
    is skipped and every following row is loaded as data. File names are
    expected to look like <ed_code>_<yyyy>_<mm>_<dd>_*.csv: the leading token
    is stored in "ed_code" and the date tokens provide the "year" and "month"
    columns. The bookkeeping columns "valid_from", "valid_to", "iscurrent"
    and "checksum" are appended.

    :param spark: SparkSession object.
    :param all_files: list of files to be read to generate the dataframe.
    :return df: PySpark dataframe for collateral data.
    :raises SystemExit: with exit code 1 when no file yields any data row.
    """
    list_dfs = []
    for csv_f in all_files:
        # os.path.basename also copes with Windows path separators.
        name_parts = os.path.basename(csv_f).split("_")
        csv_id = name_parts[0]
        csv_date = "-".join(name_parts[1:4])
        # newline="" is what the csv module asks for, so that quoted fields
        # with embedded line breaks are parsed correctly.
        with open(csv_f, "r", newline="") as f:
            rows = list(csv.reader(f))
        # Row 0 is the header and row 1 is skipped. Without any data row
        # Spark cannot infer a schema, so such files are ignored.
        if len(rows) < 3:
            print(f"No data rows found in {csv_f}. Skip file.")
            continue
        col_names = rows[0]
        # NOTE(review): presumably the raw first header is unreliable, hence
        # the fixed name - confirm.
        col_names[0] = "CS1"
        content = rows[2:]
        df = (
            spark.createDataFrame(content, col_names)
            .withColumn("ed_code", F.lit(csv_id))
            .replace("", None)
            .withColumn("ImportDate", F.lit(csv_date))
            .withColumn("year", F.year(F.col("ImportDate")))
            .withColumn("month", F.month(F.col("ImportDate")))
            .withColumn("valid_from", F.current_timestamp())
            # Typed NULL: casting "" to a timestamp fails under ANSI mode.
            .withColumn("valid_to", F.lit(None).cast(TimestampType()))
            .withColumn("iscurrent", F.lit(1).cast("int"))
            # NOTE(review): F.concat yields NULL as soon as one input is NULL
            # and empty strings were turned into NULL above, so rows missing
            # CS1 or CS2 get a NULL checksum - confirm this is intended.
            .withColumn("checksum", F.md5(F.concat(F.col("CS1"), F.col("CS2"))))
            .drop("ImportDate")
        )
        list_dfs.append(df)
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    # DataFrame.union matches columns by position, not by name.
    return reduce(DataFrame.union, list_dfs)


# def replace_no_data(df):
#     """
#     Replace ND values inside the dataframe
#     TODO: ND are associated with labels that explain why the value is missing.
#           Should handle this information better in future releases.
#     :param df: Spark dataframe with loan asset data.
#     :return df: Spark dataframe without ND values.
#     """
#     for col_name in df.columns:
#         df = df.withColumn(
#             col_name,
#             F.when(F.col(col_name).startswith("ND"), None).otherwise(F.col(col_name)),
#         )
#     return df


# def replace_bool_data(df):
#     """
#     Replace Y/N with boolean flags in the dataframe.

#     :param df: Spark dataframe with loan asset data.
#     :return df: Spark dataframe without Y/N values.
#     """
#     for col_name in df.columns:
#         df = df.withColumn(
#             col_name,
#             F.when(F.col(col_name) == "Y", "True")
#             .when(F.col(col_name) == "N", "False")
#             .otherwise(F.col(col_name)),
#         )
#     return df


# def cast_to_datatype(df, columns):
#     """
#     Cast data to the respective datatype.

#     :param df: Spark dataframe with loan asset data.
#     :param columns: collection of column names and respective data types.
#     :return df: Spark dataframe with correct values.
#     """
#     for col_name, data_type in columns.items():
#         if data_type == BooleanType():
#             df = (
#                 df.withColumn("tmp_col_name", F.col(col_name).contains("True"))
#                 .drop(col_name)
#                 .withColumnRenamed("tmp_col_name", col_name)
#             )
#         if data_type == DateType():
#             df = (
#                 df.withColumn("tmp_col_name", F.to_date(F.col(col_name)))
#                 .drop(col_name)
#                 .withColumnRenamed("tmp_col_name", col_name)
#             )
#         if data_type == DoubleType():
#             df = (
#                 df.withColumn(
#                     "tmp_col_name", F.round(F.col(col_name).cast(DoubleType()), 2)
#                 )
#                 .drop(col_name)
#                 .withColumnRenamed("tmp_col_name", col_name)
#             )
#     return df


# Run the COLLATERAL BRONZE job: collect the collateral files, load them into
# one Spark dataframe and append it to the bronze parquet dataset.
print("Start COLLATERAL BRONZE job.")
run_props = set_job_params()
all_collateral_files = get_raw_files(run_props["SOURCE_DIR"], run_props["FILE_KEY"])
print(f"Retrieved {len(all_collateral_files)} collateral data files.")
raw_collateral_df = create_dataframe(SPARK, all_collateral_files)
# Disabled cleaning/casting pipeline; the raw dataframe is written instead.
# print("Remove ND values.")
# tmp_df1 = replace_no_data(raw_collateral_df)
# print("Replace Y/N with boolean flags.")
# tmp_df2 = replace_bool_data(tmp_df1)
# print("Cast data to correct types.")
# final_df = cast_to_datatype(tmp_df2, run_props["COLLATERAL_COLUMNS"])
# (
#     final_df.write
#     .partitionBy("year", "month", "day")
#     .mode("append")
#     .parquet("../data/output/bronze/collaterals.parquet")
# )
# Append mode: rows are added to whatever the dataset already contains. The
# output is partitioned by the "year" and "month" columns.
(
    raw_collateral_df.write
    .partitionBy("year", "month")
    .mode("append")
    .parquet("../data/output/bronze/collaterals.parquet")
)

### BOND INFO BRONZE

In [None]:
def set_job_params():
    """
    Build the settings used by the BOND INFO BRONZE job.

    :return config: dictionary with the source folder ("SOURCE_DIR") and the
        file name label ("FILE_KEY") of the files to ingest.
    """
    config = dict(SOURCE_DIR=BRONZE_SOURCE_DIR, FILE_KEY="Bond_Info")
    # Column types kept for reference; not part of the returned config.
    # config["BOND_COLUMNS"] = {
    #     "BS1": DateType(),
    #     "BS2": StringType(),
    #     "BS3": DoubleType(),
    #     "BS4": DoubleType(),
    #     "BS5": BooleanType(),
    #     "BS6": StringType(),
    #     "BS11": DoubleType(),
    #     "BS12": BooleanType(),
    #     "BS13": DoubleType(),
    #     "BS19": StringType(),
    #     "BS20": StringType(),
    # }
    return config


def get_raw_files(source_dir, file_key):
    """
    Return the sorted list of CSV files that satisfy the file_key parameter.
    Works only on local machine so far.

    Files are looked up exactly one folder level below source_dir.

    :param source_dir: folder path where files are stored.
    :param file_key: label for file name that helps with the cherry picking.
    :return all_files: list of desired files from source_dir, sorted by path.
    :raises SystemExit: with exit code 1 when no file matches.
    """
    # glob gives no ordering guarantee; sorting keeps runs reproducible.
    all_files = sorted(glob.glob(f"{source_dir}/*/*{file_key}*.csv"))
    if not all_files:
        print(
            f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
        )
        sys.exit(1)
    return all_files


def create_dataframe(spark, all_files):
    """
    Read files and generate one PySpark DataFrame from them.

    Each file is parsed on the driver with the csv module: the first row
    gives the column names (the first one is forced to "BS1"), the second row
    is skipped and every following row is loaded as data. File names are
    expected to look like <ed_code>_<yyyy>_<mm>_<dd>_*.csv: the leading token
    is stored in "ed_code" and the date tokens provide the "year" and "month"
    columns. The bookkeeping columns "valid_from", "valid_to", "iscurrent"
    and "checksum" are appended.

    :param spark: SparkSession object.
    :param all_files: list of files to be read to generate the dataframe.
    :return df: PySpark dataframe for bond info data.
    :raises SystemExit: with exit code 1 when no file yields any data row.
    """
    list_dfs = []
    for csv_f in all_files:
        # os.path.basename also copes with Windows path separators.
        name_parts = os.path.basename(csv_f).split("_")
        csv_id = name_parts[0]
        csv_date = "-".join(name_parts[1:4])
        # newline="" is what the csv module asks for, so that quoted fields
        # with embedded line breaks are parsed correctly.
        with open(csv_f, "r", newline="") as f:
            rows = list(csv.reader(f))
        # Row 0 is the header and row 1 is skipped. Without any data row
        # Spark cannot infer a schema, so such files are ignored.
        if len(rows) < 3:
            print(f"No data rows found in {csv_f}. Skip file.")
            continue
        col_names = rows[0]
        # NOTE(review): presumably the raw first header is unreliable, hence
        # the fixed name - confirm.
        col_names[0] = "BS1"
        content = rows[2:]
        df = (
            spark.createDataFrame(content, col_names)
            .withColumn("ed_code", F.lit(csv_id))
            .replace("", None)
            .withColumn("ImportDate", F.lit(csv_date))
            .withColumn("year", F.year(F.col("ImportDate")))
            .withColumn("month", F.month(F.col("ImportDate")))
            .withColumn("valid_from", F.current_timestamp())
            # Typed NULL: casting "" to a timestamp fails under ANSI mode.
            .withColumn("valid_to", F.lit(None).cast(TimestampType()))
            .withColumn("iscurrent", F.lit(1).cast("int"))
            # NOTE(review): F.concat yields NULL as soon as one input is NULL
            # and empty strings were turned into NULL above, so rows missing
            # BS1 or BS2 get a NULL checksum - confirm this is intended.
            .withColumn("checksum", F.md5(F.concat(F.col("BS1"), F.col("BS2"))))
            .drop("ImportDate")
        )
        list_dfs.append(df)
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    # DataFrame.union matches columns by position, not by name.
    return reduce(DataFrame.union, list_dfs)


# def replace_no_data(df):
#     """
#     Replace ND values inside the dataframe
#     TODO: ND are associated with labels that explain why the value is missing.
#           Should handle this information better in future releases.
#     :param df: Spark dataframe with loan asset data.
#     :return df: Spark dataframe without ND values.
#     """
#     for col_name in df.columns:
#         df = df.withColumn(
#             col_name,
#             F.when(F.col(col_name).startswith("ND"), None).otherwise(F.col(col_name)),
#         )
#     return df


# def replace_bool_data(df):
#     """
#     Replace Y/N with boolean flags in the dataframe.

#     :param df: Spark dataframe with loan asset data.
#     :return df: Spark dataframe without Y/N values.
#     """
#     for col_name in df.columns:
#         df = df.withColumn(
#             col_name,
#             F.when(F.col(col_name) == "Y", "True")
#             .when(F.col(col_name) == "N", "False")
#             .otherwise(F.col(col_name)),
#         )
#     return df


# def cast_to_datatype(df, columns):
#     """
#     Cast data to the respective datatype.

#     :param df: Spark dataframe with loan asset data.
#     :param columns: collection of column names and respective data types.
#     :return df: Spark dataframe with correct values.
#     """
#     for col_name, data_type in columns.items():
#         if data_type == BooleanType():
#             df = (
#                 df.withColumn("tmp_col_name", F.col(col_name).contains("True"))
#                 .drop(col_name)
#                 .withColumnRenamed("tmp_col_name", col_name)
#             )
#         if data_type == DateType():
#             df = (
#                 df.withColumn("tmp_col_name", F.to_date(F.col(col_name)))
#                 .drop(col_name)
#                 .withColumnRenamed("tmp_col_name", col_name)
#             )
#         if data_type == DoubleType():
#             df = (
#                 df.withColumn(
#                     "tmp_col_name", F.round(F.col(col_name).cast(DoubleType()), 2)
#                 )
#                 .drop(col_name)
#                 .withColumnRenamed("tmp_col_name", col_name)
#             )
#     return df


# Run the BOND INFO BRONZE job: collect the bond info files, load them into
# one Spark dataframe and append it to the bronze parquet dataset.
print("Start BOND INFO BRONZE job.")
run_props = set_job_params()
all_bond_info_files = get_raw_files(run_props["SOURCE_DIR"], run_props["FILE_KEY"])
print(f"Retrieved {len(all_bond_info_files)} bond info data files.")
raw_bond_info_df = create_dataframe(SPARK, all_bond_info_files)
# Disabled cleaning/casting pipeline; the raw dataframe is written instead.
# print("Remove ND values.")
# tmp_df1 = replace_no_data(raw_bond_info_df)
# print("Replace Y/N with boolean flags.")
# tmp_df2 = replace_bool_data(tmp_df1)
# print("Cast data to correct types.")
# final_df = cast_to_datatype(tmp_df2, run_props["BOND_COLUMNS"])
# (
#     final_df.write
#     .partitionBy("year", "month", "day")
#     .mode("append")
#     .parquet("../data/output/bronze/bond_info.parquet")
# )
# Append mode: rows are added to whatever the dataset already contains. The
# output is partitioned by the "year" and "month" columns.
(
    raw_bond_info_df.write
    .partitionBy("year", "month")
    .mode("append")
    .parquet("../data/output/bronze/bond_info.parquet")
)

### AMORTISATION BRONZE

To be reviewed: loading even a single portfolio causes an OutOfMemory error.

In [2]:
def set_job_params():
    """
    Build the settings used by the AMORTISATION BRONZE job.

    :return config: dictionary with the source folder ("SOURCE_DIR") and the
        file name label ("FILE_KEY") of the files to ingest.
    """
    settings = (
        ("SOURCE_DIR", BRONZE_SOURCE_DIR),
        ("FILE_KEY", "Amortization"),
    )
    return dict(settings)


def get_raw_files(source_dir, file_key):
    """
    Return the sorted list of CSV files that satisfy the file_key parameter.
    Works only on local machine so far.

    Files are looked up exactly one folder level below source_dir.

    :param source_dir: folder path where files are stored.
    :param file_key: label for file name that helps with the cherry picking.
    :return all_files: list of desired files from source_dir, sorted by path.
    :raises SystemExit: with exit code 1 when no file matches.
    """
    # glob gives no ordering guarantee; sorting keeps runs reproducible, which
    # matters when a caller only takes a slice of the result.
    all_files = sorted(glob.glob(f"{source_dir}/*/*{file_key}*.csv"))
    if not all_files:
        print(
            f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
        )
        sys.exit(1)
    return all_files


def create_dataframe(spark, all_files):
    """
    Read files and generate one PySpark DataFrame from them.

    Each file is parsed on the driver with the csv module: the first row
    gives the column names (the first one is forced to "AS3"), the second row
    is skipped and every following row is loaded as data. File names are
    expected to look like <ed_code>_<yyyy>_<mm>_<dd>_*.csv: the leading token
    is stored in "ed_code" and the date tokens provide the "year" and "month"
    columns. The bookkeeping columns "valid_from", "valid_to", "iscurrent"
    and "checksum" are appended.

    :param spark: SparkSession object.
    :param all_files: list of files to be read to generate the dataframe.
    :return df: PySpark dataframe for amortisation data.
    :raises SystemExit: with exit code 1 when no file yields any data row.
    """
    list_dfs = []
    for csv_f in all_files:
        # os.path.basename also copes with Windows path separators.
        name_parts = os.path.basename(csv_f).split("_")
        csv_id = name_parts[0]
        csv_date = "-".join(name_parts[1:4])
        # newline="" is what the csv module asks for, so that quoted fields
        # with embedded line breaks are parsed correctly.
        with open(csv_f, "r", newline="") as f:
            rows = list(csv.reader(f))
        # Row 0 is the header and row 1 is skipped. Without any data row
        # Spark cannot infer a schema, so such files are ignored.
        if len(rows) < 3:
            print(f"No data rows found in {csv_f}. Skip file.")
            continue
        col_names = rows[0]
        # Visible error from EDW: the first column comes with different names
        # across files, so it is forced to a fixed one.
        col_names[0] = "AS3"
        content = rows[2:]
        df = (
            spark.createDataFrame(content, col_names)
            .withColumn("ed_code", F.lit(csv_id))
            # Empty strings are messy in the raw data; store them as NULL.
            .replace("", None)
            .withColumn("ImportDate", F.lit(csv_date))
            .withColumn("year", F.year(F.col("ImportDate")))
            .withColumn("month", F.month(F.col("ImportDate")))
            .withColumn("valid_from", F.current_timestamp())
            # Typed NULL: casting "" to a timestamp fails under ANSI mode.
            .withColumn("valid_to", F.lit(None).cast(TimestampType()))
            .withColumn("iscurrent", F.lit(1).cast("int"))
            .withColumn("checksum", F.md5(F.col("AS3")))
            .drop("ImportDate")
        )
        list_dfs.append(df)
    # Same guard as the other bronze jobs: reduce() raises TypeError on an
    # empty list.
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    # DataFrame.union matches columns by position, not by name.
    return reduce(DataFrame.union, list_dfs)



# Run the AMORTISATION BRONZE job: collect the amortisation files, load them
# into one Spark dataframe and append it to the bronze parquet dataset.
print("Start AMORTISATION BRONZE job.")
run_props = set_job_params()
all_amortisation_files = get_raw_files(
    run_props["SOURCE_DIR"], run_props["FILE_KEY"]
)
print(f"Retrieved {len(all_amortisation_files)} amortisation data files.")
# Only the first retrieved file is loaded ([:1]).
# NOTE(review): presumably a workaround for the OutOfMemory issue mentioned
# in this section's header - confirm before processing all files.
raw_amortisation_df = (
            create_dataframe(SPARK, all_amortisation_files[:1])
        )

# Append mode: rows are added to whatever the dataset already contains. The
# output is partitioned by the "year" and "month" columns.
(
    raw_amortisation_df.write
    .partitionBy("year", "month")
    .mode("append")
    .parquet("../data/output/bronze/amortisation.parquet")
)


Start AMORTISATION BRONZE job.
Retrieved 19 amortisation data files.


### DEAL DETAILS BRONZE
To be reviewed: it is still unclear how this data should be stored.

In [None]:
def set_job_params():
    """
    Build the settings used by the DEAL DETAILS BRONZE job.

    :return config: dictionary with the source folder ("SOURCE_DIR") and the
        file name label ("FILE_KEY") of the files to ingest.
    """
    config = {"SOURCE_DIR": BRONZE_SOURCE_DIR}
    config.update(FILE_KEY="Deal_Details")
    return config


def get_raw_files(source_dir, file_key):
    """
    Return the sorted list of XML files that satisfy the file_key parameter.
    Works only on local machine so far.

    Files are looked up exactly one folder level below source_dir.

    :param source_dir: folder path where files are stored.
    :param file_key: label for file name that helps with the cherry picking.
    :return all_files: list of desired files from source_dir, sorted by path.
    :raises SystemExit: with exit code 1 when no file matches.
    """
    # glob gives no ordering guarantee; sorting keeps runs reproducible.
    all_files = sorted(glob.glob(f"{source_dir}/*/*{file_key}*.xml"))
    if not all_files:
        print(
            f"No files with key {file_key.upper()} found in {source_dir}. Exit process!"
        )
        sys.exit(1)
    return all_files


def create_dataframe(deal_detail_files):
    """
    Read Deal Details XML files and generate one pandas DataFrame from them.

    For every file the node found at second child -> first child -> second
    child -> first child of the root is located. Each of its children becomes
    one column, named after the tag without the EDServices namespace prefix.
    "ISIN" children are joined into a single ";"-separated string, and the
    tags "Country", "DealVisibleToOrg", "DealVisibleToUser" and "Submissions"
    are left out. The bookkeeping columns "valid_from", "valid_to",
    "iscurrent" and "checksum" (hash of "EDCode") are appended.

    :param deal_detail_files: files to be read to generate the dataframe.
    :return df: pandas dataframe with one row per XML file.
    :raises SystemExit: with exit code 1 when deal_detail_files is empty.
    """
    namespace = "{http://edwin.eurodw.eu/EDServices/2.3}"
    # usually null values
    # TODO: Submissions might hold interesting data; check with Luca.
    skipped_tags = {"Country", "DealVisibleToOrg", "DealVisibleToUser", "Submissions"}

    list_dfs = []
    for xml_f in deal_detail_files:
        root = objectify.parse(xml_f).getroot()
        # Resolve the node once instead of walking the path per child.
        # getchildren() is kept on purpose: iterating or indexing an
        # lxml.objectify element addresses same-tag siblings, not children.
        deal_node = (
            root.getchildren()[1]
            .getchildren()[0]
            .getchildren()[1]
            .getchildren()[0]
        )

        data = []
        cols = []
        for child in deal_node.getchildren():
            tag = child.tag.replace(namespace, "")
            if tag in skipped_tags:
                continue
            if tag == "ISIN":
                # ISIN is an array: flatten it into one string.
                data.append(";".join(map(str, child.getchildren())))
            else:
                data.append(child.text)
            cols.append(tag)

        df = pd.DataFrame(data).T  # one row holding all collected values
        df.columns = cols
        df["valid_from"] = pd.Timestamp.now()
        df["valid_to"] = None
        df["iscurrent"] = 1
        df["checksum"] = pd.util.hash_pandas_object(df["EDCode"])
        list_dfs.append(df)

    # Same guard as the other bronze jobs: pd.concat raises ValueError on an
    # empty list.
    if not list_dfs:
        print("No dataframes were extracted from files. Exit process!")
        sys.exit(1)
    return pd.concat(list_dfs, ignore_index=True)


# Run the DEAL DETAILS BRONZE job: collect the XML files, load them into one
# pandas dataframe and append it to a CSV file.
print("Start DEAL DETAILS BRONZE job.")
run_props = set_job_params()
all_xml_files = get_raw_files(run_props["SOURCE_DIR"], run_props["FILE_KEY"])
raw_deal_details_df = create_dataframe(all_xml_files)

# Disabled parquet output; the CSV export below is used instead.
# (
#     SPARK.createDataFrame(raw_deal_details_df).write
#     .mode("append")
#     .parquet("../data/output/bronze/deal_details.parquet")
# )
# mode='a' appends to the CSV. The header row is written only when the file
# does not exist yet, so later runs do not repeat it.
(
    raw_deal_details_df
    .to_csv(
        "../data/output/bronze/deal_details.csv", 
        mode='a', 
        header=not os.path.exists("../data/output/bronze/deal_details.csv"
        )
    )
)