In [2]:
# !conda install -y -c conda-forge pyspark
# !conda install -y -c conda-forge black
# !conda install -y -c conda-forge pip
# !pip install delta-spark==2.1.1
# !sudo wget --user jovyan --password jovyan https://repo1.maven.org/maven2/io/delta/delta-core_2.12/2.1.0/delta-core_2.12-2.1.0.jar -P $SPARK_HOME/jars/


In [20]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, DataFrame
from pyspark.conf import SparkConf
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from functools import reduce
import json
import glob
import tempfile
from delta import *

builder = (
    SparkSession.builder.master("local[*]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.0")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# spark = SparkSession.builder \
#     .master("local") \
#     .getOrCreate()

In [4]:
all_files = all_files = glob.glob("../data/AssignmentData/*.csv")
all_files

['../data/AssignmentData/SMESES000065103320166_2016_07_29_Loan_Data_8396a20b-86e1-4cac-8d51-dbecb46130c6.csv',
 '../data/AssignmentData/SMESES000065103320166_2016_10_31_Loan_Data_5370b5fe-b398-4b40-ace5-2691fc3e1504.csv',
 '../data/AssignmentData/SMESES000065103320166_2017_01_31_Loan_Data_6997f5f3-c2e5-41a4-a038-a5497066fe0e.csv',
 '../data/AssignmentData/SMESES000065103320166_2017_04_30_Loan_Data_52e3e2b7-f8b6-49fd-b482-72a0445cfa9a.csv',
 '../data/AssignmentData/SMESES000065103320166_2017_07_31_Loan_Data_d9629927-17f7-48c0-b1f6-ffbd213308d6.csv',
 '../data/AssignmentData/SMESES000065103320166_2017_10_31_Loan_Data_fb648083-daad-4968-beaa-dfc851ec2b4a.csv',
 '../data/AssignmentData/SMESES000065103320166_2018_01_31_Loan_Data_464b3e03-7988-4c86-a729-a0d31f3e7622.csv',
 '../data/AssignmentData/SMESES000065103320166_2018_04_30_Loan_Data_3a6e2077-7e61-4b77-9dd6-6b4ea1794d05.csv',
 '../data/AssignmentData/SMESES000065103320166_2018_07_31_Loan_Data_af106c4d-a055-49a3-8f62-fede0acb7ce9.csv',
 

In [5]:
asset_columns = {'AS1':DateType(),
'AS2':StringType(),
'AS3':StringType(),
'AS4':StringType(),
'AS5':StringType(),
'AS6':StringType(),
'AS7':StringType(),
'AS8':StringType(),
'AS15':StringType(),
'AS16':StringType(),
'AS17':StringType(),
'AS18':StringType(),
'AS19':DateType(),
'AS20':DateType(),
'AS21':StringType(),
'AS22':StringType(),
'AS23':BooleanType(),
'AS24':StringType(),
'AS25':StringType(),
'AS26':StringType(),
'AS27':DoubleType(),
'AS28':DoubleType(),
'AS29':BooleanType(),
'AS30':DoubleType(),
'AS31':DateType(),
'AS32':StringType(),
'AS33':StringType(),
'AS34':StringType(),
'AS35':StringType(),
'AS36':StringType(),
'AS37':DoubleType(),
'AS38':DoubleType(),
'AS39':DoubleType(),
'AS40':DoubleType(),
'AS41':DoubleType(),
'AS42':StringType(),
'AS43':StringType(),
'AS44':DoubleType(),
'AS45':StringType(),
'AS50':DateType(),
'AS51':DateType(),
'AS52':StringType(),
'AS53':BooleanType(),
'AS54':DoubleType(),
'AS55':DoubleType(),
'AS56':DoubleType(),
'AS57':StringType(),
'AS58':StringType(),
'AS59':StringType(),
'AS60':DoubleType(),
'AS61':DoubleType(),
'AS62':StringType(),
'AS63':DoubleType(),
'AS64':DoubleType(),
'AS65':StringType(),
'AS66':DoubleType(),
'AS67':DateType(),
'AS68':StringType(),
'AS69':DoubleType(),
'AS70':DateType(),
'AS71':DateType(),
'AS80':DoubleType(),
'AS81':DoubleType(),
'AS82':DoubleType(),
'AS83':StringType(),
'AS84':StringType(),
'AS85':DoubleType(),
'AS86':DoubleType(),
'AS87':DateType(),
'AS88':DoubleType(),
'AS89':StringType(),
'AS90':DoubleType(),
'AS91':DateType(),
'AS92':StringType(),
'AS93':DoubleType(),
'AS94':StringType(),
'AS100':DoubleType(),
'AS101':DoubleType(),
'AS102':DoubleType(),
'AS103':DoubleType(),
'AS104':DoubleType(),
'AS105':DoubleType(),
'AS106':DoubleType(),
'AS107':DoubleType(),
'AS108':DoubleType(),
'AS109':DoubleType(),
'AS110':DoubleType(),
'AS111':StringType(),
'AS112':DateType(),
'AS115':DoubleType(),
'AS116':DoubleType(),
'AS117':DoubleType(),
'AS118':DoubleType(),
'AS119':DoubleType(),
'AS120':DoubleType(),
'AS121':BooleanType(),
'AS122':BooleanType(),
'AS123':StringType(),
'AS124':DateType(),
'AS125':DoubleType(),
'AS126':DoubleType(),
'AS127':DateType(),
'AS128':DoubleType(),
'AS129':StringType(),
'AS130':DateType(),
'AS131':BooleanType(),
'AS132':DoubleType(),
'AS133':DateType(),
'AS134':DateType(),
'AS135':DoubleType(),
'AS136':DoubleType(),
'AS137':DateType(),
'AS138':DoubleType()}

In [6]:
# Single file will not have all the columns specified in the DTS. Create an ad-hoc schema on the fly.
def get_file_schema(df,master_cols):
    schema = []
    for col_name in df.columns:
        if col_name.endswith("_reason"):
            schema.append(StructField(col_name, StringType(), True))
        else:
            root_col_name = col_name.split("_")[0]
            col_type = master_cols.get(root_col_name, False)
            if col_type:
                schema.append(StructField(col_name, col_type, True))
    return StructType(schema)

In [7]:
# # Find List columns and change numerical value to literal
# mapper_dict={
#     "Obligor Legal Form / Business Type":{1:"Public Company",2:"Limited Company",3:"Partnership",4:"Individul",5:"Other"},
#     "Customer Segment":{1:"Medium",2:"Small",3:"Micro",4:"Other"},
#     "Borrower Basel III Segment":{1:"Corporate",2:"SME treated as Corporate",3:"Retail",4:"Other"},
#     "Asset Type":{1: 'Loan',
#                 2: 'Guarantee',
#                 3: 'Promissory Notes',
#                 4: 'Participation Rights',
#                 5: 'Overdraft',
#                 6: 'Letter of Credit',
#                 7: 'Working Capital Facility',
#                 8: 'Other'},
#     "Seniority":{1: 'Senior Secured',
#                 2: 'Senior Unsecured',
#                 3: 'Junior',
#                 4: 'Junior Unsecured',
#                 5: 'Other'},
#     "Purpose":{1: 'Purchase',
#             2: 'Re-mortgage',
#             3: 'Renovation',
#             4: 'Equity release',
#             5: 'Construction Real Estate',
#             6: 'Construction Other',
#             7: 'Debt consolidation',
#             8: 'Re-mortgage with Equity Release',
#             9: 'Re-mortgage on Different Terms',
#             10: 'Combination Mortgage',
#             11: 'Investment Mortgage',
#             12: 'Working Capital',
#             13: 'Other'},
#     "Principal Payment Frequency":{1: 'Monthly',
#                                 2: 'Quarterly',
#                                 3: 'Semi annually',
#                                 4: 'Annual',
#                                 5: 'Bullet',
#                                 6: 'Other'},
#     "Interest Payment Frequency":{1: 'Linear',
#                             2: 'French',
#                             3: 'Fix Amortisation Schedule',
#                             4: 'Bullet',
#                             5: 'Partial Bullet',
#                             6: 'Revolving',
#                             7: 'Other'},
#     "Type of Loan":{1: 'Term', 2: 'Revolving Credit Line', 3: 'Other'},
#     "Payment type":{1: 'Direct Debit', 2: 'Standing Order', 3: 'Cheque', 4: 'Cash', 5: 'Other'},
#     "Interest Rate Type":{1: 'Floating rate loan for life',
#                         2: "Floating rate loan linked to Libor, Euribor, BoE reverting to the Bank's SVR, ECB reverting to Bank’s SVR",
#                         3: 'Fixed rate loan for life',
#                         4: 'Fixed with future periodic resets',
#                         5: 'Fixed rate loan with compulsory future switch to floating',
#                         6: 'Capped',
#                         7: 'Discount',
#                         8: 'Switch Optionality',
#                         9: 'Borrower Swapped',
#                         10: 'Other'},
#     "Current Interest Rate Index":{1: '1 month LIBOR',
#                                 2: '1 month EURIBOR',
#                                 3: '3 month LIBOR',
#                                 4: '3 month EURIBOR',
#                                 5: '6 month LIBOR',
#                                 6: '6 month EURIBOR',
#                                 7: '12 month LIBOR',
#                                 8: '12 month EURIBOR',
#                                 9: 'BoE Base Rate',
#                                 10: 'ECB Base Rate',
#                                 11: 'Standard Variable Rate',
#                                 12: 'Other'},
#     "Interest Reset Period":{1: 'Annual',
#                             2: 'Semi-annual',
#                             3: 'Quarterly',
#                             4: 'Monthly',
#                             5: 'Not apply',
#                             6: 'Other'},
#     "Reason for Default (Basel II definition)":{1: 'Bankruptcy / Insolvency',
#                                             2: 'Failure to Pay',
#                                             3: 'Breach of Terms',
#                                             4: 'Other'},
#     "Recovery Source":{1: 'Bankruptcy / Insolvency',
#                         2: 'Failure to Pay',
#                         3: 'Breach of Terms',
#                         4: 'Other'},
#     "Recovery Source":{1: 'Liquidation of Collateral',
#                         2: 'Enforcement of Guarantees',
#                         3: 'Additional Lending',
#                         4: 'Cash Recoveries',
#                         5: 'Mixed',
#                         6: 'Other'},
# }

In [8]:
import csv 
list_dfs = []
for csv_f in all_files:
    col_names = []
    content = []
    with open(csv_f,'r') as f:
        for i, line in enumerate(csv.reader(f)):
            if i == 0:
                col_names = line
            elif i == 1:
                continue
            else:
                content.append(line)
        df = spark.createDataFrame(content, col_names)
        list_dfs.append(df)
assets_df = reduce(DataFrame.union, list_dfs)

In [9]:
def replace_no_data(df):
    # For ND  values
    for col_name in df.columns:
        df = df.withColumn(col_name, F.when(F.col(col_name).startswith("ND"), None)
                                    .otherwise(F.col(col_name)))
    return df

test_1 = replace_no_data(assets_df)

test_1.show()

+----------+------+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+-----+----+----+----+-------+-------+----+----+----------+---------+---------+----+----+----+----+----------+----+-------+-------+----+----+----------+----+----+-------+----+----------+-----------+----------+----+----+----------+----+----+----+----+----+----+----+----------+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|       AS1|   AS3|AS15|AS16| AS17|AS18|AS19|AS20|AS21|AS22|AS23|AS24|AS25|AS26|AS27|AS28|AS29|AS30|AS31|AS32|AS33|AS34|AS35|AS36| AS37|AS38|AS39|AS40|AS41| AS42|AS43|AS44|AS45|   AS50|   AS51|AS52|AS53|      AS54|     AS55|     AS56|AS57|AS58|AS59|AS60|      AS61|AS62|   AS63|   AS64|AS65|AS66|      AS67|AS68|AS69|   AS70|AS71|      AS80|

In [10]:
def replace_bool_data(df):
    # For  boolean values
    for col_name in df.columns:
        df = df.withColumn(col_name, F.when(F.col(col_name) == "Y", "True")
                                    .when(F.col(col_name) == "N", "False")
                                    .otherwise(F.col(col_name)))
    return df
test_2 = replace_bool_data(test_1)
test_2.show()

+----------+------+----+----+-----+----+----+----+----+----+-----+----+----+----+----+----+-----+----+----+----+----+----+----+----+-----+----+----+----+----+-----+----+----+----+-------+-------+----+-----+----------+---------+---------+----+----+----+----+----------+----+-------+-------+----+----+----------+----+----+-------+----+----------+-----------+----------+----+----+----------+----+----+----+----+----+----+----+----------+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|       AS1|   AS3|AS15|AS16| AS17|AS18|AS19|AS20|AS21|AS22| AS23|AS24|AS25|AS26|AS27|AS28| AS29|AS30|AS31|AS32|AS33|AS34|AS35|AS36| AS37|AS38|AS39|AS40|AS41| AS42|AS43|AS44|AS45|   AS50|   AS51|AS52| AS53|      AS54|     AS55|     AS56|AS57|AS58|AS59|AS60|      AS61|AS62|   AS63|   AS64|AS65|AS66|      AS67|AS68|AS69|   AS70|AS71|     

In [11]:
def cast_to_datatype(df, columns):
    for col_name, data_type in columns.items():
        if data_type == BooleanType():
            df=df.withColumn("tmp_col_name", F.col(col_name).contains("True")).drop(col_name).withColumnRenamed("tmp_col_name", col_name)
        if data_type == DateType():
           df=df.withColumn("tmp_col_name", F.to_date(F.col(col_name))).drop(col_name).withColumnRenamed("tmp_col_name", col_name)
        if data_type == DoubleType():
           df=df.withColumn("tmp_col_name", F.round(F.col(col_name).cast(DoubleType()),2)).drop(col_name).withColumnRenamed("tmp_col_name", col_name) 
    return df

test_3 = cast_to_datatype(test_2, asset_columns)
test_3.printSchema()

root
 |-- AS3: string (nullable = true)
 |-- AS15: string (nullable = true)
 |-- AS16: string (nullable = true)
 |-- AS17: string (nullable = true)
 |-- AS18: string (nullable = true)
 |-- AS21: string (nullable = true)
 |-- AS22: string (nullable = true)
 |-- AS24: string (nullable = true)
 |-- AS25: string (nullable = true)
 |-- AS26: string (nullable = true)
 |-- AS32: string (nullable = true)
 |-- AS33: string (nullable = true)
 |-- AS34: string (nullable = true)
 |-- AS35: string (nullable = true)
 |-- AS36: string (nullable = true)
 |-- AS42: string (nullable = true)
 |-- AS43: string (nullable = true)
 |-- AS45: string (nullable = true)
 |-- AS52: string (nullable = true)
 |-- AS57: string (nullable = true)
 |-- AS58: string (nullable = true)
 |-- AS59: string (nullable = true)
 |-- AS62: string (nullable = true)
 |-- AS65: string (nullable = true)
 |-- AS68: string (nullable = true)
 |-- AS83: string (nullable = true)
 |-- AS84: string (nullable = true)
 |-- AS89: string (nulla

In [12]:
test_3.show(n=5,truncate=False)

+------+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+-----+-----+----------+----+----+-----+----+----+-----+----+----+----+----+----+----+----+----+----------+----------+-----+---------+---------+---------+----+----+-------+-------+----+----------+----+----------+----+----+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|AS3   |AS15|AS16|AS17 |AS18|AS21|AS22|AS24|AS25|AS26|AS32|AS33|AS34|AS35|AS36|AS42 |AS43|AS45|AS52|AS57|AS58|AS59|AS62|AS65|AS68|AS83|AS84|AS89|AS92|AS94|AS111|AS123|AS129|AS1       |AS19|AS20|AS23 |AS27|AS28|AS29 |AS30|AS31|AS37|AS38|AS39|AS40|AS41|AS44|AS50      |AS51      |AS53 |AS54     |AS55     |AS56     |AS60|AS61|AS63   |AS64   |AS66|AS67      |AS69|AS70   

In [24]:
# Unable to use Delta Lake locally due to concurrency errors
# test_3.write.mode("overwrite").format("delta").save("/home/jovyan/work/data/output/bronze/asset_bronze.delta")
test_3.write.parquet("../data/output/bronze/asset_bronze.parquet")

## Attempt to read XML Deal Detail file

In [26]:
# !conda install -y -c conda-forge lxml

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libxslt-1.1.35             |       h1116d7b_1         582 KB  conda-forge
    lxml-4.9.1                 |  py310h5764c6d_0         1.4 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.9 MB

The following NEW packages will be INSTALLED:

  libxslt            conda-forge/linux-64::libxslt-1.1.35-h1116d7b_1 None
  lxml               conda-forge/linux-64::lxml-4.9.1-py310h5764c6d_0 None



Downloading and Extracting Packages
lxml-4.9.1           | 1.4 MB    | ##################################### | 100% 
libxslt-1.1.35       | 582 KB    | ##################################### |

In [34]:
deal_detail_file = "../data/SMESIT000211100520189/SMESIT000211100520189_Deal_Details.xml"
df = pd.read_xml(deal_detail_file,  xpath="./s:Envelope/s:Body/SearchDealResponse/Deals/DealDetail",namespaces={"s":"http://www.w3.org/2003/05/soap-envelope"} )
df.head()

ValueError: xpath does not return any nodes. Be sure row level nodes are in xpath. If document uses namespaces denoted with xmlns, be sure to define namespaces and use them in xpath.