In [36]:
# !conda install -y -c conda-forge pyspark
# !conda install -y -c conda-forge black

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - black


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    black-22.10.0              |  py310hff52083_0         233 KB  conda-forge
    mypy_extensions-0.4.3      |  py310hff52083_5          11 KB  conda-forge
    pathspec-0.10.1            |     pyhd8ed1ab_0          35 KB  conda-forge
    platformdirs-2.5.2         |     pyhd8ed1ab_1          16 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         294 KB

The following NEW packages will be INSTALLED:

  black              conda-forge/linux-64::black-22.10.0-py310hff52083_0 None
  mypy_extensions    conda-forge/linux-64::mypy_extensions-0.4.3-py310hff52083_5 None
  pathspec           c

In [6]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, DataFrame
from pyspark.conf import SparkConf
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from functools import reduce
import json
import glob

# Create a SparkSession against the "local" master, or reuse the one already
# running in this kernel.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [8]:
# Collect every CSV extract in the assignment folder.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort the
# list to keep the notebook reproducible across machines and re-runs.
all_files = sorted(glob.glob("../data/AssignmentData/*.csv"))
all_files

['../data/AssignmentData/SMESES000065103320166_2016_07_29_Loan_Data_8396a20b-86e1-4cac-8d51-dbecb46130c6.csv',
 '../data/AssignmentData/SMESES000065103320166_2016_10_31_Loan_Data_5370b5fe-b398-4b40-ace5-2691fc3e1504.csv',
 '../data/AssignmentData/SMESES000065103320166_2017_01_31_Loan_Data_6997f5f3-c2e5-41a4-a038-a5497066fe0e.csv',
 '../data/AssignmentData/SMESES000065103320166_2017_04_30_Loan_Data_52e3e2b7-f8b6-49fd-b482-72a0445cfa9a.csv',
 '../data/AssignmentData/SMESES000065103320166_2017_07_31_Loan_Data_d9629927-17f7-48c0-b1f6-ffbd213308d6.csv',
 '../data/AssignmentData/SMESES000065103320166_2017_10_31_Loan_Data_fb648083-daad-4968-beaa-dfc851ec2b4a.csv',
 '../data/AssignmentData/SMESES000065103320166_2018_01_31_Loan_Data_464b3e03-7988-4c86-a729-a0d31f3e7622.csv',
 '../data/AssignmentData/SMESES000065103320166_2018_04_30_Loan_Data_3a6e2077-7e61-4b77-9dd6-6b4ea1794d05.csv',
 '../data/AssignmentData/SMESES000065103320166_2018_07_31_Loan_Data_af106c4d-a055-49a3-8f62-fede0acb7ce9.csv',
 

In [107]:
# Lookup tables for the coded "List" columns: for each column name, map the
# numeric code stored in the CSV to its human-readable label.
# The nested {column: {code: label}} shape is what DataFrame.replace() expects.
mapper_dict = {
    "Obligor Legal Form / Business Type": {
        1: "Public Company",
        2: "Limited Company",
        3: "Partnership",
        4: "Individual",
        5: "Other",
    },
    # NOTE(review): the field list printed further down in this notebook spells
    # this column "Customer segment" (lower-case "s"). Column names are matched
    # exactly, so confirm the casing against the CSV header.
    "Customer Segment": {1: "Medium", 2: "Small", 3: "Micro", 4: "Other"},
    "Borrower Basel III Segment": {
        1: "Corporate",
        2: "SME treated as Corporate",
        3: "Retail",
        4: "Other",
    },
    "Asset Type": {
        1: 'Loan',
        2: 'Guarantee',
        3: 'Promissory Notes',
        4: 'Participation Rights',
        5: 'Overdraft',
        6: 'Letter of Credit',
        7: 'Working Capital Facility',
        8: 'Other',
    },
    "Seniority": {
        1: 'Senior Secured',
        2: 'Senior Unsecured',
        3: 'Junior',
        4: 'Junior Unsecured',
        5: 'Other',
    },
    "Purpose": {
        1: 'Purchase',
        2: 'Re-mortgage',
        3: 'Renovation',
        4: 'Equity release',
        5: 'Construction Real Estate',
        6: 'Construction Other',
        7: 'Debt consolidation',
        8: 'Re-mortgage with Equity Release',
        9: 'Re-mortgage on Different Terms',
        10: 'Combination Mortgage',
        11: 'Investment Mortgage',
        12: 'Working Capital',
        13: 'Other',
    },
    "Principal Payment Frequency": {
        1: 'Monthly',
        2: 'Quarterly',
        3: 'Semi annually',
        4: 'Annual',
        5: 'Bullet',
        6: 'Other',
    },
    # NOTE(review): these labels (Linear, French, ...) read like amortisation
    # types rather than payment frequencies - presumably this table belongs to
    # an amortisation-type column; verify against the data template.
    "Interest Payment Frequency": {
        1: 'Linear',
        2: 'French',
        3: 'Fix Amortisation Schedule',
        4: 'Bullet',
        5: 'Partial Bullet',
        6: 'Revolving',
        7: 'Other',
    },
    "Type of Loan": {1: 'Term', 2: 'Revolving Credit Line', 3: 'Other'},
    "Payment type": {
        1: 'Direct Debit',
        2: 'Standing Order',
        3: 'Cheque',
        4: 'Cash',
        5: 'Other',
    },
    "Interest Rate Type": {
        1: 'Floating rate loan for life',
        2: "Floating rate loan linked to Libor, Euribor, BoE reverting to the Bank's SVR, ECB reverting to Bank’s SVR",
        3: 'Fixed rate loan for life',
        4: 'Fixed with future periodic resets',
        5: 'Fixed rate loan with compulsory future switch to floating',
        6: 'Capped',
        7: 'Discount',
        8: 'Switch Optionality',
        9: 'Borrower Swapped',
        10: 'Other',
    },
    "Current Interest Rate Index": {
        1: '1 month LIBOR',
        2: '1 month EURIBOR',
        3: '3 month LIBOR',
        4: '3 month EURIBOR',
        5: '6 month LIBOR',
        6: '6 month EURIBOR',
        7: '12 month LIBOR',
        8: '12 month EURIBOR',
        9: 'BoE Base Rate',
        10: 'ECB Base Rate',
        11: 'Standard Variable Rate',
        12: 'Other',
    },
    "Interest Reset Period": {
        1: 'Annual',
        2: 'Semi-annual',
        3: 'Quarterly',
        4: 'Monthly',
        5: 'Not apply',
        6: 'Other',
    },
    "Reason for Default (Basel II definition)": {
        1: 'Bankruptcy / Insolvency',
        2: 'Failure to Pay',
        3: 'Breach of Terms',
        4: 'Other',
    },
    # This key used to be defined twice; the first definition was a copy of the
    # "Reason for Default" labels and was silently overwritten by this one, so
    # only the effective definition is kept.
    "Recovery Source": {
        1: 'Liquidation of Collateral',
        2: 'Enforcement of Guarantees',
        3: 'Additional Lending',
        4: 'Cash Recoveries',
        5: 'Mixed',
        6: 'Other',
    },
}

In [145]:
# Replace "No Data" markers with NaN and add a "*_reason" column holding the
# literal explanation of the marker's code.
# Value	Reason	Example
# 1	Data not collected as not required by the underwriting criteria	ND,1
# 2	Data collected at application but not loaded in the reporting system at completion	ND,2
# 3	Data collected at application but loaded in a separate system from the reporting one	ND,3
# 4	Data collected but will only be available from YYYY-MM	ND,4,YYYY-MM
# 5	Not relevant at the present time	ND,5
# 6	Not applicable for the jurisdiction (see table in 'Assets' and 'Collateral' sheets)	ND,6

def replace_no_data(df):
    """Split "No Data" markers out of every column that contains them.

    A cell counts as a marker when it is the string ``"ND"`` or starts with
    ``"ND,"`` (for example ``"ND,5"`` or ``"ND,4,2021-03"``). Every column that
    holds at least one marker is replaced by two new columns:

    * ``<column>_value``  - the original cell, or NaN where the cell was a marker;
    * ``<column>_reason`` - the explanation of the marker's code, NaN for
      ordinary cells, for a bare ``"ND"`` and for unknown codes.

    Only marker cells are parsed, so ordinary values that contain a comma and
    non-string values are carried over to ``<column>_value`` unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    no_data_explanations = {
        '1': "Data not collected as not required by the underwriting criteria",
        '2': "Data collected at application but not loaded in the reporting system at completion",
        '3': "Data collected at application but loaded in a separate system from the reporting one",
        '4': "Data collected but will only be available in later releases",
        '5': "Not relevant at the present time",
        '6': "Not applicable for the jurisdiction",
    }

    def _is_marker(cell):
        # Only real strings can be markers; numbers and NaN never are.
        return isinstance(cell, str) and (cell == "ND" or cell.startswith("ND,"))

    def _reason_for(cell):
        # Explanation text for a marker cell, NaN for everything else.
        if not _is_marker(cell):
            return np.nan
        parts = cell.split(",")
        # A bare "ND" carries no code; "ND,4,YYYY-MM" carries the code in slot 1.
        code = parts[1].strip() if len(parts) > 1 else None
        return no_data_explanations.get(code, np.nan)

    col_names_to_drop = []
    # Iterate over a snapshot: the loop adds new columns to df.
    for col_name in list(df.columns):
        column = df[col_name]
        marker_mask = column.map(_is_marker).astype(bool)
        if not marker_mask.any():
            continue
        col_names_to_drop.append(col_name)
        # Blank out the markers but keep every other cell exactly as it was.
        df[f"{col_name}_value"] = column.mask(marker_mask)
        df[f"{col_name}_reason"] = column.map(_reason_for)

    df.drop(col_names_to_drop, axis=1, inplace=True)
    # No frame-wide replace of "ND" is needed: every column that contained a
    # marker (bare "ND" included) has been rebuilt above without it.
    # Normalise remaining missing-value sentinels (e.g. None) to NaN.
    df.fillna(np.nan, inplace=True)
    return df

In [50]:
# desc="""Liquidation of Collateral (1)
# Enforcement of Guarantees (2)
# Additional Lending (3)
# Cash Recoveries (4)
# Mixed (5)
# Other (6)
# No Data (ND)"""

# di = {}
# for l in desc.split("\n"):
#     try:
#         index = int(l[l.find('(')+1:l.find(')')])
#         value = l[:l.find('(')].strip()
#         di[index]=value
#     except:
#         continue
# di

{1: 'Liquidation of Collateral',
 2: 'Enforcement of Guarantees',
 3: 'Additional Lending',
 4: 'Cash Recoveries',
 5: 'Mixed',
 6: 'Other'}

Obligor Incorporation Date
Obligor is a Customer since?
Customer segment
Borrower Basel III Segment
Obligor Tax Code
Last Internal Obligor Rating Review
S&P Public Rating (equivalent)
Moody's Public Rating (equivalent)
Fitch Public Rating (equivalent)
Dominion Bond Rating Service (DBRS) Public Rating (equivalent)
Other Public Rating
S&P Industry Code
Moody's Industry Code
Fitch Industry Code
Other Industry Code
Principal Grace Period End Date
Interest Grace Period End Date
Interest Cap Rate
Interest Floor Rate
Revision Margin 1
Interest Revision Date 1
Revision Margin 2
Interest Revision Date 2
Revision Margin 3
Interest Revision Date 3
Reason for Default (Basel II definition)
Default Date
Bank Internal Rating Prior to Default
Legal Proceedings Start Date
Cumulative Recoveries
Recovery Source
Work-out Process Started
Work-out Process Complete
Redemption Date
Date Loss Allocated
Real Estate Sale Price
Total Proceeds from Other Collateral or Guarantees
Date of End of Work-out
Foreclosure Cost


In [146]:
# Load a single extract; header=1 takes the column names from the second line
# of the file.
df = pd.read_csv(
    "../data/AssignmentData/SMESES000065103320166_2020_10_31_Loan_Data_54202492-ea0e-4425-a330-3e8b787c4b1d.csv",
    header=1,
)
# Swap the numeric list codes for their labels, then split the "ND" markers
# into *_value / *_reason columns.
mapped_df = df.replace(to_replace=mapper_dict)
clened_df = replace_no_data(df=mapped_df)

In [149]:
# Spot-check: distinct "No Data" explanations generated for the
# "Borrower Basel III Segment" column by replace_no_data().
clened_df["Borrower Basel III Segment_reason"].unique()

array([nan, 'Not relevant at the present time'], dtype=object)