# 1.0 Data preparation

In [1]:
import numpy as np
import pandas as pd
import math
import base64

In [2]:
# Disable truncation so displayed frames show every row and every column
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Load the raw lender CSV into a DataFrame and preview its first rows
raw_lender_csv = '../data/raw/lender_data/lender_data.csv'
df = pd.read_csv(raw_lender_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,RSSDHCR,OriginatingLender,OriginatingLenderCity,OriginatingLenderLocationID,OriginatingLenderState,CurrentApprovalAmount,ForgivenessAmount,ForgivnessPct,NAMEHCR,UNINUM,NM_LGL,NM_SHORT,CERT
0,0,1020395.0,CCB COMMUNITY BANK,ANDALUSIA,26.0,AL,21420366.01,21224047.72,0.990835,SOUTHERN NATIONAL CORP,10739.0,SOUTHERN NATIONAL CORPORATION ...,SOUTHERN NAT CORP,16595.0
1,1,1020582.0,WOODTRUST BANK,WISCONSIN RAPIDS,77051.0,WI,31543689.32,31834545.84,1.009221,WOODTRUST FINANCIAL CORP,3599.0,WOODTRUST FINANCIAL CORPORATION ...,WOODTRUST FC,5335.0
2,2,1020667.0,NORTHWEST BANK & TRUST COMPANY,DAVENPORT,24824.0,IA,26963725.0,26824637.15,0.994842,NORTHWEST INVESTMENT CORP,10071.0,NORTHWEST INVESTMENT CORP. ...,NORTHWEST INV CORP,15830.0
3,3,1020676.0,AMALGAMATED BANK OF CHICAGO,CHICAGO,20494.0,IL,3049177.89,3036048.99,0.995694,AMALGAMATED INVESTMENTS CO,581.0,AMALGAMATED INVESTMENTS COMPANY ...,AMALGAMATED INV CO,903.0
4,4,1020854.0,FIRSTBANK OF NEBRASKA,WAHOO,42693.0,NE,10426743.02,10401369.48,0.997566,BANK MANAGEMENT INC,3718.0,"BANK MANAGEMENT, INC. ...",BANK MGMT,5486.0


In [4]:
# List the column labels of the frame as loaded from the raw CSV
df.columns

Index(['Unnamed: 0', 'RSSDHCR', 'OriginatingLender', 'OriginatingLenderCity',
       'OriginatingLenderLocationID', 'OriginatingLenderState',
       'CurrentApprovalAmount', 'ForgivenessAmount', 'ForgivnessPct',
       'NAMEHCR', 'UNINUM', 'NM_LGL', 'NM_SHORT', 'CERT'],
      dtype='object')

In [5]:
# Rename the CamelCase source columns to underscore-separated names.
# - Each source column appears exactly once (the original literal listed
#   "OriginatingLenderLocationID" twice; a repeated key in a dict literal is
#   silently collapsed, so the resulting mapping is unchanged).
# - RSSDHCR, UNINUM and NM_LGL map to themselves, i.e. they keep their names.
# - The "Forgivness" spelling comes from the source column "ForgivnessPct"
#   and is kept as-is so the output column name does not change.
columns_dict = {
    "RSSDHCR": "RSSDHCR",
    "OriginatingLender": "Originating_Lender",
    "OriginatingLenderCity": "Originating_Lender_City",
    "OriginatingLenderLocationID": "Originating_Lender_Location_ID",
    "OriginatingLenderState": "Originating_Lender_State",
    "CurrentApprovalAmount": "Current_Approval_Amount",
    "ForgivenessAmount": "Forgiveness_Amount",
    "ForgivnessPct": "Forgivness_Pct",
    "NAMEHCR": "Name_HCR",
    "UNINUM": "UNINUM",
    "NM_LGL": "NM_LGL",
    "NM_SHORT": "NM_Short",
    "CERT": "Cert",
}
df = df.rename(columns=columns_dict)
df.columns

Index(['Unnamed: 0', 'RSSDHCR', 'Originating_Lender',
       'Originating_Lender_City', 'Originating_Lender_Location_ID',
       'Originating_Lender_State', 'Current_Approval_Amount',
       'Forgiveness_Amount', 'Forgivness_Pct', 'Name_HCR', 'UNINUM', 'NM_LGL',
       'NM_Short', 'Cert'],
      dtype='object')

In [6]:
# Drop the "Unnamed: 0" column (presumably a row index that an earlier
# to_csv call wrote into the raw file -- verify against the data source).
# errors="ignore" makes this cell safe to re-run: without it, running the cell
# a second time raises KeyError because the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [7]:
# Preview the first rows after the "Unnamed: 0" column has been dropped
df.head()

Unnamed: 0,RSSDHCR,Originating_Lender,Originating_Lender_City,Originating_Lender_Location_ID,Originating_Lender_State,Current_Approval_Amount,Forgiveness_Amount,Forgivness_Pct,Name_HCR,UNINUM,NM_LGL,NM_Short,Cert
0,1020395.0,CCB COMMUNITY BANK,ANDALUSIA,26.0,AL,21420366.01,21224047.72,0.990835,SOUTHERN NATIONAL CORP,10739.0,SOUTHERN NATIONAL CORPORATION ...,SOUTHERN NAT CORP,16595.0
1,1020582.0,WOODTRUST BANK,WISCONSIN RAPIDS,77051.0,WI,31543689.32,31834545.84,1.009221,WOODTRUST FINANCIAL CORP,3599.0,WOODTRUST FINANCIAL CORPORATION ...,WOODTRUST FC,5335.0
2,1020667.0,NORTHWEST BANK & TRUST COMPANY,DAVENPORT,24824.0,IA,26963725.0,26824637.15,0.994842,NORTHWEST INVESTMENT CORP,10071.0,NORTHWEST INVESTMENT CORP. ...,NORTHWEST INV CORP,15830.0
3,1020676.0,AMALGAMATED BANK OF CHICAGO,CHICAGO,20494.0,IL,3049177.89,3036048.99,0.995694,AMALGAMATED INVESTMENTS CO,581.0,AMALGAMATED INVESTMENTS COMPANY ...,AMALGAMATED INV CO,903.0
4,1020854.0,FIRSTBANK OF NEBRASKA,WAHOO,42693.0,NE,10426743.02,10401369.48,0.997566,BANK MANAGEMENT INC,3718.0,"BANK MANAGEMENT, INC. ...",BANK MGMT,5486.0


In [8]:
# Collect the labels of every column whose dtype is 'object', then display them
strings = [label for label, dtype in df.dtypes.items() if dtype == 'object']
strings

['Originating_Lender',
 'Originating_Lender_City',
 'Originating_Lender_State',
 'Name_HCR',
 'NM_LGL',
 'NM_Short']

In [9]:
# Upper-case the values of every column listed in `strings`
# (missing values are left as they are by the .str accessor)
df[strings] = df[strings].apply(lambda text_column: text_column.str.upper())

In [10]:
# Preview the first rows after upper-casing the string columns
df.head()

Unnamed: 0,RSSDHCR,Originating_Lender,Originating_Lender_City,Originating_Lender_Location_ID,Originating_Lender_State,Current_Approval_Amount,Forgiveness_Amount,Forgivness_Pct,Name_HCR,UNINUM,NM_LGL,NM_Short,Cert
0,1020395.0,CCB COMMUNITY BANK,ANDALUSIA,26.0,AL,21420366.01,21224047.72,0.990835,SOUTHERN NATIONAL CORP,10739.0,SOUTHERN NATIONAL CORPORATION ...,SOUTHERN NAT CORP,16595.0
1,1020582.0,WOODTRUST BANK,WISCONSIN RAPIDS,77051.0,WI,31543689.32,31834545.84,1.009221,WOODTRUST FINANCIAL CORP,3599.0,WOODTRUST FINANCIAL CORPORATION ...,WOODTRUST FC,5335.0
2,1020667.0,NORTHWEST BANK & TRUST COMPANY,DAVENPORT,24824.0,IA,26963725.0,26824637.15,0.994842,NORTHWEST INVESTMENT CORP,10071.0,NORTHWEST INVESTMENT CORP. ...,NORTHWEST INV CORP,15830.0
3,1020676.0,AMALGAMATED BANK OF CHICAGO,CHICAGO,20494.0,IL,3049177.89,3036048.99,0.995694,AMALGAMATED INVESTMENTS CO,581.0,AMALGAMATED INVESTMENTS COMPANY ...,AMALGAMATED INV CO,903.0
4,1020854.0,FIRSTBANK OF NEBRASKA,WAHOO,42693.0,NE,10426743.02,10401369.48,0.997566,BANK MANAGEMENT INC,3718.0,"BANK MANAGEMENT, INC. ...",BANK MGMT,5486.0


In [11]:
# Count the missing values in each column
df.isna().sum()

RSSDHCR                           1996
Originating_Lender                   0
Originating_Lender_City              0
Originating_Lender_Location_ID       0
Originating_Lender_State             0
Current_Approval_Amount              0
Forgiveness_Amount                   0
Forgivness_Pct                       0
Name_HCR                          1996
UNINUM                            1996
NM_LGL                            2073
NM_Short                          2073
Cert                              1443
dtype: int64

In [12]:
# Replace missing values with "Not Available" in the string columns only;
# columns not listed in `strings` are left untouched
df[strings] = df[strings].fillna('Not Available')

In [13]:
# Preview the first rows after filling nulls in the string columns
df.head()

Unnamed: 0,RSSDHCR,Originating_Lender,Originating_Lender_City,Originating_Lender_Location_ID,Originating_Lender_State,Current_Approval_Amount,Forgiveness_Amount,Forgivness_Pct,Name_HCR,UNINUM,NM_LGL,NM_Short,Cert
0,1020395.0,CCB COMMUNITY BANK,ANDALUSIA,26.0,AL,21420366.01,21224047.72,0.990835,SOUTHERN NATIONAL CORP,10739.0,SOUTHERN NATIONAL CORPORATION ...,SOUTHERN NAT CORP,16595.0
1,1020582.0,WOODTRUST BANK,WISCONSIN RAPIDS,77051.0,WI,31543689.32,31834545.84,1.009221,WOODTRUST FINANCIAL CORP,3599.0,WOODTRUST FINANCIAL CORPORATION ...,WOODTRUST FC,5335.0
2,1020667.0,NORTHWEST BANK & TRUST COMPANY,DAVENPORT,24824.0,IA,26963725.0,26824637.15,0.994842,NORTHWEST INVESTMENT CORP,10071.0,NORTHWEST INVESTMENT CORP. ...,NORTHWEST INV CORP,15830.0
3,1020676.0,AMALGAMATED BANK OF CHICAGO,CHICAGO,20494.0,IL,3049177.89,3036048.99,0.995694,AMALGAMATED INVESTMENTS CO,581.0,AMALGAMATED INVESTMENTS COMPANY ...,AMALGAMATED INV CO,903.0
4,1020854.0,FIRSTBANK OF NEBRASKA,WAHOO,42693.0,NE,10426743.02,10401369.48,0.997566,BANK MANAGEMENT INC,3718.0,"BANK MANAGEMENT, INC. ...",BANK MGMT,5486.0


In [14]:
# Write the cleaned frame to disk.
# index=False keeps the row index out of the file; when the index is written,
# it becomes an unnamed first column that read_csv loads back as "Unnamed: 0",
# which is the same stray column this notebook has to drop from the raw file.
# NOTE(review): downstream readers that drop "Unnamed: 0" or pass index_col=0
# for this file no longer need to -- confirm and adjust them.
df.to_csv(
    '../data/cleaned/lender_data/lender_data.csv',
    encoding="utf-8",
    index=False,
)