## Creating External tables

In [1]:
from pyspark.sql import SparkSession
import getpass 
# Current OS user; used below to build the per-user warehouse directory.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Load the cleaned customers dataset from its Parquet location.
# NOTE(review): customers_df is not referenced by any later cell visible here.
customers_df = (
    spark.read
    .format("parquet")
    .load("/public/trendytech/lendingclubproject/cleaned/customers_parquet")
)

In [4]:
# Create the project database. IF NOT EXISTS keeps this cell re-runnable:
# a plain CREATE DATABASE raises once the database is already present.
spark.sql("create database if not exists itv017499_lending_club")

### customers table

In [6]:
# Create the customers external table over the cleaned Parquet files.
# IF NOT EXISTS makes the cell safe to re-run; note that an existing table
# with this name is left untouched, even if its schema differs from this one.
# NOTE(review): `join_annual_income` may have been meant as
# `joint_annual_income` - confirm against the column names in the Parquet files.
spark.sql("""
    CREATE EXTERNAL TABLE IF NOT EXISTS itv017499_lending_club.customers (
        member_id STRING, 
        emp_title STRING, 
        emp_length INT, 
        home_ownership STRING, 
        annual_income FLOAT, 
        address_state STRING, 
        address_zipcode STRING, 
        address_country STRING, 
        grade STRING, 
        sub_grade STRING, 
        verification_status STRING, 
        total_high_credit_limit FLOAT, 
        application_type STRING, 
        join_annual_income FLOAT, 
        verification_status_joint STRING, 
        ingest_date TIMESTAMP
    )
    STORED AS PARQUET 
    LOCATION '/public/trendytech/lendingclubproject/cleaned/customers_parquet'
""")


In [17]:
# Preview five rows of the customers table (the query specifies no ordering).
spark.sql(
    "select * from itv017499_lending_club.customers limit 5"
)

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
de8d40bb549f3e676...,Slot Club Lead,10,RENT,42000.0,CO,800xx,USA,A,A5,Source Verified,19200.0,Individual,,,2023-09-15 17:18:...
cb2bfe6ba8d8040ea...,Development Manager,1,MORTGAGE,120000.0,NJ,080xx,USA,B,B2,Verified,521247.0,Individual,,,2023-09-15 17:18:...
fd6cd0f257e376951...,Chef/Baker,5,RENT,25000.0,GA,310xx,USA,E,E2,Source Verified,18200.0,Individual,,,2023-09-15 17:18:...
46a7d1486ffcd4024...,Service Manager,3,MORTGAGE,122000.0,NY,113xx,USA,C,C3,Verified,232468.0,Individual,,,2023-09-15 17:18:...
ec70891c26a58aad8...,Maintenance Manager,1,RENT,35000.0,CO,804xx,USA,C,C1,Verified,16200.0,Individual,,,2023-09-15 17:18:...


### loans table

In [8]:
# Create the loans external table over the cleaned Parquet files.
# IF NOT EXISTS makes the cell safe to re-run; note that an existing table
# with this name is left untouched, even if its schema differs from this one.
spark.sql("""
    CREATE EXTERNAL TABLE IF NOT EXISTS itv017499_lending_club.loans (
        loan_id STRING, 
        member_id STRING, 
        loan_amount FLOAT, 
        funded_amount FLOAT, 
        loan_term_years INTEGER, 
        interest_rate FLOAT, 
        monthly_installment FLOAT, 
        issue_date STRING, 
        loan_status STRING, 
        loan_purpose STRING, 
        loan_title STRING, 
        ingest_date TIMESTAMP
    )
    STORED AS PARQUET 
    LOCATION '/public/trendytech/lendingclubproject/cleaned/loans_parquet'
""")


In [18]:
# Preview five rows of the loans table (the query specifies no ordering).
spark.sql(
    "select * from itv017499_lending_club.loans limit 5"
)

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
56633077,b59d80da191f5b573...,3000.0,3000.0,3,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2023-09-18 18:39:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,3,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2023-09-18 18:39:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,3,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2023-09-18 18:39:...
56463188,e12aefc548f750777...,11200.0,11200.0,5,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2023-09-18 18:39:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,5,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2023-09-18 18:39:...


### loans_repayments table

In [10]:
# Create the loans_repayments external table over the cleaned Parquet files.
# IF NOT EXISTS makes the cell safe to re-run; note that an existing table
# with this name is left untouched, even if its schema differs from this one.
spark.sql("""
    CREATE EXTERNAL TABLE IF NOT EXISTS itv017499_lending_club.loans_repayments (
        loan_id STRING, 
        total_principal_received FLOAT, 
        total_interest_received FLOAT, 
        total_late_fee_received FLOAT, 
        total_payment_received FLOAT, 
        last_payment_amount FLOAT, 
        last_payment_date STRING, 
        next_payment_date STRING, 
        ingest_date TIMESTAMP
    )
    STORED AS PARQUET 
    LOCATION '/public/trendytech/lendingclubproject/cleaned/loans_repayments_parquet'
""")

In [19]:
# Preview five rows of the loans_repayments table (the query specifies no ordering).
spark.sql(
    "select * from itv017499_lending_club.loans_repayments limit 5"
)

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
28674390,7884.28,5178.41,0.0,14329.62,482.23,Nov-2016,,2023-09-19 05:25:...
28563513,9000.0,955.61,0.0,9955.61,6420.29,Nov-2015,,2023-09-19 05:25:...
28694706,22125.0,751.39,0.0,22876.39,22073.85,Dec-2014,,2023-09-19 05:25:...
28722876,3196.23,1259.77,0.0,4639.79,178.24,Nov-2016,,2023-09-19 05:25:...
28644402,26400.0,2150.7,0.0,28550.7,19292.99,Oct-2015,,2023-09-19 05:25:...


### loans_defaulters_delinq table

In [12]:
# Create the loans_defaulters_delinq external table over the cleaned Parquet files.
# IF NOT EXISTS makes the cell safe to re-run; note that an existing table
# with this name is left untouched, even if its schema differs from this one.
spark.sql("""
    CREATE EXTERNAL TABLE IF NOT EXISTS itv017499_lending_club.loans_defaulters_delinq (
        member_id STRING, 
        delinq_2yrs INTEGER, 
        delinq_amnt FLOAT, 
        mths_since_last_delinq INTEGER
    )
    STORED AS PARQUET 
    LOCATION '/public/trendytech/lendingclubproject/cleaned/loans_defaulters_delinq_parquet'
""")

In [21]:
# Preview five rows of the loans_defaulters_delinq table (the query specifies no ordering).
spark.sql(
    "select * from itv017499_lending_club.loans_defaulters_delinq limit 5"
)

member_id,delinq_2yrs,delinq_amnt,mths_since_last_delinq
2163f48a5b1c49f51...,4,0.0,0
b0b8000b6e4dc620e...,2,0.0,0
81d4ca137b1093d71...,1,0.0,0
d35d6ca3d4a1a474e...,0,0.0,15
1d546aec89610c539...,1,0.0,0


### loans_defaulters_detail_rec_enq table

In [14]:
# Create the loans_defaulters_detail_rec_enq external table over the cleaned
# Parquet files.
# IF NOT EXISTS makes the cell safe to re-run; note that an existing table
# with this name is left untouched, even if its schema differs from this one.
spark.sql("""
    CREATE EXTERNAL TABLE IF NOT EXISTS itv017499_lending_club.loans_defaulters_detail_rec_enq (
        member_id STRING, 
        pub_rec INTEGER, 
        pub_rec_bankruptcies INTEGER, 
        inq_last_6mths INTEGER
    )
    STORED AS PARQUET 
    LOCATION '/public/trendytech/lendingclubproject/cleaned/loans_defaulters_detail_records_enq_parquet'
""")

In [22]:
# Preview five rows of the loans_defaulters_detail_rec_enq table (the query specifies no ordering).
spark.sql(
    "select * from itv017499_lending_club.loans_defaulters_detail_rec_enq limit 5"
)

member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths
673da0da820de70f9...,0,0,1
f5725a82fea842f9b...,0,0,2
0e697722917e34328...,0,0,2
e995ef7b48b9a42db...,0,0,1
a3b9c8c1aaa89b52a...,0,0,1
