### Access Patterns (Old Data) and Slow Access (New Data)

In [1]:
from pyspark.sql import SparkSession
import getpass 
# The OS login name is used to build a per-user warehouse directory path.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

### Single view of 5 tables
- Most recent data (queries against it will be slow)
- View updated every 24 hours
- Source of the data is the cleaned data
- The data will be stored in Hive

In [10]:
# Recent data: (re)define a view that joins customers with their loans,
# repayments, delinquency records and public-record/enquiry details.
# Only the definition is stored here; the joins run when the view is queried.

customers_loan_view_ddl = """
create or replace view itv017499_lending_club.customers_loan_view as select
l.loan_id,
c.member_id,
c.emp_title,
c.emp_length,
c.home_ownership,
c.annual_income,
c.address_state,
c.address_zipcode,
c.address_country,
c.grade,
c.sub_grade,
c.verification_status,
c.total_high_credit_limit,
c.application_type,
c.join_annual_income,
c.verification_status_joint,
l.loan_amount,
l.funded_amount,
l.loan_term_years,
l.interest_rate,
l.monthly_installment,
l.issue_date,
l.loan_status,
l.loan_purpose,
r.total_principal_received,
r.total_interest_received,
r.total_late_fee_received,
r.last_payment_date,
r.next_payment_date,
d.delinq_2yrs,
d.delinq_amnt,
d.mths_since_last_delinq,
e.pub_rec,
e.pub_rec_bankruptcies,
e.inq_last_6mths

FROM itv017499_lending_club.customers c
LEFT JOIN itv017499_lending_club.loans l on c.member_id = l.member_id
LEFT JOIN itv017499_lending_club.loans_repayments r ON l.loan_id = r.loan_id
LEFT JOIN itv017499_lending_club.loans_defaulters_delinq d ON c.member_id = d.member_id
LEFT JOIN itv017499_lending_club.loans_defaulters_detail_rec_enq e ON c.member_id = e.member_id
"""

spark.sql(customers_loan_view_ddl)

**Note:** Creating the view is fast because no data is processed at creation time; only the view definition is stored. However, a query that reads from the view, such as the following,

<span style="color:red;">spark.sql("select * from itv017499_lending_club.customers_loan_view limit 5")</span>


will take time to execute, because the joins across the five tables are performed each time the view is queried.

In [11]:
# Preview five rows of the view (the underlying joins are evaluated by this query).
customers_loan_view_preview = spark.sql("select * from itv017499_lending_club.customers_loan_view limit 5")
customers_loan_view_preview

loan_id,member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,total_principal_received,total_interest_received,total_late_fee_received,last_payment_date,next_payment_date,delinq_2yrs,delinq_amnt,mths_since_last_delinq,pub_rec,pub_rec_bankruptcies,inq_last_6mths
140691162,0001bac8ac76718b9...,,6,RENT,9500.0,NY,112xx,USA,E,E5,Not Verified,14400.0,Individual,,,5500.0,5500.0,3,27.27,225.34,Sep-2018,Current,credit_card,637.38,648.56,0.0,Mar-2019,Apr-2019,,,,0,0,1
28543847,000337b0d91283fd5...,SBA Procesing Spe...,7,RENT,55000.0,CA,900xx,USA,D,D3,Not Verified,48871.0,Individual,,,19800.0,19800.0,3,16.99,705.83,Oct-2014,Charged Off,debt_consolidation,14496.97,5266.27,0.0,Feb-2017,,,,,0,0,0
85270527,00055f1f392f63672...,Stationary Engineer,10,RENT,58000.0,TX,775xx,USA,B,B4,Verified,44803.0,Individual,,,5000.0,5000.0,3,10.99,163.67,Jul-2016,Fully Paid,debt_consolidation,5000.0,6.57,0.0,Jul-2016,,,,,0,0,0
140970584,000fc51dd90fb37c3...,SENIOR CENTER DIR...,6,MORTGAGE,37200.0,IL,604xx,USA,B,B3,Verified,86540.0,Individual,,,17600.0,17600.0,3,11.06,576.71,Sep-2018,Current,debt_consolidation,2111.04,782.82,0.0,Feb-2019,Apr-2019,,,,1,1,0
113502847,0010be54b81d0d885...,Parole Officer,10,RENT,46425.0,TX,788xx,USA,C,C2,Source Verified,38049.0,Individual,,,6000.0,6000.0,3,13.59,203.88,Jul-2017,Fully Paid,debt_consolidation,6000.0,610.18,0.0,May-2018,,,,,0,0,1


### Creating a permanent table for the weekly job
- We have a weekly job.
- The join of the 5 tables is precomputed and stored in a table in the database.
- Queries will be faster.
- Although results come back faster in this case, the data can be up to a week old.
- In this case it will be a **managed table**.

In [6]:
# Weekly job: materialise the five-table join into a table so that reads do
# not have to re-run the joins.
#
# Fix: `customers` is now read from itv017499_lending_club, the same database
# as the other four joined tables and as customers_loan_view. The previous
# version read it from itv006277_lending_club, so the table was built from a
# different customers source than the view it is meant to mirror.
#
# NOTE: a plain `create table` raises an error if the table already exists, so
# the existing table must be dropped before this cell is re-run for a refresh.

spark.sql("""
create table itv017499_lending_club.customers_loan_table as select
l.loan_id,
c.member_id,
c.emp_title,
c.emp_length,
c.home_ownership,
c.annual_income,
c.address_state,
c.address_zipcode,
c.address_country,
c.grade,
c.sub_grade,
c.verification_status,
c.total_high_credit_limit,
c.application_type,
c.join_annual_income,
c.verification_status_joint,
l.loan_amount,
l.funded_amount,
l.loan_term_years,
l.interest_rate,
l.monthly_installment,
l.issue_date,
l.loan_status,
l.loan_purpose,
r.total_principal_received,
r.total_interest_received,
r.total_late_fee_received,
r.last_payment_date,
r.next_payment_date,
d.delinq_2yrs,
d.delinq_amnt,
d.mths_since_last_delinq,
e.pub_rec,
e.pub_rec_bankruptcies,
e.inq_last_6mths

FROM itv017499_lending_club.customers c
LEFT JOIN itv017499_lending_club.loans l on c.member_id = l.member_id
LEFT JOIN itv017499_lending_club.loans_repayments r ON l.loan_id = r.loan_id
LEFT JOIN itv017499_lending_club.loans_defaulters_delinq d ON c.member_id = d.member_id
LEFT JOIN itv017499_lending_club.loans_defaulters_detail_rec_enq e ON c.member_id = e.member_id
""")

**Note:** In this case, a managed table is created. The actual data for this table is stored in the warehouse directory, and its metadata is stored in the Hive metastore.

In [7]:
# Preview five rows of the precomputed table.
customers_loan_table_preview = spark.sql("select * from itv017499_lending_club.customers_loan_table limit 5")
customers_loan_table_preview

loan_id,member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,total_principal_received,total_interest_received,total_late_fee_received,last_payment_date,next_payment_date,delinq_2yrs,delinq_amnt,mths_since_last_delinq,pub_rec,pub_rec_bankruptcies,inq_last_6mths
88938989,000170b4ccb292792...,Account Executive,10,MORTGAGE,120000.0,NV,891xx,USA,B,B3,Not Verified,397653.0,Individual,,,15000.0,15000.0,3,10.49,487.47,Sep-2016,Fully Paid,home_improvement,15000.0,2268.66,0.0,Sep-2018,,1.0,0.0,0.0,0,0,1
77426808,000db9b641adb3474...,Payroll supervisor,10,MORTGAGE,65000.0,NC,281xx,USA,C,C1,Not Verified,236632.0,Individual,,,12000.0,12000.0,5,11.99,266.88,May-2016,Current,credit_card,5916.22,3137.72,0.0,Mar-2019,Apr-2019,,,,1,1,0
114171122,001560f90a5abac6e...,LPN,4,RENT,40000.0,PA,172xx,USA,B,B1,Not Verified,72353.0,Individual,,,1500.0,1500.0,3,9.44,48.01,Jul-2017,Current,credit_card,780.96,178.45,0.0,Mar-2019,Apr-2019,,,,0,0,0
88705580,0019e762e317c8c52...,Regional Producti...,6,MORTGAGE,88500.0,TX,750xx,USA,C,C1,Verified,584024.0,Individual,,,12100.0,12100.0,3,12.79,406.48,Aug-2016,Fully Paid,debt_consolidation,12100.0,1477.83,0.0,Oct-2017,,,,,0,0,0
1471488,0019ef80deb07754e...,sierra telephone,8,RENT,45000.0,CA,953xx,USA,B,B3,Verified,,Individual,,,18000.0,18000.0,5,12.12,401.5,Aug-2012,Fully Paid,debt_consolidation,18000.0,5274.58,0.0,Nov-2015,,,,,0,0,0
