## Identifying bad data

In [1]:
# import sparksession
from pyspark.sql import SparkSession
import getpass 
# Current OS user; used below to build the per-user Hive warehouse directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

### Reviewing customer table for bad data

In [2]:
# Count the rows per member_id in the customers table, most-repeated ids first.
# Any total above 1 means the member_id is repeated.
customers_member_id_counts_sql = """
    select member_id, 
           count(*) as total
    from itv017499_lending_club.customers
    group by member_id 
    order by total desc
"""
spark.sql(customers_member_id_counts_sql)

member_id,total
e4c167053d5418230...,5
ad8e5d384dae17e06...,4
76b577467eda5bdbc...,4
3f87585a20f702838...,4
819453be77718d747...,3
22593a1870543b2db...,3
035bf3d8288d803bd...,3
066ddaa64bee66dff...,3
291ca1b09ef11ca3e...,3
5d52e7773cb0efff3...,3


In [3]:
# Inspect every customers row for one repeated member_id (matched by its
# displayed prefix) to see which columns differ between the duplicates.
customers_repeated_member_sql = """
    select *
    from itv017499_lending_club.customers
    where member_id like 'e4c167053d5418230%'
"""
spark.sql(customers_repeated_member_sql)

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,138780.0,Individual,,,2023-09-15 17:18:...
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,171165.0,Individual,,,2023-09-15 17:18:...
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,110907.0,Individual,,,2023-09-15 17:18:...
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,129833.0,Individual,,,2023-09-15 17:18:...
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,207300.0,Individual,,,2023-09-15 17:18:...


**Note:** The `member_id` `e4c167053d5418230` is associated with multiple values for `total_high_credit_limit`, which is inconsistent with the expected data structure. Ideally, each member should have only one `total_high_credit_limit`. This discrepancy indicates that the data is flawed or incorrect.

### Reviewing Loans defaulter delinquent table for bad data

In [4]:
# Count the rows per member_id in loans_defaulters_delinq, most-repeated ids first.
delinq_member_id_counts_sql = """
    select member_id, 
           count(*) as total
    from itv017499_lending_club.loans_defaulters_delinq
    group by member_id 
    order by total desc
"""
spark.sql(delinq_member_id_counts_sql)

member_id,total
e4c167053d5418230...,3
55d55d97420671a1c...,2
77db5fdf951dd04b2...,2
f1ebb6edb9b07de5f...,2
6c2b63ff231e520d4...,2
62394e3f9d063413b...,2
c16e1f92ae5ccc785...,2
17d76ba5141e1c33a...,2
cbede54df344cdb94...,2
b5ded5638e54e1692...,2


In [5]:
# Inspect every loans_defaulters_delinq row for one repeated member_id
# (matched by its displayed prefix).
delinq_repeated_member_sql = """
    select *
    from itv017499_lending_club.loans_defaulters_delinq
    where member_id like 'e4c167053d5418230%'
"""
spark.sql(delinq_repeated_member_sql)

member_id,delinq_2yrs,delinq_amnt,mths_since_last_delinq
e4c167053d5418230...,3,0.0,0
e4c167053d5418230...,1,0.0,0
e4c167053d5418230...,1,0.0,45


**Note:** The data is ambiguous: the same `member_id` is associated with different delinquency values. Both `delinq_2yrs` (3, 1, 1) and `mths_since_last_delinq` (0, 0, 45) vary across its rows, which indicates a flaw in the data. Such discrepancies need to be addressed to ensure data accuracy and reliability.

### Reviewing loans defaulters detail records enquiry table for bad data

In [6]:
# Count the rows per member_id in loans_defaulters_detail_rec_enq,
# most-repeated ids first.
detail_rec_enq_member_id_counts_sql = """
    select member_id, 
           count(*) as total
    from itv017499_lending_club.loans_defaulters_detail_rec_enq
    group by member_id 
    order by total desc
"""
spark.sql(detail_rec_enq_member_id_counts_sql)

member_id,total
e3b0c44298fc1c149...,33
e4c167053d5418230...,5
3f87585a20f702838...,4
ad8e5d384dae17e06...,4
76b577467eda5bdbc...,4
3ae415acd6bbfaac1...,3
53789bea7edc660ed...,3
035bf3d8288d803bd...,3
059d401bb603d9a80...,3
498bb6b1f0099cb47...,3


In [7]:
# Inspect every loans_defaulters_detail_rec_enq row for one repeated member_id
# (matched by its displayed prefix).
detail_rec_enq_repeated_member_sql = """
    select *
    from itv017499_lending_club.loans_defaulters_detail_rec_enq
    where member_id like 'e4c167053d5418230%'
"""
spark.sql(detail_rec_enq_repeated_member_sql)

member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths
e4c167053d5418230...,0,0,0
e4c167053d5418230...,0,0,3
e4c167053d5418230...,0,0,2
e4c167053d5418230...,0,0,1
e4c167053d5418230...,0,0,0


**Note:** The `inq_last_6mths` column is inconsistent for the same `member_id` (`e4c167053d5418230`): its five records show the values 0, 3, 2, 1 and 0. These discrepancies suggest flaws in the data, as the number of inquiries in the last six months should be the same on every record for a single member.

### No of repeating member_ids in the 3 datasets
- customer dataset
- loans_defaulters_delinq dataset
- loans_defaulters_detail_rec_enq

#### No of repeating member_ids in customer dataset

In [8]:
# Collect each member_id that appears on more than one row of the customers
# table. The result holds one row per such member_id.
repeated_customer_ids_sql = """
    select member_id
    from (
        select member_id, 
               count(*) as total
        from itv017499_lending_club.customers
        group by member_id
        having total > 1
    )
"""
bad_data_customer_df = spark.sql(repeated_customer_ids_sql)

In [9]:
# Number of distinct member_ids that occur more than once in the customers table
# (bad_data_customer_df holds one row per such member_id).
bad_data_customer_df.count()

3157

#### No of repeating member_ids in loans_defaulters_delinq dataset

In [10]:
# Collect each member_id that appears on more than one row of the
# loans_defaulters_delinq table. The result holds one row per such member_id.
repeated_delinq_ids_sql = """
    select member_id
    from (
        select member_id, 
               count(*) as total
        from itv017499_lending_club.loans_defaulters_delinq
        group by member_id
        having total > 1
    )
"""
bad_data_loans_defaulters_delinq_df = spark.sql(repeated_delinq_ids_sql)

In [11]:
# Number of distinct member_ids that occur more than once in the
# loans_defaulters_delinq table (one row per such member_id in this DataFrame).
bad_data_loans_defaulters_delinq_df.count()

173

#### No of repeating member_ids in loans_defaulters_detail_rec_enq dataset

In [12]:
# Collect each member_id that appears on more than one row of the
# loans_defaulters_detail_rec_enq table. The result holds one row per such
# member_id.
repeated_detail_rec_enq_ids_sql = """
    select member_id
    from (
        select member_id, 
               count(*) as total
        from itv017499_lending_club.loans_defaulters_detail_rec_enq
        group by member_id
        having total > 1
    )
"""
bad_data_loans_defaulters_detail_rec_enq_df = spark.sql(repeated_detail_rec_enq_ids_sql)

In [13]:
# Number of distinct member_ids that occur more than once in the
# loans_defaulters_detail_rec_enq table (one row per such member_id in this DataFrame).
bad_data_loans_defaulters_detail_rec_enq_df.count()

3189

### Saving the bad data in the database

In [14]:
# Persist each per-table list of repeated member_ids as CSV with a header row.
# Every DataFrame is repartitioned to 1 so it is written as a single CSV file,
# and an existing output at the target path is overwritten.
bad_data_exports = [
    (
        bad_data_customer_df,
        "/user/itv017499/lendingclubproject/bad/bad_data_customers",
    ),
    (
        bad_data_loans_defaulters_delinq_df,
        "/user/itv017499/lendingclubproject/bad/bad_data_loans_defaulters_delinq",
    ),
    (
        bad_data_loans_defaulters_detail_rec_enq_df,
        "/user/itv017499/lendingclubproject/bad/bad_data_loans_defaulters_detail_rec_enq",
    ),
]

for bad_ids_df, target_path in bad_data_exports:
    (
        bad_ids_df.repartition(1)
        .write
        .format("csv")
        .option("header", True)
        .mode("overwrite")
        .option("path", target_path)
        .save()
    )


In [15]:
# Stack the repeated member_ids found in each of the three tables into a single
# one-column DataFrame. De-duplication happens in a later step, so an id flagged
# in several tables can appear more than once here.
customer_bad_ids = bad_data_customer_df.select("member_id")
delinq_bad_ids = bad_data_loans_defaulters_delinq_df.select("member_id")
detail_rec_enq_bad_ids = bad_data_loans_defaulters_detail_rec_enq_df.select("member_id")

bad_customer_data_df = customer_bad_ids.union(delinq_bad_ids).union(detail_rec_enq_bad_ids)


In [16]:
# Collapse the combined list to one row per member_id: the same id can be
# flagged by more than one of the three source tables.
bad_customer_data_final_df = bad_customer_data_df.distinct()

# Number of unique member_ids flagged as bad across all three tables.
bad_customer_data_final_df.count()

3189

In [17]:
# Persist the de-duplicated list of bad member_ids as CSV with a header row.
# repartition(1) is used so that only one CSV file is created; an existing
# output at the target path is overwritten.
(
    bad_customer_data_final_df.repartition(1)
    .write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", "/user/itv017499/lendingclubproject/bad/bad_customer_data_final")
    .save()
)

In [18]:
# Expose the de-duplicated bad member_ids to Spark SQL as the temp view
# `bad_data_customer`, so that SQL queries can reference it by name.
bad_customer_data_final_df.createOrReplaceTempView("bad_data_customer")

#### Saving customers_df into new cleaned_new directory 

In [19]:
# Keep only the customers rows whose member_id is not in the bad_data_customer view.
# NOTE(review): with NOT IN, a NULL member_id in the subquery would filter out
# every row - confirm member_id can never be NULL in bad_data_customer.
clean_customers_sql = """
    select *
    from itv017499_lending_club.customers
    where member_id NOT IN (
        select member_id
        from bad_data_customer
    )
"""
customers_df = spark.sql(clean_customers_sql)

# Store the cleaned customers as Parquet under the "cleaned_new" directory,
# replacing any previous output at that path.
(
    customers_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv017499/lendingclubproject/raw/cleaned_new/customers_parquet")
    .save()
)

#### Saving loans_defaulters_delinq_parquet into new cleaned_new directory 

In [20]:
# Keep only the loans_defaulters_delinq rows whose member_id is not in the
# bad_data_customer view.
# NOTE(review): with NOT IN, a NULL member_id in the subquery would filter out
# every row - confirm member_id can never be NULL in bad_data_customer.
clean_delinq_sql = """
    select *
    from itv017499_lending_club.loans_defaulters_delinq
    where member_id NOT IN (
        select member_id
        from bad_data_customer
    )
"""
loans_defaulters_delinq_df = spark.sql(clean_delinq_sql)

# Store the filtered rows as Parquet under the "cleaned_new" directory,
# replacing any previous output at that path.
(
    loans_defaulters_delinq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv017499/lendingclubproject/raw/cleaned_new/loans_defaulters_delinq_parquet")
    .save()
)


#### Saving loans_defaulters_detail_rec_enq_parquet into new cleaned_new directory 

In [21]:
# Keep only the loans_defaulters_detail_rec_enq rows whose member_id is not in
# the bad_data_customer view.
# NOTE(review): with NOT IN, a NULL member_id in the subquery would filter out
# every row - confirm member_id can never be NULL in bad_data_customer.
clean_detail_rec_enq_sql = """
    select *
    from itv017499_lending_club.loans_defaulters_detail_rec_enq
    where member_id NOT IN (
        select member_id
        from bad_data_customer
    )
"""
loans_defaulters_detail_rec_enq_df = spark.sql(clean_detail_rec_enq_sql)

# Store the filtered rows as Parquet under the "cleaned_new" directory,
# replacing any previous output at that path.
(
    loans_defaulters_detail_rec_enq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv017499/lendingclubproject/raw/cleaned_new/loans_defaulters_detail_rec_enq_parquet")
    .save()
)


### Creating External tables in hive Database

In [22]:
# Register the cleaned customer data as the external Hive table `customers_new`.
# The data is stored as Parquet.
#
# LOCATION must be the directory this notebook wrote the cleaned customers
# DataFrame to (.../raw/cleaned_new/customers_parquet); pointing it anywhere
# else makes the table read files this notebook did not produce.
#
# IF NOT EXISTS keeps the cell re-runnable. If the table was already created
# with a different LOCATION, drop it first so this definition takes effect.

spark.sql("""
    create EXTERNAL TABLE IF NOT EXISTS itv017499_lending_club.customers_new (
        member_id string, 
        emp_title string, 
        emp_length int, 
        home_ownership string, 
        annual_income float, 
        address_state string, 
        address_zipcode string, 
        address_country string, 
        grade string, 
        sub_grade string, 
        verification_status string, 
        total_high_credit_limit float, 
        application_type string, 
        join_annual_income float, 
        verification_status_joint string, 
        ingest_date timestamp
    )
    stored as parquet
    LOCATION '/user/itv017499/lendingclubproject/raw/cleaned_new/customers_parquet'
""")


In [23]:
# Register the cleaned delinquency data as the external Hive table
# `loans_defaulters_delinq_new`. The data is stored as Parquet.
#
# LOCATION must be the directory this notebook wrote the filtered
# loans_defaulters_delinq DataFrame to
# (.../raw/cleaned_new/loans_defaulters_delinq_parquet); pointing it anywhere
# else makes the table read files this notebook did not produce.
#
# IF NOT EXISTS keeps the cell re-runnable. If the table was already created
# with a different LOCATION, drop it first so this definition takes effect.

spark.sql("""
    create EXTERNAL TABLE IF NOT EXISTS itv017499_lending_club.loans_defaulters_delinq_new (
        member_id string,
        delinq_2yrs integer,
        delinq_amnt float,
        mths_since_last_delinq integer
    )
    stored as parquet
    LOCATION '/user/itv017499/lendingclubproject/raw/cleaned_new/loans_defaulters_delinq_parquet'
""")


In [24]:
# Register the cleaned public-record / inquiry data as the external Hive table
# `loans_defaulters_detail_rec_enq_new`. The data is stored as Parquet.
#
# LOCATION must be the directory this notebook wrote the filtered
# loans_defaulters_detail_rec_enq DataFrame to
# (.../raw/cleaned_new/loans_defaulters_detail_rec_enq_parquet); pointing it
# anywhere else makes the table read files this notebook did not produce.
#
# IF NOT EXISTS keeps the cell re-runnable. If the table was already created
# with a different LOCATION, drop it first so this definition takes effect.

spark.sql("""
    create EXTERNAL TABLE IF NOT EXISTS itv017499_lending_club.loans_defaulters_detail_rec_enq_new (
        member_id string,
        pub_rec integer,
        pub_rec_bankruptcies integer,
        inq_last_6mths integer
    )
    stored as parquet
    LOCATION '/user/itv017499/lendingclubproject/raw/cleaned_new/loans_defaulters_detail_rec_enq_parquet'
""")


In [25]:
# Sanity check on customers_new: count the rows per member_id, highest count
# first. If the cleaning worked, the top total is 1 (no repeated member_id).

customers_new_member_id_counts_sql = """
    select member_id, 
           count(*) as total 
    from itv017499_lending_club.customers_new
    group by member_id
    order by total desc
"""
spark.sql(customers_new_member_id_counts_sql)


member_id,total
efce056fa93066e41...,1
192f2ed98cc7a791c...,1
9cd6ca691cc9ddce5...,1
e067955358950b610...,1
5be667efdb763893c...,1
a52cad3dde7f8d22d...,1
719e7d4675e8f4dc7...,1
fbfef687f8420a6eb...,1
4640aa3a3dbf45056...,1
9d1d8e7b72fa2f4ba...,1
