In [1]:
from pyspark.sql import SparkSession
 
# Build (or reuse) a Hive-enabled SparkSession that runs against YARN.
spark = (
    SparkSession.builder
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", "/user/itv016478/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# First pass over the raw repayments CSV: header row present, column types inferred by Spark.
loans_repay_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/user/itv016478/lendingclubproject/raw/loans_repayments_csv")
)

In [3]:
# Inspect the schema Spark inferred for the raw load.
loans_repay_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_rec_prncp: string (nullable = true)
 |-- total_rec_int: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- total_pymnt: string (nullable = true)
 |-- last_pymnt_amnt: string (nullable = true)
 |-- last_pymnt_d: string (nullable = true)
 |-- next_pymnt_d: string (nullable = true)



In [4]:
# Explicit schema string for the repayments CSV: renames the raw columns to
# descriptive names and types the amount columns as float.
loans_repay_schema = (
    "loan_id string, "
    "total_principal_received float, "
    "total_interest_received float,"
    "total_late_fee_received float, "
    "total_payment_received float, "
    "last_payment_amount float, "
    "last_payment_date string, "
    "next_payment_date string"
)

In [5]:
# Display the schema string for a quick visual check.
loans_repay_schema

'loan_id string, total_principal_received float, total_interest_received float,total_late_fee_received float, total_payment_received float, last_payment_amount float, last_payment_date string, next_payment_date string'

In [6]:
# Re-read the same CSV, this time applying the explicit schema instead of inferring types.
loans_repay_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(loans_repay_schema)
    .load("/user/itv016478/lendingclubproject/raw/loans_repayments_csv")
)

In [7]:
# Confirm the explicit schema (renamed columns, float amounts) was applied.
loans_repay_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)



In [8]:
from pyspark.sql.functions import current_timestamp

In [9]:
# Add an ingest_date column holding the current timestamp.
loans_repay_ingest = loans_repay_df.withColumn(
    "ingest_date",
    current_timestamp(),
)

In [10]:
# Preview the DataFrame with the new ingest_date column.
loans_repay_ingest

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
76003861,6423.8,5607.85,0.0,12031.65,574.88,Jan-2018,,2025-06-15 05:44:...
76263914,2400.0,323.18,0.0,2723.1802,905.18,Dec-2017,,2025-06-15 05:44:...
75537401,12600.0,1392.76,0.0,13992.757,3437.74,Aug-2018,,2025-06-15 05:44:...
75038986,7805.03,7583.71,0.0,15388.74,440.72,Mar-2019,Apr-2019,2025-06-15 05:44:...
76301424,2614.78,1080.09,0.0,3694.87,153.89,Apr-2018,,2025-06-15 05:44:...
75333198,8664.09,3431.32,68.8,12164.21,343.9,Mar-2019,Apr-2019,2025-06-15 05:44:...
76391453,35000.0,10266.01,0.0,45266.008,18884.35,Feb-2019,,2025-06-15 05:44:...
76363364,15000.0,2093.11,0.0,17093.107,4648.59,Aug-2018,,2025-06-15 05:44:...
76272510,14463.21,11178.79,110.13,25752.13,734.18,Mar-2019,Apr-2019,2025-06-15 05:44:...
76304116,4800.0,1051.65,0.0,5851.6523,1761.59,Dec-2017,,2025-06-15 05:44:...


In [11]:
# Register the DataFrame as temp view "loans_repayment" so it can be queried with Spark SQL.
loans_repay_ingest.createOrReplaceTempView("loans_repayment")

In [12]:
# Preview the rows exposed through the "loans_repayment" view.
spark.sql("select * from loans_repayment ")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
76003861,6423.8,5607.85,0.0,12031.65,574.88,Jan-2018,,2025-06-15 05:44:...
76263914,2400.0,323.18,0.0,2723.1802,905.18,Dec-2017,,2025-06-15 05:44:...
75537401,12600.0,1392.76,0.0,13992.757,3437.74,Aug-2018,,2025-06-15 05:44:...
75038986,7805.03,7583.71,0.0,15388.74,440.72,Mar-2019,Apr-2019,2025-06-15 05:44:...
76301424,2614.78,1080.09,0.0,3694.87,153.89,Apr-2018,,2025-06-15 05:44:...
75333198,8664.09,3431.32,68.8,12164.21,343.9,Mar-2019,Apr-2019,2025-06-15 05:44:...
76391453,35000.0,10266.01,0.0,45266.008,18884.35,Feb-2019,,2025-06-15 05:44:...
76363364,15000.0,2093.11,0.0,17093.107,4648.59,Aug-2018,,2025-06-15 05:44:...
76272510,14463.21,11178.79,110.13,25752.13,734.18,Mar-2019,Apr-2019,2025-06-15 05:44:...
76304116,4800.0,1051.65,0.0,5851.6523,1761.59,Dec-2017,,2025-06-15 05:44:...


In [13]:
# Total row count before any cleaning.
spark.sql("select count(*) from loans_repayment ")

count(1)
2260701


In [14]:
# 1) Count rows where "total_principal_received" is null; rows with null amounts are dropped in the following cells.
spark.sql("select count(*) from loans_repayment where total_principal_received is null ")

count(1)
69


In [15]:
# Amount columns that must all be non-null for a row to survive the null drop.
columns_to_check = [
    "total_principal_received",
    "total_interest_received",
    "total_late_fee_received",
    "total_payment_received",
    "last_payment_amount",
]

In [16]:
# Drop every row that has a null in any of the amount columns listed in columns_to_check.
loans_repay_filtered_df = loans_repay_ingest.dropna(subset=columns_to_check)

In [17]:
# Row count after dropping rows with null amounts.
loans_repay_filtered_df.count()

2260498

In [18]:
# 2) Inspect rows where "total_payment_received" is 0.0.
# The "loans_repayment" view is re-registered here on the null-filtered DataFrame,
# so every later query against this view sees the filtered rows.
loans_repay_filtered_df.createOrReplaceTempView("loans_repayment")
spark.sql ("select * from loans_repayment where total_payment_received=0.0")


loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
75427271,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
141017398,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140973407,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140618570,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140999435,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140946099,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140894966,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140759457,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
139697300,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140817236,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...


In [19]:
# Count the rows where "total_payment_received" is 0.0.
spark.sql ("select count(*) from loans_repayment where total_payment_received=0.0")

count(1)
995


In [20]:
# 3) Inspect rows where "total_payment_received" is 0.0 but "total_principal_received" is not equal to 0.
# NOTE(review): in the sample output these rows look column-shifted (last_payment_date shows 0.0) — confirm against the raw file.
spark.sql ("select * from loans_repayment where total_payment_received=0.0 and total_principal_received!=0.0" )

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
1064185,11600.98,11600.98,10000.0,0.0,0.0,0.0,Dec-2014,2025-06-15 05:44:...
516382,21890.229,21856.03,16000.0,0.0,0.0,0.0,Mar-2014,2025-06-15 05:44:...
528899,3045.0364,3019.64,2500.0,0.0,0.0,0.0,Jan-2013,2025-06-15 05:44:...
527598,2398.9092,2220.51,2200.0,0.0,0.0,0.0,Jul-2011,2025-06-15 05:44:...
525697,21797.86,19894.9,15750.0,0.0,0.0,0.0,Jun-2015,2025-06-15 05:44:...
522641,3146.8193,3146.82,3000.0,0.0,0.0,0.0,Sep-2011,2025-06-15 05:44:...
515655,29938.576,29905.75,22800.0,0.0,0.0,0.0,May-2013,2025-06-15 05:44:...
501234,15219.313,15155.9,12000.0,0.0,0.0,0.0,May-2013,2025-06-15 05:44:...
498194,11642.714,11031.47,10000.0,0.0,0.0,0.0,Jan-2013,2025-06-15 05:44:...
495171,11138.843,10024.96,10000.0,0.0,0.0,0.0,Apr-2013,2025-06-15 05:44:...


In [21]:
# Count the rows where "total_payment_received" is 0.0 but "total_principal_received" is not equal to 0.
spark.sql ("select count(*) from loans_repayment where total_payment_received=0.0 and total_principal_received!=0.0" )

count(1)
46


In [22]:
from pyspark.sql.functions import when,col

In [23]:
# 4) Repair rows whose "total_payment_received" is 0.0 although
#    "total_principal_received" is not 0: rebuild the total as
#    principal + interest + late fee. All other rows keep their existing total.
principal_received = col("total_principal_received")
payment_received = col("total_payment_received")

# Both conditions must hold for a row to be recomputed.
total_is_missing = (principal_received != 0.0) & (payment_received == 0.0)
recomputed_total = (
    principal_received
    + col("total_interest_received")
    + col("total_late_fee_received")
)

loans_payments_fixed_df = loans_repay_filtered_df.withColumn(
    "total_payment_received",
    when(total_is_missing, recomputed_total).otherwise(payment_received),
)

In [24]:
# Preview the DataFrame after recomputing the missing totals.
loans_payments_fixed_df

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
76003861,6423.8,5607.85,0.0,12031.65,574.88,Jan-2018,,2025-06-15 05:44:...
76263914,2400.0,323.18,0.0,2723.1802,905.18,Dec-2017,,2025-06-15 05:44:...
75537401,12600.0,1392.76,0.0,13992.757,3437.74,Aug-2018,,2025-06-15 05:44:...
75038986,7805.03,7583.71,0.0,15388.74,440.72,Mar-2019,Apr-2019,2025-06-15 05:44:...
76301424,2614.78,1080.09,0.0,3694.87,153.89,Apr-2018,,2025-06-15 05:44:...
75333198,8664.09,3431.32,68.8,12164.21,343.9,Mar-2019,Apr-2019,2025-06-15 05:44:...
76391453,35000.0,10266.01,0.0,45266.008,18884.35,Feb-2019,,2025-06-15 05:44:...
76363364,15000.0,2093.11,0.0,17093.107,4648.59,Aug-2018,,2025-06-15 05:44:...
76272510,14463.21,11178.79,110.13,25752.13,734.18,Mar-2019,Apr-2019,2025-06-15 05:44:...
76304116,4800.0,1051.65,0.0,5851.6523,1761.59,Dec-2017,,2025-06-15 05:44:...


In [25]:
# 5) Verify step 4 on a known-bad row: loan_id 1064185 had total_payment_received = 0.0 in step 3.
loans_payments_fixed_df.filter("loan_id == '1064185'")
# After the fix, total_payment_received for loan_id 1064185 is 33201.96.

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
1064185,11600.98,11600.98,10000.0,33201.96,0.0,0.0,Dec-2014,2025-06-15 05:44:...


In [26]:
# 6) Some rows still have "total_payment_received" = 0.0 because all of their input amounts are 0.
loans_payments_fixed_df.filter("total_payment_received==0.0")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
75427271,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
141017398,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140973407,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140618570,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140999435,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140946099,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140894966,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140759457,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
139697300,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...
140817236,0.0,0.0,0.0,0.0,0.0,,,2025-06-15 05:44:...


In [27]:
# Drop the remaining rows whose total_payment_received is 0.0 and keep the rest.
loans_payments_final_df=loans_payments_fixed_df.filter("total_payment_received!=0.0")
loans_payments_final_df

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
76003861,6423.8,5607.85,0.0,12031.65,574.88,Jan-2018,,2025-06-15 05:44:...
76263914,2400.0,323.18,0.0,2723.1802,905.18,Dec-2017,,2025-06-15 05:44:...
75537401,12600.0,1392.76,0.0,13992.757,3437.74,Aug-2018,,2025-06-15 05:44:...
75038986,7805.03,7583.71,0.0,15388.74,440.72,Mar-2019,Apr-2019,2025-06-15 05:44:...
76301424,2614.78,1080.09,0.0,3694.87,153.89,Apr-2018,,2025-06-15 05:44:...
75333198,8664.09,3431.32,68.8,12164.21,343.9,Mar-2019,Apr-2019,2025-06-15 05:44:...
76391453,35000.0,10266.01,0.0,45266.008,18884.35,Feb-2019,,2025-06-15 05:44:...
76363364,15000.0,2093.11,0.0,17093.107,4648.59,Aug-2018,,2025-06-15 05:44:...
76272510,14463.21,11178.79,110.13,25752.13,734.18,Mar-2019,Apr-2019,2025-06-15 05:44:...
76304116,4800.0,1051.65,0.0,5851.6523,1761.59,Dec-2017,,2025-06-15 05:44:...


In [28]:
# 7) Count rows where last_payment_date == 0.0 (invalid: the column should hold a date or null).
loans_payments_final_df.filter("last_payment_date==0.0").count()

48

In [29]:
# 7) Same check as the DataFrame filter, expressed in Spark SQL via a temp view named "loans".
loans_payments_final_df.createOrReplaceTempView("loans")
spark.sql("Select count(*)from loans where last_payment_date==0.0 ")

count(1)
48


In [30]:
# 8) Count rows where next_payment_date == 0.0 (invalid: the column should hold a date or null).
loans_payments_final_df.filter("next_payment_date==0.0").count()

24

In [31]:
# Fix for check 7: replace the invalid 0.0 values in last_payment_date with null;
# every other value is kept unchanged.
last_date_is_zero = col("last_payment_date") == 0.0
cleaned_last_date = when(last_date_is_zero, None).otherwise(col("last_payment_date"))

loans_payments_ldate_df = loans_payments_final_df.withColumn(
    "last_payment_date",
    cleaned_last_date,
)
     



In [32]:
# Confirm no 0.0 values remain in last_payment_date.
loans_payments_ldate_df.filter("last_payment_date==0.0 ").count()

0

In [33]:
# Fix for check 8: replace the invalid 0.0 values in next_payment_date with null;
# every other value is kept unchanged.
# Build on loans_payments_ldate_df (not loans_payments_final_df) so the
# last_payment_date cleanup is carried into this DataFrame, which is the one
# written out as the final result. Starting from loans_payments_final_df
# silently discarded that cleanup.
loans_payments_ndate_df = loans_payments_ldate_df.withColumn(
    "next_payment_date",
    when(col("next_payment_date") == 0.0, None)
    .otherwise(col("next_payment_date")),
)
     

In [34]:
# Confirm no 0.0 values remain in next_payment_date.
loans_payments_ndate_df.filter("next_payment_date==0.0 ").count()

0

In [35]:
# 9) Write the final DataFrame (loans_payments_ndate_df) as CSV with a header row,
#    overwriting whatever is already at the target path.
(
    loans_payments_ndate_df.write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", "/user/itv016478/lendingclubproject/cleaned/loans_repayments_csv/")
    .save()
)

In [36]:
# 9) Write the final DataFrame (loans_payments_ndate_df) a second time, as Parquet,
#    overwriting whatever is already at the target path.
#    The target is the "loans_repayments_parquet" directory, so the writer must use
#    the parquet format; it previously wrote header-less CSV files into that folder.
(
    loans_payments_ndate_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv016478/lendingclubproject/cleaned/loans_repayments_parquet/")
    .save()
)