In [1]:
from pyspark.sql import SparkSession
 
spark=SparkSession.\
builder.\
config('spark.shuffle.useOldFetchProtocol','true').\
config('spark.ui.port','0').\
config("spark.sql.warehouse.dir","/user/itv016478/warehouse").\
enableHiveSupport().\
master('yarn').\
getOrCreate()


In [2]:
# 1.create a df with proper datatypes

In [3]:
Customers_schema= """ member_id string, emp_title string, emp_length string , home_ownership string,
annual_inc float, addr_state string, zip_code string, country string, grade string, sub_grade string, verification_status string, 
tot_hi_cred_lim float, application_type string, annual_inc_joint float, verification_status_joint string"""

In [4]:
customers_df= spark.read \
.format("csv") \
.schema(Customers_schema) \
.option("header","true") \
.load("/user/itv016478/lendingclubproject/raw/customers_data_csv")

In [5]:
customers_df


member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
4480925324607267c...,IT,3 years,RENT,92000.0,NJ,088xx,USA,E,E3,Verified,132523.0,Individual,,
b54711d4a553ea330...,District Manager,7 years,RENT,45000.0,TX,787xx,USA,C,C4,Verified,33247.0,Individual,,
db06b45a938f1a3b5...,Case Management,6 years,RENT,33000.0,NJ,070xx,USA,D,D3,Not Verified,34825.0,Individual,,
ad9d9524477e85c11...,senior financial ...,< 1 year,RENT,80000.0,NJ,079xx,USA,A,A4,Source Verified,81859.0,Individual,,
c67f6ac3fea6ef46d...,Senior Analyst H....,6 years,RENT,30000.0,FL,328xx,USA,B,B3,Source Verified,30825.0,Individual,,
bb36e2cb69517fac3...,Owner Realtor,1 year,MORTGAGE,40000.0,WI,549xx,USA,E,E3,Not Verified,186722.0,Individual,,
af69a7dff814fb213...,nurse,10+ years,MORTGAGE,45000.0,TX,791xx,USA,C,C5,Not Verified,261804.0,Individual,,
c9c794b5025e14a7d...,Assistant IT Dire...,2 years,MORTGAGE,76500.0,UT,840xx,USA,D,D5,Source Verified,67822.0,Individual,,
61b48d763bd82369a...,Network Provision...,3 years,RENT,98000.0,CA,950xx,USA,B,B2,Source Verified,100609.0,Individual,,
adc390ceaa6428ba4...,,,MORTGAGE,38964.0,MO,650xx,USA,C,C2,Source Verified,18400.0,Individual,,


In [6]:
customers_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [7]:
#2 rename few columns: Transformation (withColumnRenamed)

In [8]:
customers_df_renamed = customers_df.withColumnRenamed ("annual_inc", "annual_income") \
.withColumnRenamed ("addr_state", "address_state") \
.withColumnRenamed ("zip_code", "address_zipcode") \
.withColumnRenamed ("country", "address_country") \
.withColumnRenamed ("tot_hi_cred_lim", "total_high_credit_limit") \
.withColumnRenamed ("annual_inc_joint", "join_annual_income") \


In [9]:
customers_df_renamed


member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint
4480925324607267c...,IT,3 years,RENT,92000.0,NJ,088xx,USA,E,E3,Verified,132523.0,Individual,,
b54711d4a553ea330...,District Manager,7 years,RENT,45000.0,TX,787xx,USA,C,C4,Verified,33247.0,Individual,,
db06b45a938f1a3b5...,Case Management,6 years,RENT,33000.0,NJ,070xx,USA,D,D3,Not Verified,34825.0,Individual,,
ad9d9524477e85c11...,senior financial ...,< 1 year,RENT,80000.0,NJ,079xx,USA,A,A4,Source Verified,81859.0,Individual,,
c67f6ac3fea6ef46d...,Senior Analyst H....,6 years,RENT,30000.0,FL,328xx,USA,B,B3,Source Verified,30825.0,Individual,,
bb36e2cb69517fac3...,Owner Realtor,1 year,MORTGAGE,40000.0,WI,549xx,USA,E,E3,Not Verified,186722.0,Individual,,
af69a7dff814fb213...,nurse,10+ years,MORTGAGE,45000.0,TX,791xx,USA,C,C5,Not Verified,261804.0,Individual,,
c9c794b5025e14a7d...,Assistant IT Dire...,2 years,MORTGAGE,76500.0,UT,840xx,USA,D,D5,Source Verified,67822.0,Individual,,
61b48d763bd82369a...,Network Provision...,3 years,RENT,98000.0,CA,950xx,USA,B,B2,Source Verified,100609.0,Individual,,
adc390ceaa6428ba4...,,,MORTGAGE,38964.0,MO,650xx,USA,C,C2,Source Verified,18400.0,Individual,,


In [10]:
# 3. Insert a new col named as ingestion date (current time)

In [11]:
from pyspark.sql.functions import current_timestamp

In [12]:
customers_df_ingested= customers_df_renamed.withColumn("ingest_date", current_timestamp())

In [13]:
customers_df_ingested

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
4480925324607267c...,IT,3 years,RENT,92000.0,NJ,088xx,USA,E,E3,Verified,132523.0,Individual,,,2025-06-15 05:48:...
b54711d4a553ea330...,District Manager,7 years,RENT,45000.0,TX,787xx,USA,C,C4,Verified,33247.0,Individual,,,2025-06-15 05:48:...
db06b45a938f1a3b5...,Case Management,6 years,RENT,33000.0,NJ,070xx,USA,D,D3,Not Verified,34825.0,Individual,,,2025-06-15 05:48:...
ad9d9524477e85c11...,senior financial ...,< 1 year,RENT,80000.0,NJ,079xx,USA,A,A4,Source Verified,81859.0,Individual,,,2025-06-15 05:48:...
c67f6ac3fea6ef46d...,Senior Analyst H....,6 years,RENT,30000.0,FL,328xx,USA,B,B3,Source Verified,30825.0,Individual,,,2025-06-15 05:48:...
bb36e2cb69517fac3...,Owner Realtor,1 year,MORTGAGE,40000.0,WI,549xx,USA,E,E3,Not Verified,186722.0,Individual,,,2025-06-15 05:48:...
af69a7dff814fb213...,nurse,10+ years,MORTGAGE,45000.0,TX,791xx,USA,C,C5,Not Verified,261804.0,Individual,,,2025-06-15 05:48:...
c9c794b5025e14a7d...,Assistant IT Dire...,2 years,MORTGAGE,76500.0,UT,840xx,USA,D,D5,Source Verified,67822.0,Individual,,,2025-06-15 05:48:...
61b48d763bd82369a...,Network Provision...,3 years,RENT,98000.0,CA,950xx,USA,B,B2,Source Verified,100609.0,Individual,,,2025-06-15 05:48:...
adc390ceaa6428ba4...,,,MORTGAGE,38964.0,MO,650xx,USA,C,C2,Source Verified,18400.0,Individual,,,2025-06-15 05:48:...


In [14]:
#4 Remove duplicate rows

In [15]:
customers_df_ingested.count()

2260701

In [16]:
customers_distinct = customers_df_ingested.distinct()

In [17]:
customers_distinct.count()

2260638

In [18]:
spark.sql("CREATE DATABASE IF NOT EXISTS itv016478_lending_club")

In [19]:

customers_distinct.createOrReplaceTempView("customers")


In [20]:
spark.sql("Select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
28be5324b9e68cfc6...,Mavhinist,10+ years,RENT,80000.0,IN,462xx,USA,C,C3,Not Verified,32529.0,Individual,,,2025-06-15 05:49:...
4d42a4827a5e977cb...,GM,6 years,OWN,500000.0,FL,339xx,USA,E,E2,Source Verified,189333.0,Individual,,,2025-06-15 05:49:...
33c8e874359e3fa47...,Project Manager,6 years,MORTGAGE,72000.0,NC,276xx,USA,E,E5,Source Verified,228305.0,Individual,,,2025-06-15 05:49:...
c63a9b0abb0c46980...,Merchandise Plann...,3 years,RENT,128000.0,OH,432xx,USA,B,B2,Not Verified,94327.0,Individual,,,2025-06-15 05:49:...
9368aa4ce1b413178...,Police Officer,10+ years,MORTGAGE,73000.0,WI,532xx,USA,A,A1,Not Verified,285236.0,Individual,,,2025-06-15 05:49:...
4d37c66917542ca94...,IT program system...,8 years,MORTGAGE,67000.0,LA,703xx,USA,C,C4,Source Verified,159599.0,Individual,,,2025-06-15 05:49:...
d85e644d1d8ce1170...,,,RENT,30000.0,PA,176xx,USA,C,C2,Verified,47395.0,Joint App,88000.0,Not Verified,2025-06-15 05:49:...
d93e67e2bfffd921f...,Legal Administrat...,< 1 year,RENT,70000.0,CT,066xx,USA,D,D3,Source Verified,13112.0,Individual,,,2025-06-15 05:49:...
7d391e800f674ab35...,AccountinG aSSISTANT,4 years,OWN,80000.0,VA,221xx,USA,B,B1,Verified,68024.0,Individual,,,2025-06-15 05:49:...
4bc776619da818f37...,Vice President,4 years,RENT,192000.0,CA,910xx,USA,C,C5,Verified,43500.0,Individual,,,2025-06-15 05:49:...


In [21]:
#5 Remove rows where annual_income is null

In [22]:
spark.sql("select * from customers where annual_income is null ")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
0ee5cb81634c742f5...,,< 1 year,NONE,,NY,100xx,USA,A,A3,Not Verified,,Individual,,,2025-06-15 05:49:...
f341fdcba660ec6cd...,,< 1 year,NONE,,NY,100xx,USA,A,A2,Not Verified,,Individual,,,2025-06-15 05:49:...
4f45e479dc69d7de7...,"""Coil Winder """"B""""","reactors""",2 years,,531xx,Other,USA,B,B4,38000.0,0.0,1.0,,,2025-06-15 05:49:...
e3b0c44298fc1c149...,,,,,,,USA,,,,,,,,2025-06-15 05:49:...
08035ee25713249d8...,,< 1 year,NONE,,NY,100xx,USA,A,A5,Not Verified,,Individual,,,2025-06-15 05:49:...


In [23]:
customers_income_filtered= spark.sql ("select * from customers where annual_income is not null ")

In [24]:
#now in this customers table :no null annual income is there (removed)
customers_income_filtered.createOrReplaceTempView("customers")

In [25]:
#6) convert emp_length to integer

In [26]:
spark.sql("select distinct(emp_length)from customers").show()

+----------+
|emp_length|
+----------+
|   5 years|
|   9 years|
|      null|
|    1 year|
|   2 years|
|   7 years|
|   8 years|
|   4 years|
|   6 years|
|   3 years|
| 10+ years|
|  < 1 year|
+----------+



In [27]:
from pyspark.sql.functions import regexp_replace, col

In [28]:
#from employment length ,the one with are non digit (yearsmeans string , + < means operator) convert this into blank .
#Only digits with be there in emp_length column. No operator and strings (\D) : Not digit
customers_emplength_cleaned= customers_income_filtered.withColumn("emp_length",regexp_replace(col("emp_length"),"(\D)"," "))

In [29]:
customers_emplength_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
ee8b186f64073ac1d...,Owner,6,RENT,75000.0,VA,221xx,USA,E,E2,Verified,50487.0,Individual,,,2025-06-15 05:49:...
d8ffb8c15cc54d5a2...,Press Operator,10,MORTGAGE,46000.0,PA,150xx,USA,C,C4,Source Verified,85526.0,Individual,,,2025-06-15 05:49:...
f99beed35272751e5...,Director Product ...,10,RENT,185000.0,CA,948xx,USA,D,D5,Source Verified,56259.0,Individual,,,2025-06-15 05:49:...
43f521e9e0fddf9a5...,General Manager,2,MORTGAGE,110000.0,MA,027xx,USA,E,E1,Verified,281999.0,Individual,,,2025-06-15 05:49:...
b5e3d6f653305d9e9...,Senior Customer A...,10,MORTGAGE,46000.0,TN,381xx,USA,E,E3,Verified,132354.0,Individual,,,2025-06-15 05:49:...
0a2879c1d431834a0...,Installation Manager,1,RENT,36000.0,NC,275xx,USA,C,C3,Not Verified,112215.0,Individual,,,2025-06-15 05:49:...
5572115fdb0f31b57...,Optical Laborator...,3,MORTGAGE,39000.0,WV,251xx,USA,B,B3,Verified,134609.0,Individual,,,2025-06-15 05:49:...
1a175f9a5d90d9f47...,Office Manager,7,MORTGAGE,65000.0,TX,787xx,USA,A,A5,Source Verified,163000.0,Individual,,,2025-06-15 05:49:...
b3e32a1b4d28528e1...,MLT,10,MORTGAGE,60000.0,MN,551xx,USA,B,B4,Source Verified,292284.0,Individual,,,2025-06-15 05:49:...
b06bb412de6752eb6...,Material Coordinator,8,RENT,60000.0,CA,919xx,USA,B,B3,Source Verified,55400.0,Individual,,,2025-06-15 05:49:...


In [30]:
#null is not blank that means the emp_length is string..convert emp_length in integer
#that means "emp_length" column I want to change into this -> df.col name.cast("int") --> customers_emplength_cleaned.emp_length.cast('int')
customers_emplength_casted = customers_emplength_cleaned.withColumn("emp_length",customers_emplength_cleaned.emp_length.cast('int'))

In [31]:
customers_emplength_casted 

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
394a6587cc754a68b...,Talent Acquisitio...,1.0,RENT,52500.0,OR,972xx,USA,B,B3,Not Verified,26372.0,Individual,,,2025-06-15 05:49:...
42e42c823dd2a8709...,,,RENT,13000.0,AL,363xx,USA,A,A5,Not Verified,10700.0,Individual,,,2025-06-15 05:49:...
946c86b94a3fec25c...,Captain/Pilot,10.0,MORTGAGE,330000.0,MN,553xx,USA,C,C5,Source Verified,675903.0,Individual,,,2025-06-15 05:49:...
61a7fe7c6bf52fedf...,detailer,1.0,RENT,35000.0,FL,334xx,USA,A,A2,Not Verified,29600.0,Individual,,,2025-06-15 05:49:...
b7b4ca2a802c5d316...,political Organizer,10.0,OWN,140000.0,IL,606xx,USA,C,C1,Not Verified,189111.0,Individual,,,2025-06-15 05:49:...
5ab72d333f7a9dd46...,Accountant,4.0,MORTGAGE,73000.0,CA,925xx,USA,B,B4,Source Verified,336568.0,Individual,,,2025-06-15 05:49:...
1ce8ead1d7f1d0d1e...,,,OWN,11172.0,CA,919xx,USA,E,E3,Verified,7000.0,Individual,,,2025-06-15 05:49:...
90f35f86dd9f8a994...,,,RENT,21000.0,NY,100xx,USA,B,B2,Verified,17500.0,Individual,,,2025-06-15 05:49:...
aab66f8a900dffc00...,Courier,2.0,RENT,25000.0,IN,471xx,USA,A,A1,Source Verified,11000.0,Individual,,,2025-06-15 05:49:...
d0b81045762feeef4...,Production,10.0,MORTGAGE,50000.0,MI,492xx,USA,C,C4,Verified,17968.0,Joint App,99000.0,Source Verified,2025-06-15 05:49:...


In [32]:
#Emp_length from string to integer now
customers_emplength_casted.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [33]:
#7) Replace all nulls in emp_length column with average emp_length.
customers_emplength_casted.filter("emp_length is null").count()

146903

In [34]:
customers_emplength_casted.createOrReplaceTempView("customers")

In [35]:
# number : 6.98735 , Floor: 6 (only integer value)
customers_avg_empLength= spark.sql ("select floor(avg(emp_length)) as avg_emp_length from customers").collect

In [36]:
print(customers_avg_empLength)

<bound method DataFrame.collect of +--------------+
|avg_emp_length|
+--------------+
|             6|
+--------------+
>


In [37]:
from pyspark.sql.functions import avg

In [38]:
#select(avg(...)).first()[0] extracts the actual float value from the Row.
avg_emp_length = customers_emplength_casted.select(avg("emp_length")).first()[0]

In [39]:
# Fill nulls in emp_length with the average value
customers_filled = customers_emplength_casted.na.fill({"emp_length": avg_emp_length})

In [40]:
customers_filled


member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
2fbdc121a9c31c1f0...,Welder fab,5,MORTGAGE,60000.0,CO,800xx,USA,A,A5,Not Verified,198136.0,Individual,,,2025-06-15 05:49:...
602732bf4c990553c...,Network Analyst,10,OWN,52000.0,TX,782xx,USA,E,E3,Source Verified,192217.0,Individual,,,2025-06-15 05:49:...
16aceed7605917cca...,store manager,10,MORTGAGE,90000.0,MI,481xx,USA,B,B5,Source Verified,334615.0,Individual,,,2025-06-15 05:49:...
dc3d8d91a565bb124...,Finance,5,MORTGAGE,91800.0,OH,441xx,USA,C,C4,Source Verified,174339.0,Individual,,,2025-06-15 05:49:...
44da1d03fc8ced52b...,Bus driver,8,MORTGAGE,35000.0,CA,923xx,USA,E,E5,Source Verified,49940.0,Individual,,,2025-06-15 05:49:...
f87bfd3b323950fd2...,Sales,10,MORTGAGE,55000.0,LA,711xx,USA,C,C1,Verified,34472.0,Individual,,,2025-06-15 05:49:...
afc683064c69e7f93...,PIT LOADER,10,RENT,41000.0,VA,226xx,USA,C,C3,Source Verified,17600.0,Individual,,,2025-06-15 05:49:...
a76e866af2b7a1968...,Technical Support...,7,RENT,60000.0,PA,191xx,USA,C,C4,Source Verified,51389.0,Individual,,,2025-06-15 05:49:...
13dc37828b371e1c9...,Logistics Manager,3,RENT,50000.0,UT,841xx,USA,C,C1,Verified,46221.0,Individual,,,2025-06-15 05:49:...
8fb892bccfd1f1e3f...,Operations Manager,10,RENT,65000.0,WA,980xx,USA,B,B1,Not Verified,42634.0,Individual,,,2025-06-15 05:49:...


In [41]:
#to check nulls
customers_filled.filter("emp_length is null").count()

0

In [42]:
#8) Address_state should be 2 characters only and replace all others with NA


In [43]:
#Making new customers table
spark.sql("CREATE DATABASE IF NOT EXISTS  itv016478_lending_club")
customers_filled.createOrReplaceTempView("customers")
spark.sql("select distinct (address_state)from customers")

address_state
Helping Kenya's D...
223xx
175 (total projec...
SC
AZ
"so Plan """"C"""" is ..."
I am 56 yrs. old ...
financially I mad...
but no one will l...
LA


In [44]:
#checking for how many rows ,data is messed up (containing too many characters)
spark.sql("select count(address_state)from customers where length(address_state)>2")

count(address_state)
254


In [45]:
from pyspark.sql.functions import when,col,length

In [46]:


customers_state_cleaned = customers_filled.withColumn(
    "address_state",
    when(length(col("address_state")) > 2, "NA").otherwise(col("address_state"))
)

In [47]:
customers_state_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
5393aa6d8d795d3ca...,"MTA, Inc.",3,MORTGAGE,80000.0,AL,357xx,USA,C,C1,Not Verified,73836.0,Individual,,,2025-06-15 05:50:...
f22af4408f090e2db...,State of California,10,MORTGAGE,120000.0,CA,921xx,USA,E,E4,Verified,59850.0,Individual,,,2025-06-15 05:50:...
a97f63e95936e8cac...,Spansion LLC,10,RENT,175000.0,TX,786xx,USA,B,B4,Verified,67002.0,Individual,,,2025-06-15 05:50:...
a74b1e59d5b0ba953...,Kforce Inc,2,RENT,100000.0,NY,113xx,USA,B,B4,Verified,47878.0,Individual,,,2025-06-15 05:50:...
f8f853b0905669c3f...,,6,MORTGAGE,59530.0,IN,462xx,USA,C,C4,Verified,118136.0,Individual,,,2025-06-15 05:50:...
9073b37d7cf8c2061...,DHS State of Oregon,10,MORTGAGE,65000.0,OR,973xx,USA,B,B4,Verified,187690.0,Individual,,,2025-06-15 05:50:...
938c8dfc214e16b73...,Calfrac Well Serv...,1,OWN,110000.0,GA,305xx,USA,B,B4,Source Verified,36120.0,Individual,,,2025-06-15 05:50:...
d5b8590d7bc963a32...,Hills Materials C...,8,OWN,34000.0,SD,577xx,USA,C,C3,Not Verified,19553.0,Individual,,,2025-06-15 05:50:...
effbac92deadac6d9...,South Carolina El...,4,MORTGAGE,67000.0,SC,290xx,USA,B,B2,Verified,257450.0,Individual,,,2025-06-15 05:50:...
032ee832a8e7ca37c...,JPMorgan Chase,5,RENT,107250.0,WA,980xx,USA,A,A4,Not Verified,116548.0,Individual,,,2025-06-15 05:50:...


In [48]:
customers_state_distinct = customers_state_cleaned.select ("address_state").distinct()

In [49]:
customers_state_distinct

address_state
SC
AZ
LA
MN
NJ
DC
OR
""
VA
""


In [50]:
# 9 ) writing the cleaned customers data into  "cleaned" folder in hdfs (in parquet)
customers_state_cleaned.write \
.format ("parquet") \
.mode("overwrite") \
.option("path","/user/itv016478/lendingclubproject/cleaned/customers_parquet/") \
.save()

In [51]:
 # writing the cleaned customers data into  "cleaned" folder in hdfs (in csv if required)
customers_state_cleaned.write \
.format ("csv") \
.option("header",True) \
.mode("overwrite") \
.option("path","/user/itv016478/lendingclubproject/cleaned/customers_csv/") \
.save()