# Question 2: Classification – Home Loan Approval (1.0 mark)

##  Ex1: College

Use loan_sanction_train.csv dataset (in folder HomeLoan) to build a model to predict "Whether a customer will be approved for a loan or not" (Inputs: select suitable features, Output: Loan_Status) 
Then, make new prediction for the new customers in loan_sanction_test.csv (in folder HomeLoan):
- Will they be approved for a loan or not?
Read more information here:
https://www.kaggle.com/datasets/rishikeshkonapure/home-loan-approval/data

First thing to do is start a Spark Session

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('homeloan').getOrCreate()
spark

In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

In [5]:
# Load training data
data = spark.read.csv('HomeLoan/loan_sanction_train.csv',inferSchema=True,header=True)

In [6]:
data.count()

614

In [7]:
data.printSchema()

root
 |-- Loan_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- Loan_Amount_Term: integer (nullable = true)
 |-- Credit_History: integer (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Loan_Status: string (nullable = true)



In [8]:
data.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0|    Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y

In [9]:
data.head()

Row(Loan_ID='LP001002', Gender='Male', Married='No', Dependents='0', Education='Graduate', Self_Employed='No', ApplicantIncome=5849, CoapplicantIncome=0.0, LoanAmount=None, Loan_Amount_Term=360, Credit_History=1, Property_Area='Urban', Loan_Status='Y')

In [10]:
for item in data.head():
    print(item)

LP001002
Male
No
0
Graduate
No
5849
0.0
None
360
1
Urban
Y


In [11]:
selected_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 
                     'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 
                     'Loan_Amount_Term', 'Credit_History', 'Property_Area']

df_train = data.select(selected_features + ['Loan_Status'])

In [12]:
#3. Kiểm tra dữ liệu NaN, null
df_train.select([count(when(isnan(c), c)).alias(c) for c in df_train.columns]).toPandas().T

Unnamed: 0,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0


In [13]:
df_train.select([count(when(col(c).isNull(), c)).alias(c) for c in df_train.columns]).toPandas().T

Unnamed: 0,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14
Credit_History,50


In [14]:
df_train = df_train.na.drop()

In [15]:
df_train.select([count(when(col(c).isNull(), c)).alias(c) for c in df_train.columns]).toPandas().T

Unnamed: 0,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0


In [16]:
# StringIndexer for categorical variables
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df_train) 
            for column in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']]

pipeline = Pipeline(stages=indexers)
df_train = pipeline.fit(df_train).transform(df_train)

In [17]:
# OneHotEncoder for categorical variables
encoder = OneHotEncoder(inputCols=['Gender_index', 'Married_index', 'Education_index', 
                                   'Self_Employed_index', 'Property_Area_index'], 
                        outputCols=['Gender_encoded', 'Married_encoded', 'Education_encoded', 
                                    'Self_Employed_encoded', 'Property_Area_encoded'])

df_train = encoder.fit(df_train).transform(df_train)

In [18]:
# Create a vector of features
assembler = VectorAssembler(inputCols=['Gender_encoded', 'Married_encoded', 'Education_encoded', 
                                       'Self_Employed_encoded', 'Property_Area_encoded', 
                                       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 
                                       'Loan_Amount_Term', 'Credit_History'], 
                            outputCol="features_unscaled")

In [19]:
df_train = assembler.transform(df_train)

In [20]:
# StandardScaler for numerical features
scaler = StandardScaler(inputCol="features_unscaled", outputCol="features", withStd=True, withMean=True)
scalerModel = scaler.fit(df_train)
df_train = scalerModel.transform(df_train)

In [21]:
# Select the features and the label
df_train = df_train.select("features", "Loan_Status_index")

In [22]:
df_train.show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|features                                                                                                                                                                                                                |Loan_Status_index|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|[0.4667112318580628,0.7363940924385185,0.502728629429132,0.3988585767552299,-0.8121100785608534,-0.6734972047814934,-0.13782579766678635,-0.02792290913385349,-0.2078722925821958,0.2752543965980117,0.4127662973889253]|1.0              |
|[0.4667112318580628,0.7363940924385185,0.5027286294

In [23]:
train_data, test_data = df_train.randomSplit([0.8, 0.2], seed=42)

## Spark Formatting of Data

In [24]:
train_data.describe().show()

+-------+-------------------+
|summary|  Loan_Status_index|
+-------+-------------------+
|  count|                409|
|   mean|0.31295843520782396|
| stddev| 0.4642654969944527|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [25]:
test_data.describe().show()

+-------+-------------------+
|summary|  Loan_Status_index|
+-------+-------------------+
|  count|                 71|
|   mean|0.28169014084507044|
| stddev|0.45302471050703175|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



## The Classifiers

In [26]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

Create all three models:

In [27]:
# Use mostly defaults to make this comparison "fair"
dtc = DecisionTreeClassifier(labelCol='Loan_Status_index',featuresCol='features')
rfc = RandomForestClassifier(labelCol='Loan_Status_index',featuresCol='features')

Train all three models:

In [28]:
# Train the models (its three models, so it might take some time)
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)

## Model Comparison

Let's compare each of these models!

In [29]:
dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)

### Evaluation Metrics:

In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [31]:
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol="Loan_Status_index", 
                                                  predictionCol="prediction", 
                                                  metricName="accuracy")

In [32]:
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)

In [33]:
print("Results:")
print('-'*80)
print('A single decision tree - accuracy: {0:2.2f}%'.format(dtc_acc*100))
print('-'*80)
print('A random forest ensemble - accuracy: {0:2.2f}%'.format(rfc_acc*100))

Results:
--------------------------------------------------------------------------------
A single decision tree - accuracy: 74.65%
--------------------------------------------------------------------------------
A random forest ensemble - accuracy: 81.69%


In [34]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator(labelCol="Loan_Status_index")


In [35]:
dtc_acc_2 = evaluator.evaluate(dtc_predictions)
rfc_acc_2 = evaluator.evaluate(rfc_predictions)

In [36]:
print("Results")
print('-'*60)
print('A single decision tree has an AUC of: {0:2.2f}%'.format(dtc_acc_2*100))
print('-'*60)
print('A random forest ensemble has an AUC of: {0:2.2f}%'.format(rfc_acc_2*100))

Results
------------------------------------------------------------
A single decision tree has an AUC of: 34.36%
------------------------------------------------------------
A random forest ensemble has an AUC of: 79.22%


| **Mô hình**                | **Độ chính xác (%)** | **AUC (%)** | **Ưu điểm**                                                                                      | **Nhược điểm**                                               |
|---------------------------|-----------------------|--------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------|
| Decision Tree Classifier  | 74.65                 | 34.36        | - Dễ hiểu, dễ giải thích<br>- Tính ổn định                                                        | - Dễ bị overfitting<br>- Độ chính xác thấp                   |
| Random Forest Classifier  | 81.69%                 | 82.25        | - Độ chính xác cao<br>- AUC cao<br>- Giảm overfitting<br>- Tính ổn định<br>- Dễ tinh chỉnh      | - Tốn nhiều thời gian huấn luyện hơn so với Decision Tree   |



- Random Forest Classifier có độ chính xác và AUC cao hơn so với Decision Tree.
- Random Forest có khả năng giảm overfitting và tính ổn định cao, đồng thời dễ dàng để tinh chỉnh.

=> Do đó, chọn Random Forest Classifier làm mô hình cuối cùng để dự đoán liệu một khách hàng có được duyệt cho vay hay không.

In [37]:
test = spark.read.csv("HomeLoan/loan_sanction_test.csv", header=True, inferSchema=True)

In [38]:
df_test = spark.read.csv("HomeLoan/loan_sanction_test.csv", header=True, inferSchema=True)

In [39]:
df_test.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+
|LP001015|  Male|    Yes|         0|    Graduate|           No|           5720|                0|       110|             360|             1|        Urban|
|LP001022|  Male|    Yes|         1|    Graduate|           No|           3076|             1500|       126|             360|             1|        Urban|
|LP001031|  Male|    Yes|         2|    Graduate|           No|           5000|             1800|       208|             360|             1|        Urban|
|LP001035|  Male|    Yes|         2|    Graduate|           No|       

In [40]:
df_test = df_test.na.drop()

In [41]:
df_test = df_test.select(selected_features)

In [42]:
df_test = pipeline.fit(df_test).transform(df_test)

In [43]:
df_test = encoder.fit(df_test).transform(df_test)

In [44]:
df_test = assembler.transform(df_test)

In [45]:
df_test = scalerModel.transform(df_test)

In [46]:
df_test.show(5)

+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+------------+-------------+---------------+-------------------+-------------------+--------------+---------------+-----------------+---------------------+---------------------+--------------------+--------------------+
|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Gender_index|Married_index|Education_index|Self_Employed_index|Property_Area_index|Gender_encoded|Married_encoded|Education_encoded|Self_Employed_encoded|Property_Area_encoded|   features_unscaled|            features|
+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+------------+-------------+---------------+-------------------+-------------------+--------------+---------------+-------------

In [47]:
# Make predictions
rfc_predictions_new_customers = rfc_model.transform(df_test)

In [48]:
print("Random Forest Classifier Predictions:")
rfc_predictions_new_customers.select("features", "prediction").show()

Random Forest Classifier Predictions:
+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[-2.1381886668846...|       0.0|
|[0.46671123185806...|       1.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[-2.1381886668846...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       0.0|
|[-2.1381886668846...|       0.0|
|[-2.1381886668846...|       0.0|
|[0.46671123185806...|       0.0|
|[0.46671123185806...|       1.0|
+--------------------+----------+
only showing top 20 rows



In [49]:
df_test_with_predictions = test.join(rfc_predictions_new_customers.select("prediction"), how='inner')

# Show predictions
df_test_with_predictions.show()

+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+----------+
| Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|prediction|
+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+----------+
|LP001015|  Male|    Yes|         0| Graduate|           No|           5720|                0|       110|             360|             1|        Urban|       0.0|
|LP001015|  Male|    Yes|         0| Graduate|           No|           5720|                0|       110|             360|             1|        Urban|       0.0|
|LP001015|  Male|    Yes|         0| Graduate|           No|           5720|                0|       110|             360|             1|        Urban|       0.0|
|LP001015|  Male|    Y

In [50]:
df_test_with_predictions.toPandas().to_csv('test_predictions.csv', header=True, index=False)