In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('PredictDefaulter').getOrCreate()

## Read Data. 
Reading the data that is cleaned earlier.

In [2]:
from pyspark.sql.types import *

In [3]:
data = spark.read.csv('Data/CleanedDataOutput', header = True, inferSchema = True)

In [4]:
data.printSchema()

root
 |-- loan_amnt: double (nullable = true)
 |-- funded_amnt: double (nullable = true)
 |-- funded_amnt_inv: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: double (nullable = true)
 |-- delinq_2yrs: integer (nullable = true)
 |-- fico_range_low: double (nullable = true)
 |-- fico_range_high: double (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- revol_bal: double (nullable = true)
 |-- revol_util: double (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- total_pymnt: double (nullable = true)
 |-- total_

In [5]:
columns = data.columns

### Check Null Values

In [6]:
import pyspark.sql.functions as f

In [7]:
# Check null values
data_null = data.agg(*[f.count(f.when(f.isnull(c), c)).alias(c) for c in data.columns[:]])
null_values = data_null.head(1)[0]
null_values.asDict()

{'loan_amnt': 0,
 'funded_amnt': 0,
 'funded_amnt_inv': 0,
 'term': 0,
 'int_rate': 0,
 'installment': 0,
 'grade': 0,
 'emp_length': 0,
 'home_ownership': 0,
 'annual_inc': 0,
 'verification_status': 0,
 'loan_status': 0,
 'pymnt_plan': 0,
 'addr_state': 0,
 'dti': 0,
 'delinq_2yrs': 0,
 'fico_range_low': 0,
 'fico_range_high': 0,
 'pub_rec': 0,
 'revol_bal': 0,
 'revol_util': 0,
 'initial_list_status': 0,
 'total_pymnt': 0,
 'total_pymnt_inv': 0,
 'total_rec_prncp': 0,
 'total_rec_int': 0,
 'application_type': 0,
 'acc_now_delinq': 0,
 'tot_cur_bal': 0,
 'total_rev_hi_lim': 0,
 'avg_cur_bal': 0,
 'bc_open_to_buy': 0,
 'bc_util': 0,
 'chargeoff_within_12_mths': 0,
 'delinq_amnt': 0,
 'mort_acc': 0,
 'num_accts_ever_120_pd': 0,
 'num_actv_bc_tl': 0,
 'num_actv_rev_tl': 0,
 'num_bc_sats': 0,
 'num_bc_tl': 0,
 'num_il_tl': 0,
 'num_op_rev_tl': 0,
 'num_rev_accts': 0,
 'num_rev_tl_bal_gt_0': 0,
 'num_sats': 0,
 'pct_tl_nvr_dlq': 0,
 'pub_rec_bankruptcies': 0,
 'tax_liens': 0,
 'tot_hi_cre

# EDA

In [8]:
from pyspark.sql import functions as f
from pyspark.sql.functions import desc

For simplicity we will be only considering applicants that have taken loan Individually (where application_type = 'Individual')

In [9]:
data_individual = data.filter(data['application_type'] == 'Individual').drop('application_type')
data_individual.count()

2215254

Top 10 states with the applicants

In [10]:
# addr_state
data_individual.groupBy('addr_state').count().orderBy(desc('count')).show(10)

+----------+------+
|addr_state| count|
+----------+------+
|        CA|310445|
|        TX|185967|
|        NY|183661|
|        FL|157411|
|        IL| 90197|
|        NJ| 83029|
|        PA| 74536|
|        GA| 73234|
|        OH| 72791|
|        VA| 61793|
+----------+------+
only showing top 10 rows



Majority of applicants are from California

In [11]:
# loan_status
data.groupBy('loan_status').count().show(truncate = False)

+------------------+-------+
|loan_status       |count  |
+------------------+-------+
|Fully Paid        |980714 |
|Default           |36     |
|In Grace Period   |10313  |
|Charged Off       |238633 |
|Late (31-120 days)|23993  |
|Current           |1104088|
|Late (16-30 days) |4856   |
+------------------+-------+



### Meaning of different loan status:
* Current: Loan is up to date on all outstanding payments. 
* In Grace Period: Loan is past due but within the 15-day grace period.  
* Late (16-30): Loan has not been current for 16 to 30 days.
* Late (31-120): Loan has not been current for 31 to 120 days.
* Fully paid: Loan has been fully repaid, either at the expiration of the 3- or 5-year year term or as a result of a prepayment.
* Default: Loan has not been current for an extended period of time.
* Charged Off: Loan for which there is no longer a reasonable expectation of further payments. Upon Charge Off, the remaining principal balance of the Note is deducted from the account balance.

### Difference between "Charged Off" and "Default"
Loans that are in "Default" are loans for which borrowers have failed to make payments for an extended period of time.
 
A loan becomes “Charged Off” when there is no longer a reasonable expectation of further payments.  Charge Off typically occurs when a loan is 120 days or more past due and there is no reasonable expectation of sufficient payment to prevent the charge off.  In certain circumstances, loans may be charged off at an earlier or later date.
 
A loan that is in “Default” will still appear in your Notes, in the status of “Default,” while a loan that has been “Charged Off” will appear as charged off, and the remaining principal balance of the Note will be deducted from your account balance.

Note: For simplicity, not considerating the applicants who "Does not meet the credit policy."

### Analysing how many applicants currently paying the loan might become defaulter and separating out the applicants with "loan_status" as "Current"

In [12]:
# The data with "current" status will later be use for evaluation purpose.
df_current = data_individual.filter(data["loan_status"] == "Current")

In [13]:
total_current_applicants = df_current.count()

In [14]:
df = data_individual.filter(data["loan_status"] != 'Current')
df.select('loan_status').distinct().show(truncate=False)
total_applicant = df.count()

+------------------+
|loan_status       |
+------------------+
|Fully Paid        |
|Default           |
|In Grace Period   |
|Charged Off       |
|Late (31-120 days)|
|Late (16-30 days) |
+------------------+



###### To predict the risk whether an applicatant will pay the loan or not, considering "loan_status" in 3 categories: 

* safe : no risk in repayment (Applicant will fully pay the loan without any distress)
* low : low risk (Applicant might face some distress during the repayment of the loan)
* high : high risk (Applicant might face high distress inhibiting the ability to repay the loan)

In [15]:
df = df.withColumn('default_risk', f.when(f.col('loan_status') == "Fully Paid", 'safe')
                   .when(((f.col('loan_status') == "In Grace Period") | 
                                   (f.col('loan_status') == "Late (31-120 days)") |
                         (f.col('loan_status') == 'Late (16-30 days)')) , "low")
                   .otherwise("high"))

In [16]:
df.select('loan_status', 'default_risk').distinct().orderBy('default_risk').show()

+------------------+------------+
|       loan_status|default_risk|
+------------------+------------+
|       Charged Off|        high|
|           Default|        high|
| Late (16-30 days)|         low|
|   In Grace Period|         low|
|Late (31-120 days)|         low|
|        Fully Paid|        safe|
+------------------+------------+



Analysis of data:

In [17]:
df.describe(['annual_inc', 'loan_amnt', 'int_rate', 'dti', 'mort_acc']).show()

+-------+-----------------+------------------+------------------+------------------+------------------+
|summary|       annual_inc|         loan_amnt|          int_rate|               dti|          mort_acc|
+-------+-----------------+------------------+------------------+------------------+------------------+
|  count|          1228636|           1228636|           1228636|           1228636|           1228636|
|   mean| 78695.5284027327|14740.847777535413|13.303155751592813|18.213233423080506|1.6445106606024893|
| stddev|53904.98488219384| 8770.525798898754| 4.816573433807678| 8.302813179558434|1.9840999102619745|
|    min|           2000.0|            1000.0|              5.31|               1.0|               0.0|
|    max|        4000000.0|           40000.0|             30.99|             49.96|              47.0|
+-------+-----------------+------------------+------------------+------------------+------------------+



In [18]:
df.groupBy('emp_length').count().orderBy(desc('count')).show()

+----------+------+
|emp_length| count|
+----------+------+
| 10+ years|435362|
|   2 years|117840|
|   3 years|104282|
|  < 1 year|101352|
|    1 year| 86050|
|   5 years| 80768|
|   4 years| 77105|
|   6 years| 60118|
|   8 years| 59046|
|   7 years| 57346|
|   9 years| 49367|
+----------+------+



Majority of the applicants have more than 10 years of employement.

### Analysing Applicants that are 'safe'

In [19]:
risk_safe = df.filter(df['default_risk'] == 'safe')

In [20]:
total_risk_safe = risk_safe.count()

In [21]:
risk_safe.groupBy('emp_length').count().orderBy(desc('count')).withColumn('percentage', 
                                                                          (f.col('count') / total_risk_safe)*100).show()

+----------+------+------------------+
|emp_length| count|        percentage|
+----------+------+------------------+
| 10+ years|344847| 35.89038273000331|
|   2 years| 91502|  9.52318506630698|
|   3 years| 80860| 8.415605609293593|
|  < 1 year| 77718| 8.088598030461037|
|    1 year| 66154|  6.88506027055662|
|   5 years| 62902|6.5466043041774125|
|   4 years| 59819| 6.225737224119879|
|   6 years| 47124| 4.904489224985793|
|   8 years| 46177|4.8059290158341605|
|   7 years| 45097| 4.693526665376121|
|   9 years| 38634| 4.020881858885093|
+----------+------+------------------+



35% of the total applicants that have fully paid the loan and have employment length more than 10 years.

In [22]:
risk_safe.groupBy('addr_state').count().orderBy(desc('count')).withColumn('percentage', 
                                                                          (f.col('count') / total_risk_safe)*100).show(5)

+----------+------+------------------+
|addr_state| count|        percentage|
+----------+------+------------------+
|        CA|140475|14.620111278326952|
|        TX| 80019| 8.328077482686917|
|        NY| 75810| 7.890020544651834|
|        FL| 65768| 6.844886837892913|
|        IL| 38042|  3.95926871863402|
+----------+------+------------------+
only showing top 5 rows



In [23]:
# term
risk_safe.groupBy('term').count().withColumn('percentage', (f.col('count') / total_risk_safe)*100).show()

+---------+------+------------------+
|     term| count|        percentage|
+---------+------+------------------+
|36 months|761950| 79.30089901065116|
|60 months|198884|20.699100989348835|
+---------+------+------------------+



79% of applicants preferred 36 months term period.

In [24]:
# home_ownership
risk_safe.groupBy('home_ownership').count().withColumn('percentage', 
                                                       (f.col('count') / total_risk_safe)*100).orderBy(
                                                                                            desc('count')).show()

+--------------+------+--------------------+
|home_ownership| count|          percentage|
+--------------+------+--------------------+
|      MORTGAGE|493305|   51.34133471546593|
|          RENT|367241|   38.22106628200084|
|           OWN| 99754|  10.382022284806741|
|           ANY|   467| 0.04860360894805971|
|          NONE|    34|0.003538592514419...|
|         OTHER|    33|0.003434516263995...|
+--------------+------+--------------------+



Majority of the applicants have mortaged their home.

In [25]:
# verification_status
risk_safe.groupBy('verification_status').count().orderBy(desc('count')).show()

+-------------------+------+
|verification_status| count|
+-------------------+------+
|    Source Verified|387522|
|       Not Verified|313583|
|           Verified|259729|
+-------------------+------+



### Applicants on 'High' risk

In [26]:
risk_high = df.filter(df['default_risk'] == 'high')

In [27]:
total_risk_high = risk_high.count()

In [28]:
risk_high.groupBy('emp_length').count().orderBy(
    desc('count')).withColumn('percentage', 
                               (f.col('count') / total_risk_high)*100).show()

+----------+-----+------------------+
|emp_length|count|        percentage|
+----------+-----+------------------+
| 10+ years|78987| 33.88502078480328|
|   2 years|22774| 9.769930030930533|
|   3 years|20313|  8.71417356275981|
|  < 1 year|20076| 8.612501769603995|
|    1 year|17250|  7.40016216007516|
|   5 years|15452| 6.628829315796021|
|   4 years|14835|6.3641394576646375|
|   8 years|11519| 4.941592343298885|
|   6 years|11341| 4.865231249705066|
|   7 years|10909| 4.679905449522314|
|   9 years| 9647| 4.138513875840294|
+----------+-----+------------------+



Similar to applicants that are 'safe', applicants at high risk also have majority of the applicants with more than 10 years of experience.

In [29]:
risk_high.groupBy('addr_state').count().orderBy(desc('count')).withColumn('percentage', 
                                                                          (f.col('count') / total_risk_high)*100).show(5)

+----------+-----+------------------+
|addr_state|count|        percentage|
+----------+-----+------------------+
|        CA|33443| 14.34687670257354|
|        NY|21244| 9.113567821949955|
|        TX|19470| 8.352530855458745|
|        FL|17449| 7.485532146733418|
|        NJ| 9059|3.8862648700359927|
+----------+-----+------------------+
only showing top 5 rows



Majority of the applicants are from California.

In [30]:
# term
risk_high.groupBy('term').count().show()

+---------+------+
|     term| count|
+---------+------+
|36 months|139029|
|60 months| 94074|
+---------+------+



In [31]:
# home_ownership
risk_high.groupBy('home_ownership').count().withColumn('percentage', 
                               (f.col('count') / total_risk_high)*100).orderBy(desc('count')).show()

+--------------+------+--------------------+
|home_ownership| count|          percentage|
+--------------+------+--------------------+
|          RENT|109042|   46.77846273964728|
|      MORTGAGE| 99357|   42.62364705730943|
|           OWN| 24590|  10.548984783550619|
|           ANY|   100|0.042899490783044406|
|         OTHER|     8|0.003431959262643...|
|          NONE|     6|0.002573969446982664|
+--------------+------+--------------------+



Majority of the applicants at high risk rented the house.

In [32]:
# verification_status
risk_high.groupBy('verification_status').count().orderBy(desc('count')).show()

+-------------------+------+
|verification_status| count|
+-------------------+------+
|    Source Verified|101290|
|           Verified| 79511|
|       Not Verified| 52302|
+-------------------+------+



## Comparing applicants with 'safe' and 'high' risk

In [33]:
risk_safe.describe(['tot_cur_bal', 'dti', 'fico_range_low', 'fico_range_high']).show()

+-------+------------------+-----------------+-----------------+------------------+
|summary|       tot_cur_bal|              dti|   fico_range_low|   fico_range_high|
+-------+------------------+-----------------+-----------------+------------------+
|  count|            960834|           960834|           960834|            960834|
|   mean|149146.79575035855|17.75943490759058|697.4491847707304| 701.4493127845185|
| stddev|163503.35628624592|8.182468059114372| 32.1849966404692|32.185585498192346|
|    min|               0.0|              1.0|            660.0|             664.0|
|    max|         5445012.0|            49.96|            845.0|             850.0|
+-------+------------------+-----------------+-----------------+------------------+



In [34]:
risk_high.describe(['tot_cur_bal', 'dti', 'fico_range_low', 'fico_range_high']).show()

+-------+------------------+-----------------+-----------------+-----------------+
|summary|       tot_cur_bal|              dti|   fico_range_low|  fico_range_high|
+-------+------------------+-----------------+-----------------+-----------------+
|  count|            233103|           233103|           233103|           233103|
|   mean|121889.50687035345|19.99958726399929|687.0949108334084|691.0949666027465|
| stddev|136314.01115054343|8.500327836422429|25.07914407358612| 25.0794963232977|
|    min|               0.0|              1.0|            660.0|            664.0|
|    max|         3437283.0|            49.92|            845.0|            850.0|
+-------+------------------+-----------------+-----------------+-----------------+



* As we can see, the average total current balance ('tot_cur_bal') of applicants is higher for the applicants that are safe.
* The Depth to income ratio ('dti) on average is higher for applicants at high risk, i.e., these applicants are earning less campared to their debt
* Applicants with risk safe have higher FICO score as compare to applicants to high risk


In [35]:
risk_safe.describe(['delinq_2yrs', 'pub_rec', 'delinq_amnt', 'pub_rec_bankruptcies']).show()

+-------+------------------+-------------------+------------------+--------------------+
|summary|       delinq_2yrs|            pub_rec|       delinq_amnt|pub_rec_bankruptcies|
+-------+------------------+-------------------+------------------+--------------------+
|  count|            960834|             960834|            960834|              960834|
|   mean| 0.319005155937446|0.20838771317417992|14.194593446942969| 0.12985489689165872|
| stddev|0.8786005595481192| 0.5913702714030439| 773.6206934564974|  0.3713156286661652|
|    min|                 0|                  0|               0.0|                 0.0|
|    max|                39|                 63|          185408.0|                12.0|
+-------+------------------+-------------------+------------------+--------------------+



In [36]:
risk_high.describe(['delinq_2yrs', 'pub_rec', 'delinq_amnt', 'pub_rec_bankruptcies']).show()

+-------+-------------------+-------------------+------------------+--------------------+
|summary|        delinq_2yrs|            pub_rec|       delinq_amnt|pub_rec_bankruptcies|
+-------+-------------------+-------------------+------------------+--------------------+
|  count|             233103|             233103|            233103|              233103|
|   mean|0.36051445069347027|0.24633745597439757|20.694813022569424| 0.15174836874686298|
| stddev| 0.9512137192283301| 0.6653008059429222|1026.5632144741928| 0.40478093739772136|
|    min|                  0|                  0|               0.0|                 0.0|
|    max|                 27|                 86|          249925.0|                11.0|
+-------+-------------------+-------------------+------------------+--------------------+



* Applicants with higher risk have more past due amount owed for the accounts on which the borrower is now delinquent ('delinq_amnt') as compare to applicants that are safe.
* Applicants with higher risk have more public record for bankruptcies('pub_rec_bankruptcies') and number of derogatory public records ('pub_rec') as compare to safe applicants

## Indexing 'default_risk'

In [37]:
from pyspark.ml.feature import StringIndexer

In [38]:
# Convert loan_default into label indices using the StringIndexer
label_stringIndex = StringIndexer(inputCol="default_risk", outputCol="label")
df_label = label_stringIndex.fit(df).transform(df)
df_label.select('label', 'default_risk').distinct().show()

+-----+------------+
|label|default_risk|
+-----+------------+
|  1.0|        high|
|  2.0|         low|
|  0.0|        safe|
+-----+------------+



### Split data into training and test dataset

In [39]:
train_data, test_data = df_label.randomSplit([0.7, 0.3])

In [40]:
train_count = train_data.count()

In [41]:
train_data.groupBy('default_risk').count().withColumn('percentage', 
                                                             (f.col('count')*100/train_count)).show()

+------------+------+-----------------+
|default_risk| count|       percentage|
+------------+------+-----------------+
|         low| 24223|2.817512474846754|
|        safe|672497| 78.2218836146232|
|        high|163010|18.96060391053005|
+------------+------+-----------------+



The data is skewed with approx 19% data of applicants at high risk, 78% safe and 3% applicants at lower risk which makes the data highly Unbalanced. Oversampling the applicants data that are at 'low' risk  and undersampling the applicants that are'safe' can help balance the data.

### Separating out different risk categories

In [42]:
high = train_data.filter(train_data['default_risk'] == 'high')
high_count = high.count()

In [43]:
low = train_data.filter(train_data['default_risk'] == 'low')
low_count = low.count()

In [44]:
safe = train_data.filter(train_data['default_risk'] == 'safe')
safe_count = safe.count()

### Oversampling data from "low" risk status

In [45]:
from pyspark.sql.functions import explode, array, lit

oversample_ratio = int(high_count / low_count)
low_range = range(oversample_ratio)
oversampled_low = low.withColumn("dummy", explode(array([lit(x) for x in low_range]))).drop('dummy')
print(f'"low" risk count after oversampling {oversampled_low.count()}')

"low" risk count after oversampling 145338


### Undersampling data from "safe" risk status

In [46]:
undersample_ratio = float(high_count / safe_count)
undersample_safe = safe.sample(withReplacement = False, fraction = undersample_ratio)
print(f'"safe" count after undersampling {undersample_safe.count()}')

"safe" count after undersampling 162806


### Joining all the dataframes

In [47]:
final_train_df = oversampled_low.union(undersample_safe).union(high)
final_train_df.groupBy('default_risk').count().withColumn('percentage', 
                                                             (f.col('count')*100/train_count)).show()

+------------+------+------------------+
|default_risk| count|        percentage|
+------------+------+------------------+
|         low|145338|16.905074849080524|
|        safe|162806|18.936875530689868|
|        high|163010| 18.96060391053005|
+------------+------+------------------+



## Creating the Pipeline

In [48]:
from pyspark.ml.feature import VectorIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline

In [49]:
categorical_columns = ['term', 'grade',
                       'emp_length', 'home_ownership', 
                       'verification_status', 'pymnt_plan',
                       'addr_state', 'initial_list_status']

In [50]:
numeric_columns=['loan_amnt', 'funded_amnt', 'funded_amnt_inv',
                 'int_rate', 'installment', 'annual_inc',
                 'dti', 'delinq_2yrs', 'fico_range_low',
                 'fico_range_high', 'pub_rec', 'revol_bal',
                 'revol_util', 'total_pymnt', 'total_pymnt_inv',
                 'total_rec_prncp', 'total_rec_int', 'acc_now_delinq',
                 'tot_cur_bal', 'total_rev_hi_lim', 'avg_cur_bal',
                 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths',
                 'delinq_amnt', 'mort_acc', 'num_accts_ever_120_pd',
                 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats',
                 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
                 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
                 'pct_tl_nvr_dlq', 'pub_rec_bankruptcies', 'tax_liens',
                 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
                 'total_il_high_credit_limit']

In [51]:
columns = final_train_df.columns

## Defining Pipeline

In [52]:
stages = [] # stages in our Pipeline
for categoricalCol in categorical_columns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index").setHandleInvalid("skip")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

In [53]:
assemblerInputs = [c + "classVec" for c in categorical_columns] + numeric_columns
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [54]:
stages

[StringIndexer_70b6470bda3d,
 OneHotEncoder_5d83bbd3c0bc,
 StringIndexer_6fc41add62fa,
 OneHotEncoder_b1f3fa507b3a,
 StringIndexer_d0157ec10ea0,
 OneHotEncoder_9d587c6396dc,
 StringIndexer_08bafa4cf3a5,
 OneHotEncoder_951d7f2f109c,
 StringIndexer_af280ed71c1b,
 OneHotEncoder_33677ab5838c,
 StringIndexer_1b86e1b16315,
 OneHotEncoder_4f6fc6e0eaba,
 StringIndexer_b3dd542f4a9b,
 OneHotEncoder_09d6066d328b,
 StringIndexer_b37e5a7e406b,
 OneHotEncoder_1ebe5a8fee94,
 VectorAssembler_2a34bd1ac58b]

In [55]:
pipeline = Pipeline().setStages(stages)
pipelineModel = pipeline.fit(final_train_df)

### Preparing train and test data

In [56]:
train_prep_df = pipelineModel.transform(final_train_df)
test_prep_df = pipelineModel.transform(test_data)

In [57]:
# Keeping relevant columns
selected_columns = ["features"] + columns
train_prep_df = train_prep_df.select(selected_columns)

test_prep_df = test_prep_df.select(selected_columns)

## Models

Identifing which model will be best for the prediction. For this we are using 3 classification Algorithms:
* Logistic Regression
* Decision Tree
* Random Forest

### Logistic Regression

In [58]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol='label')

# fit the model
lr_model = lr.fit(train_prep_df)

In [59]:
# model summary
training_summary = lr_model.summary
training_summary.accuracy

0.8459696829486749

In [60]:
# Predict
lr_predictions = lr_model.transform(test_prep_df)

**Model Accuracy**

Checking model accuracy:

In [61]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [62]:
# MulticlassClassificationEvaluator for calculating Accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label',
                                             metricName='accuracy')
evaluator.evaluate(lr_predictions)

0.9452651895062699

## Decision Tree

In [63]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

In [64]:
dt = DecisionTreeClassifier()

# Fit
dt_model = dt.fit(train_prep_df)

In [65]:
# Predict
dt_predictions = dt_model.transform(test_prep_df)

In [66]:
dt_model.featureImportances

SparseVector(118, {74: 0.0437, 75: 0.2191, 77: 0.0316, 79: 0.0161, 89: 0.0157, 90: 0.5858, 91: 0.088})

**Model Accuracy**

In [67]:
evaluator.evaluate(dt_predictions)

0.9179357343063003

## Random Forest

Predicting on different number of trees

In [68]:
numTrees = [50, 100, 200, 300, 400]
for num in numTrees:
    rf = RandomForestClassifier(numTrees=num)
    
    # model
    rf_model = rf.fit(train_prep_df)
    
    # Predict
    rf_predictions = rf_model.transform(test_prep_df)
    
    # Accuracy
    acc = evaluator.evaluate(rf_predictions)
    print(f'Accuracy with number of trees {num} is: {acc}')

Accuracy with number of trees 50 is: 0.8887521482437261
Accuracy with number of trees 100 is: 0.8823196152949532
Accuracy with number of trees 200 is: 0.8923058990637182
Accuracy with number of trees 300 is: 0.8876461754484882
Accuracy with number of trees 400 is: 0.8895003063110928


## Evaluating whether the applicant, who is under current status, will face any financial distress in the future i.e. in the context of repaying the loan

In [69]:
predicted_current_df = pipelineModel.transform(df_current)

### Using Logistic Regression

In [70]:
# Predict
lr_risk_prediction = lr_model.transform(predicted_current_df)

In [71]:
from pyspark.ml.feature import IndexToString 

converter = IndexToString(inputCol="prediction", outputCol="predicted_label", labels=['safe', 'high', 'low'])


In [72]:
original_label_df = converter.transform(lr_risk_prediction)
original_label_df.groupBy('prediction', "predicted_label").count().show()

+----------+---------------+------+
|prediction|predicted_label| count|
+----------+---------------+------+
|       1.0|           high|194151|
|       2.0|            low|760471|
|       0.0|           safe| 31996|
+----------+---------------+------+



In [73]:
high_risk_applicants = original_label_df.filter(original_label_df['predicted_label'] == 'high').count()

In [74]:
print(f"percentage of people that may default the loan : {(high_risk_applicants / total_current_applicants)*100 }")

percentage of people that may default the loan : 19.6784368418172
