In [1]:
# Importing Dependencies (given to us)
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Importing the generated data that was compiled into the csv in the Resources folder (also given to us)
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [6]:
# (Displaying the data)
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [7]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [12]:
# Convert categorical data to numeric and separate target feature for training data (basically removing most columns that
# are string-based rather than number-based, for both datasets)
# [As an extra note, any alteration was made through a dictionary rather than a bin or a list, for more static conversions]
dict_answer = {"N":0, "Y":1}
train_df_altered = train_df.replace({"hardship_flag":dict_answer, "debt_settlement_flag":dict_answer})

dict_ownership = {"ANY":0, "RENT":1, "MORTGAGE":2, "OWN":3}
train_df_altered = train_df_altered.replace({"home_ownership":dict_ownership})

dict_verify = {"Not Verified":0, "Source Verified":1,"Verified":1} # (This was kept as "1" as verification type did not
                                                                   # matter, as long as it was verified to begin with)
train_df_altered = train_df_altered.replace({'verification_status':dict_verify})

dict_loan = {"low_risk":0, "high_risk":1}
train_df_altered = train_df_altered.replace({"loan_status":dict_loan})

dict_status = {"w":0, "f":1}
train_df_altered = train_df_altered.replace({"initial_list_status":dict_status})

dict_application = {"Joint App":0, "Individual":1}
train_df_altered = train_df_altered.replace({"application_type":dict_application})

train_df_altered = train_df_altered.drop(["index","pymnt_plan"],axis="columns") # (The payment plan column was dropped as
                                                                                # it is unimportant to credit risk)

train_df_altered = train_df_altered.drop(["Unnamed: 0"],axis="columns") # (Dropping essentially null values)

# (May or may not be necessary, but nice to have)
altered_data_path = Path("Resources/Altered Credit Data (2019).csv")
train_df_altered.to_csv(altered_data_path, index=False)

In [13]:
# (Display the new 2019 data)
train_df_altered.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,13375.0,0.1797,483.34,2,223000.0,0,0,29.99,0.0,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,0,0
1,21000.0,0.1308,478.68,2,123000.0,1,0,11.26,2.0,0.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,0,0
2,20000.0,0.124,448.95,2,197000.0,1,0,11.28,0.0,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,0,0
3,3000.0,0.124,100.22,1,45000.0,0,0,18.08,0.0,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,0,0
4,30000.0,0.1612,1056.49,2,133000.0,1,0,27.77,0.0,2.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,0,0


In [19]:
# (Creating an x and y dataframe set that can be used for the analysis later on)
train_x = train_df_altered.drop("loan_status", axis=1)
train_y = train_df_altered["loan_status"].values # (Since the null values were dropped earlier, no dummy variables need to
print(train_x.select_dtypes(include=[object]))   # be created for this dataframe) [this is just to show its working]

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[12180 rows x 0 columns]


In [20]:
# Convert categorical data to numeric and separate target feature for testing data (basically doing the exact same thing
# as last time) [Thankfully there is no need to re-declare the dictionaries]
test_df_altered = test_df.replace({"hardship_flag":dict_answer, "debt_settlement_flag":dict_answer})
test_df_altered = test_df_altered.replace({"home_ownership":dict_ownership})
test_df_altered = test_df_altered.replace({'verification_status':dict_verify})
test_df_altered = test_df_altered.replace({"loan_status":dict_loan})
test_df_altered = test_df_altered.replace({"initial_list_status":dict_status})
test_df_altered = test_df_altered.replace({"application_type":dict_application})

test_df_altered = test_df_altered.drop(["index","pymnt_plan"],axis="columns")
test_df_altered = test_df_altered.drop(["Unnamed: 0"],axis="columns")

altered_data_path2 = Path("Resources/Altered Credit Data (2020).csv")
test_df_altered.to_csv(altered_data_path2, index=False)

In [21]:
# (Display the new 2020 data)
test_df_altered.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,40000.0,0.0819,814.7,2,140000.0,0,0,19.75,0.0,1.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,0,0
1,6000.0,0.1524,208.7,1,55000.0,0,0,11.52,2.0,0.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,0,0
2,3600.0,0.1695,128.27,1,42000.0,0,0,6.74,0.0,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,0,0
3,20000.0,0.1524,478.33,1,100000.0,0,0,12.13,0.0,2.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,0,0
4,3600.0,0.124,120.27,1,50000.0,0,0,16.08,0.0,3.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,0,0


In [25]:
# add missing dummy variables to testing set (and testing out the data like last time)
test_df_altered = pd.get_dummies(test_df_altered)

test_x = test_df_altered.drop("loan_status", axis=1)
test_y = test_df_altered["loan_status"].values
print(test_x.select_dtypes(include=[object]))

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[4702 rows x 0 columns]


## Prediction Based on Current Data (Unscaled)
Depsite my limited understanding of how credit works and the nature of credit loans, I think that the "Random Forest Classifier" model will work better in this situation, as the nature of credit and loans are influenced by outside sources and a more linear model may skew the results.

In [30]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
regression_model = LogisticRegression(solver='lbfgs',max_iter=200)
regression_model.fit(train_x, train_y)

print(f"Training Data Score: {regression_model.score(train_x, train_y)}")
print(f"Testing Data Score: {regression_model.score(test_x, test_y)}")

Training Data Score: 0.6522167487684729
Testing Data Score: 0.5159506592939175


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(random_state=1, n_estimators=500).fit(train_x, train_y)

print(f"Training Score: {random_forest.score(train_x, train_y)}")
print(f"Testing Score: {random_forest.score(test_x, test_y)}")

Training Score: 1.0
Testing Score: 0.6412165036154828


## Results from Testing (Unscaled)
According to the results from the Regression model and the Random Forest model, it seems that the Random Forest model performed better than the Regression model. Personally, I am somewhat skeptical of the results as surprisingly the latter gave out the "Training Score" of a 1.0, which seems too perfect. However, the "Testing Score" did recieve a higher number that is closer to 1, so it is more accurate in that regard. 

In [32]:
# Scale the data
from sklearn.preprocessing import StandardScaler
data_scale = StandardScaler()
train_x_scaled = data_scale.fit_transform(train_x)
test_x_scaled = data_scale.fit_transform(test_x)

## Predictions Based on Data (Scaled)
Despite scaling the data, I think that the Random Forest model will still perform better than the Regression model, primarly for the reasons mentioned previously.

In [33]:
# Train the Logistic Regression model on the scaled data and print the model score
regression_model.fit(train_x_scaled, train_y)

print(f"Training (Scaled) Data Score: {regression_model.score(train_x_scaled, train_y)}")
print(f"Testing (Scaled) Data Score: {regression_model.score(test_x_scaled, test_y)}")

Training (Scaled) Data Score: 0.7061576354679803
Testing (Scaled) Data Score: 0.6601446193109315


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
# Train a Random Forest Classifier model on the scaled data and print the model score (values were delcared previously)
print(f"Training (Scaled) Score: {random_forest.score(train_x_scaled, train_y)}")
print(f"Testing (Scaled) Score: {random_forest.score(test_x_scaled, test_y)}")

Training (Scaled) Score: 0.5
Testing (Scaled) Score: 0.5


## Results From Testing (Scaled)
To my surprise, the scaled testing results show that the the Logistic Regression model provided better results when compared to the Random Forest model. Oddly the latter model's results are exactly the same and perfect numbers, but compared to unscaled, the Regression model saw some improvement to its accuracy. 

## Conclusion
Based on the analyzed data alone, I will say that using scaled Logistic Regression models on the credit card data proves to be the most accurate (not including the unscaled 1.0 result as I have my personal skepticism on it) in terms of understanding whether or not a loan from "LendingClub" will result in a high risk endeavour. Personally, this data alone is not sufficient in being applicable towards real life circumstances (partially based on my lack in understanding credit functionalities) and the fact that external forces can skew the results in ways previously thought not possible. However, it is good as a basis for understanding the basic concepts in what is possible.