# Importing Libraries

In [22]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Reading data 

In [2]:
# Load the raw loan training set into `ld`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists. Consider a configurable data directory.
data_path = 'C:/Users/Jay/Desktop/Python/Projects/Loans_LinearModel/loan_data_train.csv'
ld = pd.read_csv(data_path)

# Preview the first rows as the cell output.
ld.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,79542.0,25000,25000.0,18.49%,60 months,debt_consolidation,27.56%,VA,MORTGAGE,8606.56,720-724,11,15210,3.0,5 years
1,75473.0,19750,19750.0,17.27%,60 months,debt_consolidation,13.39%,NY,MORTGAGE,6737.5,710-714,14,19070,3.0,4 years
2,67265.0,2100,2100.0,14.33%,36 months,major_purchase,3.50%,LA,OWN,1000.0,690-694,13,893,1.0,< 1 year
3,80167.0,28000,28000.0,16.29%,36 months,credit_card,19.62%,NV,MORTGAGE,7083.33,710-714,12,38194,1.0,10+ years
4,17240.0,24250,17431.82,12.23%,60 months,credit_card,23.79%,OH,MORTGAGE,5833.33,730-734,6,31061,2.0,10+ years


# Data cleaning 

In [3]:
# Percentage columns arrive as text such as "18.49%"; remove the sign so they can be
# parsed as plain numbers below.
percent_columns = ["Interest.Rate", "Debt.To.Income.Ratio"]
for name in percent_columns:
    ld[name] = ld[name].astype("str").str.replace("%", "", regex=False)

# Parse every numeric column; anything that is not a valid number becomes NaN
# (errors="coerce") and is removed later by the notebook's dropna step.
numeric_columns = [
    "Amount.Requested",
    "Amount.Funded.By.Investors",
    "Open.CREDIT.Lines",
    "Revolving.CREDIT.Balance",
    "Inquiries.in.the.Last.6.Months",
] + percent_columns
for name in numeric_columns:
    ld[name] = pd.to_numeric(ld[name], errors="coerce")

# Exploratory look at the distinct loan terms (not displayed, since it is not the
# last expression of the cell).
ld["Loan.Length"].value_counts()

# One indicator column per distinct loan term.
ll_dummies = pd.get_dummies(ld["Loan.Length"])
ll_dummies.head()

# Keep only the "36 months" indicator as LL_36 and discard the original text column.
ld = ld.assign(LL_36=ll_dummies["36 months"]).drop(columns="Loan.Length")

# Collapse the raw loan purposes into four coarse groups before one-hot encoding.
# Purposes that are not listed (e.g. "renewable_energy") keep their original label.
# NOTE(review): the grouping was presumably chosen from the mean interest rate per
# purpose (ld.groupby("Loan.Purpose")["Interest.Rate"].mean()) — confirm with the author.
purpose_groups = {
    "cem": ["car", "educational", "major_purchase"],
    "hmvw": ["home_improvement", "medical", "vacation", "wedding"],
    "chos": ["credit_card", "house", "other", "small_business"],
    "dm": ["debt_consolidation", "moving"],
}
purpose_to_group = {
    purpose: group
    for group, purposes in purpose_groups.items()
    for purpose in purposes
}

# Vectorised relabelling: replaces the per-row Python loop, which used chained label
# indexing (ld[...][i]) and therefore only worked on a default 0..n-1 index.
ld["Loan.Purpose"] = ld["Loan.Purpose"].replace(purpose_to_group)

# One indicator column per (grouped) purpose, named LP_<label>.
lp_dummies = pd.get_dummies(ld["Loan.Purpose"], prefix="LP")

# axis/columns are passed by keyword: pandas 2.0 no longer accepts them positionally.
ld = pd.concat([ld, lp_dummies], axis=1)

# Drop the source column plus the LP_renewable_energy indicator
# (presumably so that category serves as the reference level — TODO confirm).
ld = ld.drop(columns=["Loan.Purpose", "LP_renewable_energy"])

# State is not used as a feature and is removed outright.
# columns= is used because pandas 2.0 rejects the positional axis form drop(labels, 1).
ld = ld.drop(columns=["State"])

# Encode home ownership as two 0/1 indicators; any value other than MORTGAGE or RENT
# ends up with both indicators at 0.
ld["ho_mort"] = np.where(ld["Home.Ownership"] == "MORTGAGE", 1, 0)
ld["ho_rent"] = np.where(ld["Home.Ownership"] == "RENT", 1, 0)
ld = ld.drop(columns=["Home.Ownership"])

# FICO.Range holds text such as "720-724"; use the midpoint of the two bounds as a
# single numeric score in the new column "fico".
# str.extract always returns two columns (0 = lower bound, 1 = upper bound) and yields
# NaN for missing or malformed ranges, where the former per-row x.split(...) raised on
# NaN and the zip(*...) unpacking failed on an empty frame.
fico_bounds = (
    ld["FICO.Range"]
    .str.extract(r"^\s*(\d+)\s*-\s*(\d+)\s*$")
    .astype(float)
)
ld["fico"] = 0.5 * (fico_bounds[0] + fico_bounds[1])

# The text column is no longer needed (columns= keyword: pandas 2.0 rejects drop(labels, 1)).
ld = ld.drop(columns=["FICO.Range"])

# Turn employment-length text (e.g. "5 years", "< 1 year", "10+ years", "n/a") into a
# number of years. First strip the unit; "years" must go before "year".
ld["Employment.Length"] = (
    ld["Employment.Length"]
    .astype("str")
    .str.replace("years", "", regex=False)
    .str.replace("year", "", regex=False)
)

# Exploratory: mean interest rate per employment-length label (not displayed mid-cell).
round(ld.groupby("Employment.Length")["Interest.Rate"].mean(), 2)

# Order matters: "n/a" is first folded into "< 1", "10+" is capped at 10, and finally
# "< 1" becomes 0. Anything still non-numeric turns into NaN via errors="coerce".
ld["Employment.Length"] = pd.to_numeric(
    ld["Employment.Length"]
    .str.replace("n/a", "< 1", regex=False)
    .str.replace("10+", "10", regex=False)
    .str.replace("< 1", "0", regex=False),
    errors="coerce",
)

# Discard every row that still has a missing value in any column.
ld = ld.dropna(axis=0)

# Modelling 

In [5]:
# Hold out 20% of the cleaned rows for evaluation; the fixed random_state keeps the
# split identical across runs.
ld_train, ld_test = train_test_split(ld, test_size=0.2, random_state=2)

# Interest.Rate is the target; ID and Amount.Funded.By.Investors are excluded from the
# feature matrix. Defined once so train and test always drop the same columns.
non_feature_columns = ["Interest.Rate", "ID", "Amount.Funded.By.Investors"]

# columns= is used because pandas 2.0 rejects the positional axis form drop(labels, 1).
x_train = ld_train.drop(columns=non_feature_columns)
y_train = ld_train["Interest.Rate"]
x_test = ld_test.drop(columns=non_feature_columns)
y_test = ld_test["Interest.Rate"]

## Linear Regression RMSE: 2.06

In [6]:
# Fit a linear regression on the training split (fit returns the estimator itself).
lm = LinearRegression().fit(x_train, y_train)

# Predict the hold-out rows and measure the error against the true interest rates.
p_test = lm.predict(x_test)
residual = p_test - y_test

# RMSE = sqrt(sum of squared residuals / number of predictions).
squared_error_sum = np.dot(residual, residual)
rmse_lm = np.sqrt(squared_error_sum / len(p_test))

print(rmse_lm)

2.0573495540664464


## Decision Tree RMSE: 2.75

In [9]:
# Baseline regression tree with default (unconstrained) hyperparameters.
# random_state is fixed (same seed as the train/test split) because tree construction
# is randomised; without it the reported RMSE changes from run to run.
dt = DecisionTreeRegressor(random_state=2)

dt.fit(x_train, y_train)

# Predict the hold-out rows and measure the error against the true interest rates.
p_test = dt.predict(x_test)

residual = p_test - y_test

# RMSE = sqrt(sum of squared residuals / number of predictions).
rmse_dt = np.sqrt(np.dot(residual, residual) / len(p_test))

print(rmse_dt)

## Decision Tree with Hyperparameter tuning RMSE: 2.07

In [17]:
# Exhaustive grid search over the tree's split criterion and size constraints.
# NOTE(review): 'mse'/'mae' are the criterion names accepted by the scikit-learn
# release this notebook was run with; newer releases renamed them to
# 'squared_error'/'absolute_error' — adjust if the installed version rejects them.
param_dist = {
    'criterion': ['mse', 'mae'],
    'max_depth': [3, 5, 10, 50, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 5, 10, 15],
    'max_leaf_nodes': [5, 7, 10, 16, 20, 30, 45, 50, 70],
}

# Fixed seed (same as the train/test split) so every candidate and the final model
# are reproducible.
dtree_ht = DecisionTreeRegressor(random_state=2)

grid = GridSearchCV(estimator=dtree_ht, param_grid=param_dist, verbose=1, n_jobs=-1)

grid_result = grid.fit(x_train, y_train)

print(grid_result.best_params_)

# Use the winning model straight from the search instead of retyping its parameters
# by hand (a hand-copied constructor silently goes stale when the grid or data change).
# With GridSearchCV's default refit=True, best_estimator_ is already refit on all of
# x_train with the best parameter combination.
dtree_best = grid_result.best_estimator_

# Predict the hold-out rows and measure the error against the true interest rates.
p_test = dtree_best.predict(x_test)

residual = p_test - y_test

# RMSE = sqrt(sum of squared residuals / number of predictions).
rmse_dt_bt = np.sqrt(np.dot(residual, residual) / len(p_test))

print(rmse_dt_bt)

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 2320 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 3958 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done 4308 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done 4758 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 5308 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 5958 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 6708 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed:  3.8min finished


{'criterion': 'mae', 'max_depth': 10, 'max_leaf_nodes': 70, 'min_samples_leaf': 10, 'min_samples_split': 2}


## Random Forest RMSE: 1.90

In [23]:
# Random forest with default hyperparameters.
# random_state is fixed (same seed as the train/test split) because the forest is
# randomised; without it the reported RMSE changes from run to run.
rf = RandomForestRegressor(random_state=2)

rf.fit(x_train, y_train)

# Predict the hold-out rows and measure the error against the true interest rates.
p_test = rf.predict(x_test)

residual = p_test - y_test

# RMSE = sqrt(sum of squared residuals / number of predictions).
rmse_rf = np.sqrt(np.dot(residual, residual) / len(p_test))

print(rmse_rf)

## Random Forest with Hyperparameter tuning 

In [None]:
# Randomised search over forest size, split criterion, tree-size constraints and
# bootstrapping; n_iter=100 parameter combinations are sampled from this space.
# NOTE(review): 'mse'/'mae' are the criterion names accepted by the scikit-learn
# release this notebook was run with; newer releases renamed them to
# 'squared_error'/'absolute_error' — adjust if the installed version rejects them.
param_dist = {
    'criterion': ['mse', 'mae'],
    'n_estimators': [10, 20, 50, 100],
    'max_depth': [3, 5, 10, 50, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 5, 10, 15],
    'max_leaf_nodes': [5, 7, 10, 16, 20, 30, 45, 50, 70],
    "bootstrap": [True, False],
}

# Base estimator handed to the search; seeded so each candidate fit is reproducible.
rf_best = RandomForestRegressor(random_state=2)

# random_state on the search fixes WHICH 100 combinations are sampled; without it a
# re-run explores a different subset and can report different best parameters.
grid = RandomizedSearchCV(
    estimator=rf_best,
    param_distributions=param_dist,
    verbose=1,
    n_jobs=-1,
    n_iter=100,
    random_state=2,
)

grid_result = grid.fit(x_train, y_train)

print(grid_result.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.1min


In [None]:
# Random forest with hand-picked hyperparameters.
# The earlier version passed criterion='entropy' and class_weight=None, which are
# RandomForestClassifier-only arguments: RandomForestRegressor has no class_weight
# parameter and no 'entropy' criterion, so the constructor/fit raised. Both are
# removed; omitting criterion selects the regressor's default (squared-error) split
# criterion under whatever name the installed scikit-learn uses.
# NOTE(review): confirm these values against the randomized search's best_params_.
rf_best = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=15,
    min_samples_leaf=5,
    max_leaf_nodes=70,
    max_depth=50,
    bootstrap=False,
    random_state=2,  # same seed as the train/test split, for a reproducible RMSE
)

rf_best.fit(x_train, y_train)

# Predict the hold-out rows and measure the error against the true interest rates.
p_test = rf_best.predict(x_test)

residual = p_test - y_test

# RMSE = sqrt(sum of squared residuals / number of predictions).
rmse_rf_best = np.sqrt(np.dot(residual, residual) / len(p_test))

print(rmse_rf_best)