# Credit Risk Evaluator

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

## Retrieve the Data

The data is located in the Challenge Files Folder:

* `lending_data.csv`

Import the data using Pandas. Display the resulting dataframe to confirm the import was successful.

In [2]:
# Import the data


df = pd.read_csv(Path('Resources/lending_data.csv'))
df.head

<bound method NDFrame.head of        loan_size  interest_rate  borrower_income  debt_to_income  \
0        10700.0          7.672            52800        0.431818   
1         8400.0          6.692            43600        0.311927   
2         9000.0          6.963            46100        0.349241   
3        10700.0          7.664            52700        0.430740   
4        10800.0          7.698            53000        0.433962   
...          ...            ...              ...             ...   
77531    19100.0         11.261            86600        0.653580   
77532    17700.0         10.662            80900        0.629172   
77533    17600.0         10.595            80300        0.626401   
77534    16300.0         10.068            75300        0.601594   
77535    15600.0          9.742            72300        0.585062   

       num_of_accounts  derogatory_marks  total_debt  loan_status  
0                    5                 1       22800            0  
1                

In [3]:
X = df.drop("loan_status", axis=1)
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [4]:
y = df["loan_status"]


In [5]:
test_df = pd.read_csv(Path('Resources/lending_data.csv'))
test_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [6]:
train_df = pd.read_csv(Path('Resources/lending_data.csv'))
train_df.head()


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
44602,9500.0,7.155,47900,0.373695,4,0,17900
27125,8300.0,6.635,43000,0.302326,2,0,13000
41462,9900.0,7.342,49700,0.396378,4,0,19700
30762,10200.0,7.444,50600,0.407115,4,1,20600
12669,8500.0,6.734,44000,0.318182,3,0,14000


In [8]:
# using the train test split function
X_train, X_test,y_train, y_test = train_test_split(X,y ,
                                   random_state=104, 
                                   test_size=0.25, 
                                   shuffle=True)
  
 # printing out train and test sets
  
print('X_train : ')
print(X_train.head())
print('')
print('X_test : ')
print(X_test.head())
print('')
print('y_train : ')
print(y_train.head())
print('')
print('y_test : ')
print(y_test.head())

X_train : 
       loan_size  interest_rate  borrower_income  debt_to_income  \
71302    10400.0          7.524            51400        0.416342   
23642     9600.0          7.216            48500        0.381443   
42137     9700.0          7.252            48800        0.385246   
11801    10700.0          7.655            52600        0.429658   
30431    12400.0          8.388            59500        0.495798   

       num_of_accounts  derogatory_marks  total_debt  
71302                4                 1       21400  
23642                4                 0       18500  
42137                4                 0       18800  
11801                5                 1       22600  
30431                6                 1       29500  

X_test : 
       loan_size  interest_rate  borrower_income  debt_to_income  \
49860     9500.0          7.152            47900        0.373695   
21827     7300.0          6.243            39300        0.236641   
38589     8200.0          6.596    

In [9]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

classifier.fit(X_train, y_train)
classifier

Training Data Score: 0.991900536524969
Testing Data Score: 0.991642591828312


LogisticRegression()

In [10]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [11]:

classifier = LogisticRegression(max_iter=1000)
classifier

LogisticRegression(max_iter=1000)

In [12]:
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 0.9972829825285459
Testing Score: 0.991642591828312


In [13]:
# Train a Logistic Regression model and print the model score
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")


Training Data Score: 0.991900536524969
Testing Data Score: 0.991642591828312


In [14]:
print(f'Testing Score: {clf.score(X_test, y_test)}')

Testing Score: 0.991642591828312


## Predict Model Performance

You will be creating and comparing two models on this data: a Logistic Regression, and a Random Forests Classifier. Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct! 

Write down your prediction in the designated cells in your Jupyter Notebook, and provide justification for your educated guess.

*Replace the text in this markdown cell with your predictions, and be sure to provide justification for your guess.*

## Split the Data into Training and Testing Sets

In [None]:
# Split the data into X_train, X_test, y_train, y_test


## Create, Fit and Compare Models

Create a Logistic Regression model, fit it to the data, and print the model's score. Do the same for a Random Forest Classifier. You may choose any starting hyperparameters you like. 

Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the designated markdown cell.

In [None]:
# Train a Random Forest Classifier model and print the model score


*Which model performed better? How does that compare to your prediction? Replace the text in this markdown cell with your answers to these questions.*