# Credit Risk Evaluator

In [28]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [29]:
# Load the lending dataset into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
lending_csv = Path('Resources/lending_data.csv')
df = pd.read_csv(lending_csv)

## Predict Model Performance


In [30]:
# Split the data into X_train, X_test, y_train, y_test
# Features are every column except the target; the target is 'loan_status'.
X = df.drop(columns='loan_status')
y = df['loan_status']

# random_state is pinned so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Logistic regression classifier with default hyperparameters.
# It is only constructed here; fitting happens in a later cell.
model = LogisticRegression()
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# Standardize the features. The scaler learns its statistics from the training
# split only, and that same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
# Train a Logistic Regression model and print the model score
# NOTE(review): the model is fit on the unscaled features; the X_train_scaled /
# X_test_scaled arrays built in the previous cell are not used here. Confirm
# whether that is intentional before comparing against the scaled-data model.
model.fit(X_train, y_train)

# Score each split once and reuse the values for the report below.
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print('Logistic Regression:')
print(f"Training Model Score: {training_score}")
print(f"Testing Model Score: {testing_score}")
# A small positive delta means the model does only slightly better on data it has seen.
print(f'Score Delta: {training_score - testing_score}')

Linear Regression:
Training Model Score: 0.9921240885954051
Testing Model Score: 0.9918489475856377
Score Delta: 0.0002751410097674434


In [33]:
# Train a Random Forest Classifier model and print the model score
# X_train_scaled / X_test_scaled come from the StandardScaler cell earlier in the
# notebook; re-fitting the scaler here would only recompute identical arrays.

# random_state is pinned so the forest is reproducible; 500 trees are grown.
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)

# Score each split once and reuse the values: every clf.score call runs a
# prediction pass through the whole forest.
clf_training_score = clf.score(X_train_scaled, y_train)
clf_testing_score = clf.score(X_test_scaled, y_test)

print('RandomForestClassifier')
print(f'Training Classifier Score: {clf_training_score}')
print(f'Testing Classifier Score: {clf_testing_score}')
print(f'Score Delta: {clf_training_score - clf_testing_score}')

RandomForestClassifier
Training Classifier Score: 0.9975409272252029
Testing Classifier Score: 0.9917457697069748
Score Delta: 0.005795157518228122


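Both models scored almost identically on the test set (Logistic Regression: 0.9918, Random Forest Classifier: 0.9917), so accuracy alone does not separate them. The Logistic Regression model is the better choice mainly because it is less computationally expensive: it ran about 13 seconds quicker than the Random Forest Classifier. It also showed a smaller gap between its training and testing scores (0.0003 vs. 0.0058), which suggests less overfitting.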