In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the dataset.
# The CSV location can be overridden with the LOAN_DATA_PATH environment
# variable so the notebook can run on another machine; when the variable is
# unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "LOAN_DATA_PATH",
    r"C:\Users\linds\Documents\git\loan_history_almost_clean.csv",
)
df = pd.read_csv(DATA_PATH)
df.head(5)

In [None]:
# Number of rows in the loaded dataframe
df.shape[0]

In [None]:
# Is there at least one missing value anywhere in the dataframe?
df.isna().to_numpy().any()

In [None]:
# Which columns have null values? (count of missing entries per column)
df.isna().sum()

In [None]:
# Only a single value is missing, so dropping every row that contains a null
# costs almost no data
df = df.dropna(axis=0, how="any")

In [None]:
# Number of rows remaining after the null rows were dropped
df.shape[0]

## Data must be converted to numerical values; a mapping will be used in this instance

In [None]:
# Encode the letter grades in the 'credit_rating' column as integers.
# NOTE(review): grades E, F and G all map to 5 — confirm this bucketing is
# intentional rather than a typo for 5, 6, 7.
int_credit_rating = {
    "credit_rating": {
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 5,
        "G": 5,
    }
}
df = df.replace(to_replace=int_credit_rating)
df.head()

### Logistic Regression

In [None]:
# Hold out 20% of the rows for testing; every column except 'default' is used
# as a feature and 'default' is the target
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='default'),
    df['default'],
    test_size=0.2,
    random_state=42,
)

# Base estimator: logistic regression with an L1 penalty (liblinear solver)
lr_model = LogisticRegression(solver='liblinear', penalty='l1')

# Candidate hyperparameter values for the search
param_grid = dict(
    C=[0.1, 1, 10, 100],
    class_weight=[None, 'balanced'],
    max_iter=[100, 500, 1000],
)

# Exhaustive grid search, each candidate scored by 5-fold cross-validated accuracy
lr_grid = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the training split only
lr_grid.fit(X_train, y_train)

# Report the winning combination and how it scores on the held-out test split
print("Best hyperparameters: ", lr_grid.best_params_)
print("Test accuracy: ", lr_grid.score(X_test, y_test))

In [None]:
# Evaluate the tuned model. GridSearchCV works on clones of the estimator it is
# given, so `lr_model` itself never received the selected hyperparameters.
# Rebind it to the best estimator, which the grid search has already refit on
# the whole training split (refit=True is the default).
lr_model = lr_grid.best_estimator_

# Predict the classes of the testing set
y_pred = lr_model.predict(X_test)

# Test accuracy, computed from the predictions above instead of predicting twice
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy:.2f}')

# Display the confusion matrix. scikit-learn puts the actual classes on the
# rows and the predicted classes on the columns, with labels in sorted order,
# so for a binary target the layout is:
#
# TN FP
# FN TP
#
# (assumes the larger label of 'default' is the positive class — TODO confirm)
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Pairwise correlations between the dataframe's columns, drawn as an annotated
# heatmap on an 8x8 inch figure
plt.figure(figsize=(8, 8))
sns.heatmap(data=df.corr(), annot=True)

## Not great accuracy, can another model do better?

In [None]:
# Setting up multiple regression models
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

# vanilla linear regression
from sklearn.linear_model import LinearRegression
# decision tree
from sklearn.tree import DecisionTreeRegressor
# random forest
from sklearn.ensemble import RandomForestRegressor
# gradient boost
from sklearn.ensemble import GradientBoostingRegressor
# elastic net
from sklearn.linear_model import ElasticNet
# stochastic gradient descent regression
from sklearn.linear_model import SGDRegressor
# support vector machine
from sklearn.svm import SVR
# bayesian ridge regression
from sklearn.linear_model import BayesianRidge
# kernel ridge
from sklearn.kernel_ridge import KernelRidge
# xgboost
#from xgboost.sklearn import XGBRegressor
# LGBM
#from lightgbm import LGBMRegressor

# measuring how we do 
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# scaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature matrix: the six columns used as model inputs
X = df.loc[:, [
    'amount_of_loan',
    'credit_rating',
    'years_with_bank',
    'rent_mortgage_own',
    'income',
    'age',
]]
# Target vector: the 'default' column
y = df.loc[:, 'default']

In [None]:
# Create a list of regression models to try.
# The estimators that use randomness during fitting are given a fixed
# random_state so the comparison below prints the same numbers on every run.
models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    ElasticNet(),
    SGDRegressor(random_state=42),
    SVR(),
    BayesianRidge(),
    KernelRidge(),
]
models

In [None]:
# Shapes of the training and test feature matrices, one per line
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Loop through each model, train it on the same training data and report its
# error on the test set.
# NOTE(review): at this point X_train / X_test / y_train / y_test are still the
# split created for logistic regression (all columns except 'default'), not a
# split of the X / y defined just above — confirm that is intended.
print('We want low MSE and MAE and an R-squared as close to 1 as possible')

# Flatten the target once, outside the loop. np.ravel() is used instead of
# Series.ravel(), which pandas deprecated in 2.2; it also accepts plain arrays.
y_train_1d = np.ravel(y_train)

for model in models:
    print('\n' + type(model).__name__)
    model.fit(X_train, y_train_1d)

    # make prediction on test set
    y_pred = model.predict(X_test)

    # report effectiveness of model
    # use MSE, MAE and R-2
    # (no confusion matrix, because we are predicting in a range and not classifying to a set)
    print('MSE: {0:.4f}'.format(mean_squared_error(y_test, y_pred)))
    print('MAE: {0:.4f}'.format(mean_absolute_error(y_test, y_pred)))
    print('R-squared: {0:.4f}'.format(r2_score(y_test, y_pred)))

## Random Forest Regressor is displaying the best results; exploring Random Forest further as a classifier

In [None]:
# Split the selected features X and target y into training (80%) and
# validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=13)
# Define the model. The forest is seeded like the split so that the scores and
# feature importances computed from it are reproducible across runs.
forest = RandomForestClassifier(random_state=13)

In [None]:
# Train the forest on the training split. The call is left as the cell's last
# expression, so the notebook displays the value returned by fit().
forest.fit(X_train, y_train)

In [None]:
# 5-fold cross-validated accuracy of the forest, computed on the training split
forest_scores = cross_val_score(
    estimator=forest,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Report the mean of the per-fold accuracy scores
print('Random Forest scores:', np.mean(forest_scores))

## It's not great accuracy, but the hyperparameters can be tuned with a cross-validated grid search

In [None]:
# Hyperparameter values the random-forest grid search will try
forest_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [None]:
# Search forest_param_grid exhaustively, scoring every candidate by 5-fold
# cross-validated accuracy on the training split
forest_grid = GridSearchCV(
    estimator=forest,
    param_grid=forest_param_grid,
    scoring='accuracy',
    cv=5,
)
forest_grid.fit(X_train, y_train)

In [None]:
# Show the winning hyperparameter combination and its mean cross-validated score
for label, value in (
    ('Best hyperparameters:', forest_grid.best_params_),
    ('Best score:', forest_grid.best_score_),
):
    print(label, value)

In [None]:
# Get the feature importances of the forest fitted on the training split
importances = forest.feature_importances_

# Take the feature names from X itself (the frame the training split was cut
# from) rather than retyping them, so the labels cannot drift out of step with
# the order of the importances if the feature selection changes
feature_names = list(X.columns)

# Print each feature next to its importance, to three decimals
for feature_name, importance in zip(feature_names, importances):
    print(f'{feature_name}: {importance:.3f}')

In [None]:
# Bar chart of the random-forest feature importances.

# One colour per bar: the first six colours of matplotlib's default "tab10"
# cycle
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

# Create the figure and axes explicitly so that `fig` exists for the
# tight_layout() call below
fig, ax = plt.subplots()
ax.bar(feature_names, importances, color=colors)

# Add labels and title. A title accepts only a single colour, so the
# per-feature palette is applied to the bars and the title keeps its default.
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Random Forest Feature Importances')

# Rotate x-axis labels for readability
ax.tick_params(axis='x', labelrotation=45)

# Keep the rotated labels inside the figure area
fig.tight_layout()

# Show the plot
plt.show()