# Explore correlations

- [Correlations among `year`, `loan_grade`, `purpose`](#Correlations-among-year,-loan_grade,-purpose)

    - Plots presented in this section show distinct differences in the distribution of
    loan grades for different years and loan purposes, but no dramatic patterns.

    - The distribution of loan purposes varies only slightly as a function of year.
    
- [Correlations involving `loan_amnt`](#Correlations-involving-loan_amnt)

    - The largest loans, in the range of \\$36k to \\$40k, have much better loan grades
    than smaller loans.
    
    - Loans in the range of \\$5k to \\$11k have somewhat better grades than other loans
    less than \\$36k.

    - The year 2018 shows a distinct change in the distribution of loan amounts.  Loan
    amounts that are multiples of \\$5k become more frequent, as do loans of \\$36k and
    above.

This notebook presents initial exploration of correlations involving selected features
in the loan data.

Later notebooks present in-depth analysis of particular features, e.g., `int_rate`,
including correlations involving those features.

In [None]:
import string

import numpy as np
import pandas as pd
import plotly.express as px
from IPython.display import display

import notebook_tools.database as db
from notebook_tools.derived_features import get_year
from notebook_tools.feature_exploration import (
    get_group_sizes,
    get_value_counts,
    style_value_counts,
)

In [None]:
# Fetch the loan data and the loan metadata through the project's
# notebook_tools.database helpers.
# NOTE(review): loan_metadata is not referenced in the cells visible in this notebook
# section — confirm whether it is still needed.
loan_data = db.get_loan_data()
loan_metadata = db.get_loan_metadata()

In [None]:
# Add a "year" column computed by get_year from the "issue_d" column.
# NOTE(review): the dtype of the resulting column (int vs. str) is not visible here —
# confirm against get_year, because a later cell maps years using string keys.
loan_data["year"] = get_year(loan_data, "issue_d")

## Correlations among `year`, `loan_grade`, `purpose`

### Distributions of individual features

In [None]:
# Tabulate the "year" column with the project helper and render the styled result.
year_counts = get_value_counts(loan_data["year"])
display(style_value_counts(year_counts))

In [None]:
# Count the loans in each (grade, sub_grade) combination.  As the last expression in
# the cell, the resulting one-column "count" frame is what the notebook displays.
loan_data.groupby(by=["grade", "sub_grade"]).size().to_frame(name="count")

In [None]:
# Tabulate the "purpose" column with the project helper and render the styled result.
purpose_counts = get_value_counts(loan_data["purpose"])
display(style_value_counts(purpose_counts))

In [None]:
# Line plot (with markers) of the number of loans for each value of "issue_d".
to_plot = get_group_sizes(loan_data, group_by="issue_d")
fig = px.line(
    data_frame=to_plot,
    x="issue_d",
    y="count",
    title="Number of loans by date",
    labels=dict(issue_d="Loan date", count="Number of loans"),
    # Hover shows the count with 3 significant digits and an SI prefix.
    hover_data=dict(count=":.3s"),
    markers=True,
)
fig.show()

In [None]:
# Bar chart of the number of loans in each year.
to_plot = get_group_sizes(loan_data, group_by="year")
fig = px.bar(
    data_frame=to_plot,
    x="year",
    y="count",
    title="Number of loans by year",
    labels=dict(year="Year", count="Number of loans"),
    # Hover shows the count with 3 significant digits and an SI prefix.
    hover_data=dict(count=":.3s"),
)
fig.show()

In [None]:
# Bar chart of the number of loans in each loan grade.
to_plot = get_group_sizes(loan_data, group_by="grade")
fig = px.bar(
    data_frame=to_plot,
    x="grade",
    y="count",
    title="Number of loans by loan grade",
    labels=dict(grade="Loan grade", count="Number of loans"),
    # Hover shows the count with 3 significant digits and an SI prefix.
    hover_data=dict(count=":.3s"),
)
fig.show()

In [None]:
# Bar chart of loan counts per grade, with each bar split by sub-grade.
to_plot = get_group_sizes(loan_data, group_by=["grade", "sub_grade"])
# Keep only the second character of each sub-grade string so that the colour legend
# is shared across grades.
# NOTE(review): presumably sub-grades look like "A1"; confirm against the data.
to_plot["sub_grade"] = to_plot["sub_grade"].str.get(1)
fig = px.bar(
    data_frame=to_plot,
    x="grade",
    y="count",
    color="sub_grade",
    title="Number of loans by loan grade and sub-grade",
    labels=dict(
        grade="Loan grade",
        count="Number of loans",
        sub_grade="Sub-grade",
    ),
    hover_data=dict(count=":.3s"),
)
fig.show()

In [None]:
# Bar chart of loan counts per purpose, most frequent purpose first.
to_plot = get_group_sizes(loan_data, group_by="purpose")
to_plot = to_plot.sort_values(by="count", ascending=False)

# Remember the purposes in descending-count order; later plots pass this list to
# category_orders so that every figure uses the same ordering.
ordered_loan_purposes = list(to_plot["purpose"])

fig = px.bar(
    data_frame=to_plot,
    x="purpose",
    y="count",
    title="Number of loans by purpose",
    labels=dict(purpose="Loan purpose", count="Number of loans"),
    hover_data=dict(count=":.3s"),
)
fig.show()

### Correlations between features

In [None]:
def clean_up_hovertemplate(trace):
    """Make a trace's hover text report the bar value as a percentage.

    The ``%{y}`` placeholder is given the ``.3p`` format and the label text
    "sum of Number of loans (normalized as fraction)" is replaced by "Percentage".
    The trace is modified in place.
    """
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


# Stacked bars, normalized to a fraction within each year, showing how the loans of
# that year are split across grades A-G.
to_plot = get_group_sizes(loan_data, group_by=["year", "grade"])
fig = px.histogram(
    data_frame=to_plot,
    x="year",
    y="count",
    color="grade",
    barnorm="fraction",
    category_orders={"grade": list("ABCDEFG")},
    title="Distribution of loan grade by year",
    labels=dict(year="Year", count="Number of loans", grade="Grade"),
    height=400,
)
fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=".2p", title_text="Percentage of loans")
fig.show()

For a simpler view of correlations involving the loan grade, map the grades to numbers
and calculate the mean grade for different groups.

In [None]:
# Numeric encoding of the loan grades: A maps to 7, then one less per letter down to
# G, which maps to 1.
loan_grade_mapper = {grade: 7 - offset for offset, grade in enumerate("ABCDEFG")}

In [None]:
# Mean numeric grade per year: map the letter grades to numbers with
# loan_grade_mapper, then average within each year.
to_plot = (
    loan_data[["year", "grade"]]
    .assign(grade=lambda frame: frame["grade"].map(loan_grade_mapper))
    .groupby("year")
    .mean()
    .reset_index()
)
fig = px.bar(
    data_frame=to_plot,
    x="year",
    y="grade",
    title="Mean numeric grades by year (A=7, B=6, C=5, ..., G=1)",
    labels=dict(year="Year", grade="Mean numeric grade"),
    # Hover shows the mean grade with two decimal places.
    hover_data=dict(grade=":.2f"),
)
fig.show()

In [None]:
def clean_up_hovertemplate(trace):
    """Make a trace's hover text report the bar value as a percentage.

    The ``%{y}`` placeholder is given the ``.3p`` format and the label text
    "sum of Number of loans (normalized as fraction)" is replaced by "Percentage".
    The trace is modified in place.
    """
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


# Stacked bars, normalized to a fraction within each year, showing how the loans of
# that year are split across purposes (ordered by overall frequency).
to_plot = get_group_sizes(loan_data, group_by=["year", "purpose"])
fig = px.histogram(
    data_frame=to_plot,
    x="year",
    y="count",
    color="purpose",
    barnorm="fraction",
    category_orders=dict(purpose=ordered_loan_purposes),
    title="Distribution of loan purpose by year",
    labels=dict(year="Year", count="Number of loans", purpose="Purpose"),
    height=500,
)
fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=".2p", title_text="Percentage of loans")
fig.show()

In [None]:
def clean_up_hovertemplate(trace):
    """Make a trace's hover text report the bar value as a percentage.

    The ``%{y}`` placeholder is given the ``.3p`` format and the label text
    "sum of Number of loans (normalized as fraction)" is replaced by "Percentage".
    The trace is modified in place.
    """
    template = trace.hovertemplate.replace("%{y}", "%{y:.3p}")
    trace.hovertemplate = template.replace(
        "sum of Number of loans (normalized as fraction)", "Percentage"
    )


# Stacked bars, normalized to a fraction within each purpose, showing how the loans
# for that purpose are split across grades A-G.
to_plot = get_group_sizes(loan_data, group_by=["purpose", "grade"])
fig = px.histogram(
    data_frame=to_plot,
    x="purpose",
    y="count",
    color="grade",
    barnorm="fraction",
    category_orders=dict(
        purpose=ordered_loan_purposes,
        grade=list("ABCDEFG"),
    ),
    title="Distribution of loan grade by loan purpose",
    labels=dict(purpose="Loan purpose", count="Number of loans", grade="Grade"),
    height=400,
)
fig.for_each_trace(clean_up_hovertemplate)
fig.update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
fig.show()

In [None]:
# Mean numeric grade per loan purpose: map the letter grades to numbers with
# loan_grade_mapper, then average within each purpose.
to_plot = (
    loan_data[["purpose", "grade"]]
    .assign(grade=lambda frame: frame["grade"].map(loan_grade_mapper))
    .groupby("purpose")
    .mean()
    .reset_index()
)
fig = px.bar(
    data_frame=to_plot,
    x="purpose",
    y="grade",
    title="Mean numeric grades by loan purpose (A=7, B=6, C=5, ..., G=1)",
    labels=dict(purpose="Loan purpose", grade="Mean numeric grade"),
    # Use the same purpose ordering as the other purpose plots.
    category_orders=dict(purpose=ordered_loan_purposes),
    hover_data=dict(grade=":.2f"),
)
fig.show()

Conclusions:

- The distribution of loan grades shows distinct variation for different years and loan
purposes, but no dramatic patterns.
- The distribution of loan purposes varies only slightly as a function of year.

## Correlations involving `loan_amnt`

For large data sets, the binning of histogram data should be done outside of plotly. The
reason is that plotly does binning in JavaScript, and so unbinned data passed to
plotly's histogram function becomes part of the JavaScript code stored with the
notebook.  For the current data set, this can increase the notebook size on disk by a
factor of more than 100.

In [None]:
# Report the smallest and largest values of "loan_amnt".
# The results are stored under descriptive names rather than `min` / `max` so that
# the builtins min() and max() stay usable for the rest of the notebook session.
min_loan_amnt = loan_data["loan_amnt"].min()
max_loan_amnt = loan_data["loan_amnt"].max()
print(
    'The minimum and maximum values of "loan_amnt" '
    f"are ${min_loan_amnt:,} and ${max_loan_amnt:,}, respectively."
)

In [None]:
# Bin edges, bin labels and axis ticks shared by all of the loan-amount plots.
# The labels spell the dollar sign as &#36; so that a literal $ does not trigger
# math formatting.
# 41 edges from 1,000 to 41,000 in steps of 1,000, i.e. 40 bins.
loan_amnt_bins = np.linspace(1e3, 41e3, num=41)
# One "[lower - upper)" label per bin, in thousands of dollars.
loan_amnt_bin_labels = [
    "[&#36;{}k - &#36;{}k)".format(lower, lower + 1) for lower in range(1, 41)
]
# Put a tick on every fifth bin (the bins starting at 5k, 10k, ..., 40k) ...
loan_amnt_tick_vals = loan_amnt_bin_labels[4::5]
# ... and show only the bin's lower edge as the tick text.
loan_amnt_tick_text = ["&#36;{}k".format(amount) for amount in range(5, 45, 5)]

In [None]:
# Bin "loan_amnt" into the labelled $1k-wide intervals defined by loan_amnt_bins.
# right=False makes each interval closed on the left and open on the right, which
# matches the "[lower - upper)" wording of the labels.
# NOTE(review): pd.cut yields NaN for values outside [1000, 41000); the minimum and
# maximum printed in the earlier cell should be checked against that range.
loan_data["loan_amnt_bin"] = pd.cut(
    loan_data["loan_amnt"],
    bins=loan_amnt_bins,
    labels=loan_amnt_bin_labels,
    right=False,
)

In [None]:
# Bar chart of the number of loans in each loan-amount bin, drawn without gaps so
# that it reads as a histogram.
to_plot = get_group_sizes(loan_data, group_by="loan_amnt_bin")
fig = px.bar(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    title="Distribution of loan amount",
    labels=dict(count="Number of loans", loan_amnt_bin="Loan amount"),
)
# Custom hover text: the bin label is read from customdata and the count is shown
# with 3 significant digits and an SI prefix.
# NOTE(review): customdata is the full label list, so this assumes to_plot has one
# row per bin in label order — confirm against get_group_sizes.
hovertemplate = "Loan amount=%{customdata}<br>Number of loans=%{y:.3s}<extra></extra>"
fig.update_traces(hovertemplate=hovertemplate, customdata=loan_amnt_bin_labels)
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_layout(bargap=0)
fig.show()

In [None]:
def clean_up_hovertemplate(trace):
    """Make a trace's hover text show the bin label and a percentage value.

    Sets the trace's customdata to the loan-amount bin labels, then rewrites the
    hover template so that ``%{x}`` reads from customdata, ``%{y}`` uses the ``.3p``
    format, and the label text "sum of Number of loans (normalized as fraction)"
    becomes "Percentage".  The trace is modified in place.

    NOTE(review): assumes each trace has one point per bin, in label order — confirm.
    """
    trace.customdata = loan_amnt_bin_labels
    template = trace.hovertemplate
    for old, new in (
        ("%{x}", "%{customdata}"),
        ("%{y}", "%{y:.3p}"),
        ("sum of Number of loans (normalized as fraction)", "Percentage"),
    ):
        template = template.replace(old, new)
    trace.hovertemplate = template


# Stacked bars, normalized to a fraction within each loan-amount bin, showing how
# the loans in that bin are split across grades A-G.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "grade"])
fig = px.histogram(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    color="grade",
    barnorm="fraction",
    category_orders={"grade": list("ABCDEFG")},
    title="Distribution of loan grade by loan amount",
    labels=dict(
        loan_amnt_bin="Loan amount",
        count="Number of loans",
        grade="Grade",
    ),
)
fig.for_each_trace(clean_up_hovertemplate)
fig.update_layout(bargap=0)
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
fig.show()

In [None]:
# Mean numeric grade per loan-amount bin: map the letter grades to numbers with
# loan_grade_mapper, then average within each bin.  observed=False is passed to
# groupby for the categorical bin column, as in the original cell.
to_plot = (
    loan_data[["loan_amnt_bin", "grade"]]
    .assign(grade=lambda frame: frame["grade"].map(loan_grade_mapper))
    .groupby("loan_amnt_bin", observed=False)
    .mean()
    .reset_index()
)
fig = px.bar(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="grade",
    title="Mean numeric grades by loan amount (A=7, B=6, C=5, ..., G=1)",
    labels=dict(
        loan_amnt_bin="Loan amount",
        grade="Mean numeric grade",
    ),
)
# Custom hover text: the bin label is read from customdata and the mean is shown
# with two decimal places.
hovertemplate = "Loan amount=%{customdata}<br>Mean numeric grade=%{y:.2f}<extra></extra>"
fig.update_traces(hovertemplate=hovertemplate, customdata=loan_amnt_bin_labels)
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_layout(bargap=0)
fig.show()

In [None]:
def clean_up_hovertemplate(trace):
    """Make a trace's hover text show the bin label and a percentage value.

    Sets the trace's customdata to the loan-amount bin labels, then rewrites the
    hover template so that ``%{x}`` reads from customdata, ``%{y}`` uses the ``.3p``
    format, and the label text "sum of Number of loans (normalized as fraction)"
    becomes "Percentage".  The trace is modified in place.

    NOTE(review): assumes each trace has one point per bin, in label order — confirm.
    """
    trace.customdata = loan_amnt_bin_labels
    template = trace.hovertemplate
    for old, new in (
        ("%{x}", "%{customdata}"),
        ("%{y}", "%{y:.3p}"),
        ("sum of Number of loans (normalized as fraction)", "Percentage"),
    ):
        template = template.replace(old, new)
    trace.hovertemplate = template


# Stacked bars, normalized to a fraction within each loan-amount bin, showing how
# the loans in that bin are split across years.
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "year"])
fig = px.histogram(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    color="year",
    barnorm="fraction",
    title="Distribution of loan year by loan amount",
    labels=dict(
        loan_amnt_bin="Loan amount",
        count="Number of loans",
        year="Year",
    ),
)
fig.for_each_trace(clean_up_hovertemplate)
fig.update_layout(bargap=0)
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
fig.show()

In [None]:
# Encode the years 2012-2018 as the integers 1-7 so that a mean "year" can be
# computed for each loan-amount bin.
# NOTE(review): the keys are strings, so this only matches if the "year" column
# holds strings; if get_year returns integers, every mapped value would be NaN —
# confirm against get_year.
encoded_year_mapper = {
    "2012": 1,
    "2013": 2,
    "2014": 3,
    "2015": 4,
    "2016": 5,
    "2017": 6,
    "2018": 7,
}
to_plot = loan_data[["loan_amnt_bin", "year"]]
to_plot = (
    to_plot.assign(year=to_plot["year"].map(encoded_year_mapper))
    .groupby("loan_amnt_bin", observed=False)
    .mean()
    .reset_index()
)
fig = px.bar(
    to_plot,
    x="loan_amnt_bin",
    y="year",
    labels={
        "loan_amnt_bin": "Loan amount",
        "year": "Mean encoded year",
    },
    # The legend in the title must agree with encoded_year_mapper: 2016 (not 2015)
    # is the year encoded as 5.
    title="Mean encoded year by loan amount (2018=7, 2017=6, 2016=5, ..., 2012=1)",
)
# Custom hover text: the bin label is read from customdata and the mean is shown
# with two decimal places.
hovertemplate = "Loan amount=%{customdata}<br>Mean encoded year=%{y:.2f}<extra></extra>"
fig.update_traces(customdata=loan_amnt_bin_labels, hovertemplate=hovertemplate)
fig.update_layout(bargap=0)
fig.update_xaxes(
    tickmode="array", tickvals=loan_amnt_tick_vals, ticktext=loan_amnt_tick_text
)
fig.show()

In [None]:
def clean_up_hovertemplate(trace):
    """Make a trace's hover text show the bin label and a percentage value.

    Sets the trace's customdata to the loan-amount bin labels, then rewrites the
    hover template so that ``%{x}`` reads from customdata, ``%{y}`` uses the ``.3p``
    format, and the label text "sum of Number of loans (normalized as fraction)"
    becomes "Percentage".  The trace is modified in place.

    NOTE(review): assumes each trace has one point per bin, in label order — confirm.
    """
    trace.customdata = loan_amnt_bin_labels
    template = trace.hovertemplate
    for old, new in (
        ("%{x}", "%{customdata}"),
        ("%{y}", "%{y:.3p}"),
        ("sum of Number of loans (normalized as fraction)", "Percentage"),
    ):
        template = template.replace(old, new)
    trace.hovertemplate = template


# Stacked bars, normalized to a fraction within each loan-amount bin, showing how
# the loans in that bin are split across purposes (ordered by overall frequency).
to_plot = get_group_sizes(loan_data, group_by=["loan_amnt_bin", "purpose"])
fig = px.histogram(
    data_frame=to_plot,
    x="loan_amnt_bin",
    y="count",
    color="purpose",
    barnorm="fraction",
    category_orders=dict(purpose=ordered_loan_purposes),
    title="Distribution of loan purpose by loan amount",
    labels=dict(
        loan_amnt_bin="Loan amount",
        count="Number of loans",
        purpose="Purpose",
    ),
    height=500,
)
fig.for_each_trace(clean_up_hovertemplate)
fig.update_layout(bargap=0)
fig.update_xaxes(
    tickmode="array", ticktext=loan_amnt_tick_text, tickvals=loan_amnt_tick_vals
)
fig.update_yaxes(tickformat=",.2p", title_text="Percentage of loans")
fig.show()

Conclusions:

- The largest loans, in the range of \\$36k to \\$40k, have much better loan grades than
smaller loans.
- Loans in the range of \\$5k to \\$11k have somewhat better grades than other loans
less than \\$36k.
- The year 2018 shows a distinct change in the distribution of loan amounts.  Loan
amounts that are multiples of \\$5k become more frequent, as do loans of \\$36k and
above.
