# Summarize the cleaned data

- [Tables of descriptive statistics](#Tables-of-descriptive-statistics)

    The feature summaries displayed in earlier notebooks are no longer valid due to the
    data-cleaning performed in those notebooks. This notebook displays feature summaries
    for the cleaned data.

- [Plots of distributions](#Plots-of-distributions)

    As a complement to the summary tables, the notebook also plots the distribution of
    values for each feature (excluding categorical features that have a large number of
    distinct values).

In [None]:
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from IPython.display import SVG, display
from matplotlib.ticker import FuncFormatter, MaxNLocator, PercentFormatter
from pandas import Categorical, Series, to_datetime
from pandas.api.types import is_integer_dtype, is_numeric_dtype, is_string_dtype

from notebook_tools.data_cleaning import (
    convert_acc_loan_data,
    filter_acc_loan_data,
    load_acc_loan_data,
    load_acc_loan_feat_desc,
)
from notebook_tools.feature_exploration import (
    get_group_sizes,
    style_loan_summary,
    summarize_acc_loans,
)
from notebook_tools.plots import format_counts, get_wrapped_caption

In [None]:
# Build the data set summarized in this notebook: load the loan data, then pass it
# through the project's conversion and filtering steps, in that order.
loan_data = (
    load_acc_loan_data()
    .pipe(convert_acc_loan_data)
    .pipe(filter_acc_loan_data)
)

In [None]:
# Feature descriptions; passed below to the summary tables and to the plot captions.
feat_desc = load_acc_loan_feat_desc()

## Tables of descriptive statistics

In [None]:
# Report the number of records, then display one styled summary table per group of
# features: numeric, string, and boolean.
print(f"\n\nThe number of records is {len(loan_data.index):,d}.")
for dtype in [np.number, "string", "boolean"]:
    # One row per feature of this dtype (the row count is reported as the feature count).
    summary = summarize_acc_loans(loan_data, dtype, feat_desc)
    print(f"\n\nThe number of features of type {dtype} is {len(summary.index)}.\n\n")
    display(style_loan_summary(summary))

## Plots of distributions

When working in R markdown, I have often used a loop to generate plots showing the
distributions of all features. The plots function as a complement to summary tables such
as those shown in the previous section of the notebook.

When I tried to use this workflow while plotting with seaborn in Jupyter, I found that
memory leakage caused the kernel to die, even when I took care to ensure that all
figures were closed at the end of each loop iteration. For a few years there have been
posts about memory leakage associated with using matplotlib to generate figures in a
loop within Jupyter, and it appears that the only reliable solution is to use a
noninteractive backend for matplotlib.

To apply this workaround, each iteration of a loop that generates plots does the
following:

- Use seaborn to generate a figure.
- Write the figure to disk in SVG format.
- Load the SVG figure from disk as a string.
- Pass the SVG string to functions from `IPython.display` to add the figure to the
notebook.

These steps are encapsulated in the function `display_svg_plot` defined below.

In [None]:
# Select matplotlib's noninteractive SVG backend (the workaround described above for
# the memory leakage seen when generating figures in a loop), then apply seaborn's
# default theme to all subsequent plots.
matplotlib.use("svg")
sns.set_theme()

In [None]:
def display_svg_plot(plot):
    """Render a plot into the notebook by round-tripping it through an SVG file.

    The plot is written to ``temp.svg`` in the current working directory, read back
    as a string, and handed to ``IPython.display``.  The temporary file is removed
    and all matplotlib figures are closed afterwards, even if saving or displaying
    the plot raises.

    Parameters
    ----------
    plot
        Any object exposing ``savefig(path, format=...)``; the callers in this
        notebook pass the grids returned by ``sns.displot`` and ``sns.relplot``.
    """
    temp_svg_path = Path.cwd() / "temp.svg"
    try:
        plot.savefig(temp_svg_path, format="svg")
        # matplotlib writes SVG as UTF-8, so decode explicitly rather than relying on
        # the platform's locale encoding.
        svg_string = temp_svg_path.read_text(encoding="utf-8")
        display(SVG(data=svg_string))
    finally:
        # Always release the figures and the temporary file; leaking either on an
        # error would defeat the purpose of this workaround.
        plt.close("all")
        temp_svg_path.unlink(missing_ok=True)

Certain columns contain categorical data with a large number of possible values. Exclude
these columns from the loop that generates plots of distribution.

In [None]:
# Columns that get no distribution plot at all.
do_not_plot = ["id", "emp_title", "zip_code"]

Define which type of plot to generate for each feature.

In [None]:
# Date-valued features: plotted below as line plots of record counts per date.
line_plots = [
    "issue_d",
    "earliest_cr_line",
    "last_pymnt_d",
    "next_pymnt_d",
    "last_credit_pull_d",
    "sec_app_earliest_cr_line",
    "hardship_start_date",
    "hardship_end_date",
    "payment_plan_start_date",
    "debt_settlement_flag_date",
    "settlement_date",
]

# Boolean features: their values are mapped to the strings "True"/"False" before
# plotting.
boolean_plots = ["pymnt_plan", "hardship_flag", "debt_settlement_flag"]

# Numeric features plotted as histograms with a logarithmic x-axis.
logscale_histograms = [
    "installment",
    "annual_inc",
    "dti",
    "revol_bal",
    "revol_util",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_int",
    "last_pymnt_amnt",
    "annual_inc_joint",
    "tot_cur_bal",
    "mths_since_rcnt_il",
    "total_bal_il",
    "il_util",
    "max_bal_bc",
    "total_rev_hi_lim",
    "avg_cur_bal",
    "bc_open_to_buy",
    "bc_util",
    "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op",
    "mo_sin_rcnt_rev_tl_op",
    "mo_sin_rcnt_tl",
    "mths_since_recent_bc",
    "tot_hi_cred_lim",
    "total_bal_ex_mort",
    "total_bc_limit",
    "total_il_high_credit_limit",
    "revol_bal_joint",
    "settlement_percentage",
]

In [None]:
# Every column that has not been excluded or assigned another plot type falls back to
# a linear-scale histogram.  Column order follows the order of columns in loan_data.
_assigned_elsewhere = {*do_not_plot, *line_plots, *boolean_plots, *logscale_histograms}
linear_histograms = [
    column for column in loan_data if column not in _assigned_elsewhere
]

Define variables used in plot customization.

In [None]:
# Histogram bin widths for specific features; passed to the plotting function as the
# "binwidth" keyword when plotting linear-scale histograms.
custom_binwidth = {
    "fico_range_low": 5,
    "fico_range_high": 5,
    "last_fico_range_low": 5,
    "last_fico_range_high": 5,
    "all_util": 2,
    "sec_app_fico_range_low": 5,
    "sec_app_fico_range_high": 5,
    "pct_tl_nvr_dlq": 1,
}

# Features whose x-axis tick labels are formatted as percentages.
percent_formatting = [
    "pct_tl_nvr_dlq",
    "percent_bc_gt_75",
    "settlement_percentage",
]

# Features whose x-axis tick labels are rotated when plotted.
angled_labels = [
    "purpose",
    "hardship_reason",
]

# Explicit category order along the x-axis for string features; string features not
# listed here are ordered by sorting their distinct values.
custom_ordered_categories = {
    "emp_length": [
        "< 1 year",
        "1 year",
        "2 years",
        "3 years",
        "4 years",
        "5 years",
        "6 years",
        "7 years",
        "8 years",
        "9 years",
        "10+ years",
    ]
}

In [None]:
def plot_distribution(data, column_name, title, caption, **kwargs):
    """Plot the distribution of one column with seaborn and show it in the notebook.

    Parameters
    ----------
    data
        Table indexable by column name (``data[column_name]``).
    column_name
        Name of the column to plot.  It also selects the per-feature tweaks
        configured in ``percent_formatting`` and ``angled_labels``.
    title
        Title placed on the plot.
    caption
        Text printed below the displayed plot.
    **kwargs
        Forwarded unchanged to ``sns.displot``.  When ``log_scale`` is among them,
        the integer tick locator and comma formatting of the x-axis are skipped.
    """
    column_data = data[column_name]

    # If the values are strings, then set the order in which labels will appear along
    # the x-axis.
    if is_string_dtype(column_data.dtype):
        column_data = _convert_to_ordered_categorical(column_data, column_name)

    plot = sns.displot(x=column_data, **kwargs).set(title=title)

    # Tweak tick locations and labels.
    ax = plot.facet_axis(0, 0)
    ax.yaxis.set_major_formatter(FuncFormatter(format_counts))
    # Decide on the column actually being plotted (the function's own argument), not
    # on whatever global loop variable the calling cell happens to define.
    if column_name in percent_formatting:
        ax.xaxis.set_major_formatter(PercentFormatter(decimals=0))
    elif is_numeric_dtype(column_data.dtype) and "log_scale" not in kwargs:
        # Force tick marks to be located on integers.  The call below is equivalent to
        # using matplotlib.ticker.AutoLocator with the constraint that tick marks are
        # integers.
        ax.xaxis.set_major_locator(
            MaxNLocator(nbins="auto", steps=[1, 2, 2.5, 5, 10], integer=True)
        )
        # Add comma separators in formatting tick labels for the x-axis.
        ax.xaxis.set_major_formatter(
            FuncFormatter(
                lambda tick_value, _tick_position: format(int(tick_value), ",")
            )
        )
    if column_name in angled_labels:
        # The steps to rotate tick labels for the x-axis using matplotlib's
        # object-oriented approach are clumsy, so use pyplot instead.
        plt.xticks(rotation=-45, ha="left")

    display_svg_plot(plot)
    print(f"\n{caption}\n\n\n", flush=True)


def _convert_to_ordered_categorical(col_data, col_name):
    """Return ``col_data`` as an ordered ``Categorical`` carrying the same name.

    The category order comes from ``custom_ordered_categories`` when ``col_name`` has
    an entry there; otherwise the distinct non-missing values are used in sorted
    order.
    """
    category_order = custom_ordered_categories.get(col_name)
    if category_order is None:
        distinct_values = Series(col_data.unique())
        category_order = distinct_values.dropna().sort_values()

    as_categorical = Categorical(col_data, categories=category_order, ordered=True)
    # Copy the name across so that it stays attached to the converted data.
    as_categorical.name = col_data.name
    return as_categorical

In [None]:
def plot_date_distribution(data, column_name, title, caption, **kwargs):
    """Plot the number of records per date as a line plot and show it in the notebook.

    Parameters
    ----------
    data
        Table passed to ``get_group_sizes`` and grouped by ``column_name``.
    column_name
        Name of the date column; its values are parsed as ISO 8601 dates.
    title
        Title placed on the plot.
    caption
        Text printed below the displayed plot.
    **kwargs
        Forwarded to ``sns.relplot``.  ``kind`` is always set to ``"line"``, and
        ``marker`` is set to ``"o"`` when there are at most 150 rows to plot.
    """
    group_sizes = get_group_sizes(data, group_by=column_name)
    counts_by_date = group_sizes.rename(columns={"count": "Count"})
    counts_by_date[column_name] = to_datetime(
        counts_by_date[column_name], format="ISO8601"
    )

    relplot_options = {**kwargs, "kind": "line"}
    # Individual markers are only drawn when there are few enough points.
    if len(counts_by_date.index) <= 150:
        relplot_options["marker"] = "o"

    plot = sns.relplot(data=counts_by_date, x=column_name, y="Count", **relplot_options)
    plot.set(title=title)

    # Tweak formatting of tick labels for the y-axis.
    y_axis = plot.facet_axis(0, 0).yaxis
    y_axis.set_major_formatter(FuncFormatter(format_counts))

    display_svg_plot(plot)
    print(f"\n{caption}\n\n\n", flush=True)

In [None]:
# Plot a linear-scale histogram for every feature that was not assigned another plot
# type.
for column in linear_histograms:
    caption = get_wrapped_caption(column, feat_desc, width=200)
    kwargs = {"aspect": 3.3}
    if column in custom_binwidth:
        # A bin width was configured for this feature.
        kwargs["binwidth"] = custom_binwidth[column]
    elif is_integer_dtype(loan_data[column].dtype):
        # Integer-valued feature without a configured bin width: use seaborn's
        # discrete binning.
        kwargs["discrete"] = True
    plot_distribution(loan_data, column, f"Distribution of {column}", caption, **kwargs)

In [None]:
# Plot a 50-bin histogram with a logarithmic x-axis for each feature listed in
# logscale_histograms.
for column in logscale_histograms:
    caption = get_wrapped_caption(column, feat_desc, width=200)
    kwargs = {"aspect": 3.3, "log_scale": True, "bins": 50}
    plot_distribution(loan_data, column, f"Distribution of {column}", caption, **kwargs)

In [None]:
# Plot each boolean feature after mapping its values to the strings "True" and "False".
for column in boolean_plots:
    caption = get_wrapped_caption(column, feat_desc, width=200)
    kwargs = {"aspect": 3.3}
    mapper = {True: "True", False: "False"}
    plot_distribution(
        # Pass a single-column frame so that plot_distribution can look the column up
        # by name.
        loan_data[column].map(mapper).to_frame(),
        column,
        f"Distribution of {column}",
        caption,
        **kwargs,
    )

In [None]:
# Plot the number of records per date for each date-valued feature.
for column in line_plots:
    caption = get_wrapped_caption(column, feat_desc, width=200)
    kwargs = {"aspect": 3.3}
    plot_date_distribution(
        loan_data, column, f"Distribution of {column}", caption, **kwargs
    )