In [1]:
import matplotlib.pyplot as plt_func
import numpy as np
import pandas as pd

In [2]:
def removeColumnsFromList(df, columnsToRemove):
    """
    Return the column names of `df`, excluding the names listed in
    `columnsToRemove`.

    Names in `columnsToRemove` that are not columns of `df` are silently
    ignored. The DataFrame itself is not modified.

    Args:
        df: pandas.core.frame.DataFrame
            The DataFrame used to produce the list of column names.

        columnsToRemove: iterable
            An iterable object that has the names as elements that
            will be excluded from the returned list.
    Returns:
        list: The remaining column names, in their original order.
    """
    columns = df.columns.tolist()
    for column in columnsToRemove:
        # Test membership explicitly rather than catching ValueError and
        # matching on the interpreter's error-message text.
        if column in columns:
            # list.remove drops one occurrence (the first) per requested name.
            columns.remove(column)

    return columns

<p>We could easily reproduce these types of plots <a href="https://www.statsmodels.org/dev/examples/notebooks/generated/regression_plots.html">very closely</a>, with <a href="http://www.statsmodels.org/dev/examples/notebooks/generated/regression_diagnostics.html">additional regression diagnostics</a>, using the <code>statsmodels</code> library. However, I would like to have more control, so I decided to plot most of them manually using <code>matplotlib</code>.</p>

In [3]:
def createResidualPlots(df_X, df_Y, fitted_model, list_of_indices=[], width=7, height=3):
    """
    Draw various residual plots for the fitted model on a new figure.

    The figure has one column of subplots:

    1. the residuals, and
    2. the square root of the absolute standardized residuals,
       both plotted against the predictor when `df_X` has exactly one
       non-intercept column, and against the fitted values otherwise;
    3. a QQ plot of the sorted standardized residuals against the
       quantiles of the normal distribution, with the line y = x drawn
       as a 45-degree reference (see also
       https://seankross.com/2016/02/29/A-Q-Q-Plot-Dissection-Kit.html);
    4. the standardized residuals against the leverage. This subplot is
       omitted when X^T X is singular, because the leverage cannot be
       computed then.

    "Standardized" here means the residuals as rescaled by
    `sklearn.preprocessing.scale`. The function does not return
    anything; the figure is created through `plt_func.subplots`.

    Args:
        df_X: pandas.core.frame.DataFrame
            The DataFrame should hold the data of independent variables
            (including a column for the 'Intercept' set equal to one).
            Each row in the DataFrame represents an individual sample
            point, with the successive columns corresponding to the
            independent variables and their specific values for that
            sample point.
        df_Y: pandas.core.frame.DataFrame or pandas.core.series.Series
            This should be a pandas Series or a DataFrame of one column,
            which holds the data of the dependent variable.
        fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
            This statsmodels class summarizes the fit of a linear regression model
            that has been fitted with df_X and df_Y.
        list_of_indices: list, default list()
            A list that holds positional indices indicating which data
            point(s) should be colored differently to distinguish them
            from the rest of the data. The list is only read, never
            modified.
        width: float, default 7
            The width of each subplot.
        height: float, default 3
            The height of each subplot.
    """
    from matplotlib import colors
    from scipy import stats
    from sklearn import preprocessing

    descriptiveColumns = df_X.columns.tolist()
    if 'Intercept' in descriptiveColumns:
        descriptiveColumns.remove('Intercept')

    assert len(descriptiveColumns) >= 1, f'descriptiveColumns = {descriptiveColumns}'
    approach = 'simple' if len(descriptiveColumns) == 1 else 'multivariable'

    sr_Y_hat = fitted_model.predict(df_X)
    residual = np.squeeze(df_Y.to_numpy()) - sr_Y_hat
    # Force a plain ndarray so that the positional indexing below is safe.
    standardized_residual = np.asarray(preprocessing.scale(residual))

    # The leverage is the diagonal of the hat matrix H = X (X^T X)^-1 X^T.
    # Only the diagonal is computed, which avoids building the N x N matrix.
    X = df_X.to_numpy()
    try:
        XtX_inv = np.linalg.inv(X.transpose() @ X)
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' not in str(err):
            raise

        leverage = None
    else:
        leverage = np.sum((X @ XtX_inv) * X, axis=1)

    numberOfSubplots = 3 if leverage is None else 4
    fig, axes = plt_func.subplots(numberOfSubplots, 1, constrained_layout=True, figsize=(width, height*numberOfSubplots))
    if approach == 'simple':
        descriptive = descriptiveColumns[0]
        X_plot = df_X[descriptive].to_numpy()
    else:
        descriptive = 'fitted values'
        X_plot = sr_Y_hat.to_numpy()

    mask_special_indices = np.zeros(residual.shape[0], dtype=bool)
    mask_special_indices[list_of_indices] = True

    default_colors = plt_func.rcParams['axes.prop_cycle'].by_key()['color']
    cmap = colors.ListedColormap(default_colors[:2])
    # Pin the color scale to [0, 1]: without it, a mask that is entirely
    # True is normalized to 0 and drawn in the "ordinary" color.
    color_kwargs = dict(cmap=cmap, vmin=0, vmax=1)

    _ = axes[0].scatter(x=X_plot, y=residual, c=mask_special_indices, **color_kwargs)
    _ = axes[0].set_xlabel(descriptive)
    _ = axes[0].set_ylabel('residuals')
    _ = axes[0].set_title('residual plot for the linear regression')

    _ = axes[1].scatter(x=X_plot, y=np.absolute(standardized_residual)**0.5, c=mask_special_indices, **color_kwargs)
    _ = axes[1].set_xlabel(descriptive)
    _ = axes[1].set_ylabel(r'$\sqrt{\left|\mathrm{standardized \,\,\, residuals}\right|}$')
    _ = axes[1].set_title(r'$\sqrt{\left|\mathrm{standardized \,\,\, residuals}\right|}$ for the linear regression')

    # QQ plot: the k-th smallest standardized residual is paired with the
    # normal quantile at probability k / (N + 1), k = 1..N.
    number_of_points = standardized_residual.shape[0]
    sort_order = np.argsort(standardized_residual)
    quantiles_data = standardized_residual[sort_order]
    probabilities = np.arange(1, number_of_points + 1) / (number_of_points + 1)
    quantiles_theoretical = stats.norm.ppf(probabilities)
    # The mask must be reordered with the residuals, otherwise the
    # highlight color lands on unrelated points.
    _ = axes[2].scatter(x=quantiles_theoretical, y=quantiles_data, c=mask_special_indices[sort_order], **color_kwargs)
    x_min, x_max = axes[2].get_xlim()
    y_min, y_max = axes[2].get_ylim()
    # A true y = x reference: both coordinates share the same end points.
    line_min, line_max = min(x_min, y_min), max(x_max, y_max)
    axes[2].plot((line_min, line_max), (line_min, line_max), color='black', label='45-degree line')
    _ = axes[2].set_xlabel('normal distribution quantiles')
    _ = axes[2].set_ylabel('standardized residuals quantiles')
    _ = axes[2].set_title('normal qq plot')
    _ = axes[2].legend()

    if leverage is not None:
        _ = axes[3].scatter(x=leverage, y=standardized_residual, c=mask_special_indices, **color_kwargs)
        _ = axes[3].set_xlabel('leverage')
        _ = axes[3].set_ylabel('standardized residuals')
        _ = axes[3].set_title('standardized residuals vs leverage')

In [4]:
def createSimpleLinearRegressionPlot(df_X, df_Y, fitted_model, alpha=0.05, width=8, height=3):
    """
    Draw a scatter plot of the response against the predictor of a simple
    linear regression on a new figure. The least squares regression line
    is shown together with its confidence and prediction intervals of
    level 1-alpha.

    The line and intervals are evaluated on an evenly spaced grid between
    the minimum and maximum of the predictor, with as many points as
    `df_X` has rows. The function does not return anything.

    Args:
        df_X: pandas.core.frame.DataFrame
            The DataFrame should hold the data of the independent variable,
            optionally with an 'Intercept' column. Each row in the DataFrame
            represents an individual sample point. The first non-intercept
            column is used as the predictor.
        df_Y: pandas.core.frame.DataFrame
            This should be a DataFrame of one column,
            which holds the data of the dependent variable.
        fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
            This statsmodels class summarizes the fit of a linear regression model
            that has been fitted with df_X and df_Y.
        alpha: float
            The prediction and confidence intervals that are shown have a
            level of 1-alpha (e.g., alpha=0.05 corresponds to 95% confidence).
        width: float, default 8
            The width of the plot.
        height: float, default 3
            The height of the plot.
    """
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    assert df_Y.shape[1] == 1
    dependent = df_Y.columns[0]
    descriptiveColumns = df_X.columns.tolist()
    contains_intercept = 'Intercept' in descriptiveColumns
    if contains_intercept:
        descriptiveColumns.remove('Intercept')

    assert len(descriptiveColumns) >= 1, f'descriptiveColumns = {descriptiveColumns}'
    independent = descriptiveColumns[0]

    number_of_points = df_X.shape[0]
    grid = {
        independent: np.linspace(start=df_X[independent].min(), stop=df_X[independent].max(), num=number_of_points),
    }
    if contains_intercept:
        grid['Intercept'] = np.ones(shape=(number_of_points, ), dtype=int)

    # Keep the column order of df_X: the exog passed to the model is matched
    # to the fitted parameters by position, not by name.
    prediction_columns = [column for column in df_X.columns if column in grid]
    df_X_pred = pd.DataFrame(grid, columns=prediction_columns)

    sr_Y_pred = fitted_model.predict(df_X_pred)

    # The interval computations are best effort: this specific AttributeError
    # is tolerated and the corresponding band is simply not drawn.
    sqrt_error = '\'float\' object has no attribute \'sqrt\''

    # get prediction intervals
    try:
        std_err_prediction, lower_pred_int, upper_pred_int = wls_prediction_std(fitted_model, exog=df_X_pred, alpha=alpha)
    except AttributeError as err:
        if sqrt_error not in str(err):
            raise

        std_err_prediction, lower_pred_int, upper_pred_int = None, None, None

    # get confidence intervals
    try:
        result = fitted_model.get_prediction(df_X_pred)
        conf_int = result.conf_int(alpha=alpha)
        lower_conf_int, upper_conf_int = conf_int[:, 0], conf_int[:, 1]
    except AttributeError as err:
        if sqrt_error not in str(err):
            raise

        lower_conf_int, upper_conf_int = None, None

    # Format with ':g' instead of int(): int() truncates, so e.g.
    # alpha=0.025 would be labelled 97% instead of 97.5%.
    level_pct = f"{(1 - alpha) * 100:g}"

    fig, ax = plt_func.subplots(constrained_layout=True, figsize=(width, height))
    _ = ax.scatter(df_X[independent], df_Y, label='training data')
    _ = ax.plot(df_X_pred[independent], sr_Y_pred, '-', color='darkorchid', linewidth=2, label='prediction')
    if lower_conf_int is not None and upper_conf_int is not None:
        _ = ax.fill_between(df_X_pred[independent], lower_conf_int, upper_conf_int, color='#888888', alpha=0.4, label=f"confidence interval ({level_pct}%)")
    if lower_pred_int is not None and upper_pred_int is not None:
        _ = ax.fill_between(df_X_pred[independent], lower_pred_int, upper_pred_int, color='#888888', alpha=0.1, label=f"prediction interval ({level_pct}%)")

    _ = ax.legend()
    _ = ax.set_xlabel(independent)
    _ = ax.set_ylabel(dependent)
    _ = ax.set_title('regression of prediction vs training data')
    _ = ax.grid(True)

In [5]:
def createSimpleLinearRegressionPlotWithTransformation(df_X, df_Y, fitted_model, df_independent, alpha=0.05, width=8, height=3):
    """
    Draw a scatter plot of the response against the pre-transformation
    predictor of a simple linear regression that was fitted on a
    transformed variable. The fitted curve is shown together with its
    confidence and prediction intervals of level 1-alpha.

    The transformation itself is unknown to this function, so the model is
    evaluated at the training rows of `df_X`, and each prediction is drawn
    at the matching value in `df_independent` (rows are matched by
    position and drawn in increasing order of the original variable).
    This keeps every plotted point (x, prediction) consistent for any
    transformation, including nonlinear ones. The function does not
    return anything.

    Args:
        df_X: pandas.core.frame.DataFrame
            The DataFrame should hold the data of the (transformed)
            independent variable as used to fit the model, optionally with
            an 'Intercept' column. Each row in the DataFrame represents an
            individual sample point.
        df_Y: pandas.core.frame.DataFrame
            This should be a DataFrame of one column,
            which holds the data of the dependent variable.
        fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
            This statsmodels class summarizes the fit of a linear regression model
            that has been fitted with df_X and df_Y.
        df_independent: pd.core.frame.DataFrame
            A DataFrame of one column that holds the data of the
            independent variable before the transformation has been
            applied. It must have the same number of rows as df_X, in the
            same order, and is used for the x-axis instead of the
            transformed variable in df_X.
        alpha: float
            The prediction and confidence intervals that are shown have a
            level of 1-alpha (e.g., alpha=0.05 corresponds to 95% confidence).
        width: float, default 8
            The width of the plot.
        height: float, default 3
            The height of the plot.
    """
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    assert df_Y.shape[1] == 1
    assert df_independent.shape[1] == 1
    assert df_independent.shape[0] == df_X.shape[0], (
        f'df_independent has {df_independent.shape[0]} rows and df_X has {df_X.shape[0]} rows'
    )
    dependent = df_Y.columns[0]
    independent = df_independent.columns[0]

    # Sort the training rows by the original variable so that the line and
    # the interval bands are drawn from left to right.
    x_original = df_independent.iloc[:, 0].to_numpy()
    plot_order = np.argsort(x_original, kind='stable')
    x_plot = x_original[plot_order]
    df_X_pred = df_X.iloc[plot_order]

    sr_Y_pred = fitted_model.predict(df_X_pred)

    # The interval computations are best effort: this specific AttributeError
    # is tolerated and the corresponding band is simply not drawn.
    sqrt_error = '\'float\' object has no attribute \'sqrt\''

    # get prediction intervals
    try:
        std_err_prediction, lower_pred_int, upper_pred_int = wls_prediction_std(fitted_model, exog=df_X_pred, alpha=alpha)
    except AttributeError as err:
        if sqrt_error not in str(err):
            raise

        std_err_prediction, lower_pred_int, upper_pred_int = None, None, None

    # get confidence intervals
    try:
        result = fitted_model.get_prediction(df_X_pred)
        conf_int = result.conf_int(alpha=alpha)
        lower_conf_int, upper_conf_int = conf_int[:, 0], conf_int[:, 1]
    except AttributeError as err:
        if sqrt_error not in str(err):
            raise

        lower_conf_int, upper_conf_int = None, None

    # Format with ':g' instead of int(): int() truncates, so e.g.
    # alpha=0.025 would be labelled 97% instead of 97.5%.
    level_pct = f"{(1 - alpha) * 100:g}"

    fig, ax = plt_func.subplots(constrained_layout=True, figsize=(width, height))
    _ = ax.scatter(df_independent, df_Y, label='pre-transformed training data')
    _ = ax.plot(x_plot, sr_Y_pred, '-', color='darkorchid', linewidth=2, label='prediction')
    if lower_conf_int is not None and upper_conf_int is not None:
        _ = ax.fill_between(x_plot, lower_conf_int, upper_conf_int, color='#888888', alpha=0.4, label=f"confidence interval ({level_pct}%)")
    if lower_pred_int is not None and upper_pred_int is not None:
        _ = ax.fill_between(x_plot, lower_pred_int, upper_pred_int, color='#888888', alpha=0.1, label=f"prediction interval ({level_pct}%)")

    _ = ax.legend()
    _ = ax.set_xlabel(independent)
    _ = ax.set_ylabel(dependent)
    _ = ax.set_title('regression of prediction vs pre-transformed training data')
    _ = ax.grid(True)

In [6]:
def createPolynomialLinearRegressionPlot(df_X, df_Y, fitted_model, polynomialMap, independent='independent', alpha=0.05, width=8, height=3):
    """
    Draw a scatter plot of the response against the predictor of a
    polynomial regression with one independent variable on a new figure.
    The fitted curve is shown together with its confidence and prediction
    intervals of level 1-alpha. The function does not return anything.

    When one of the columns has degree 1, that column is used as the
    x-axis. Otherwise the independent variable is reconstructed from the
    first non-intercept column as `column ** (1 / degree)`.
    NOTE: that reconstruction cannot recover the sign of the original
    variable for even degrees and yields NaN for negative values with
    fractional exponents, so it is only meaningful for non-negative data.

    Args:
        df_X: pandas.core.frame.DataFrame
            The DataFrame should hold the data of the exponentiated variable
            used for the polynomial regression, optionally with an
            'Intercept' column. Each row in the DataFrame represents an
            individual sample point.
        df_Y: pandas.core.frame.DataFrame
            This should be a DataFrame of one column,
            which holds the data of the dependent variable.
        fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
            This statsmodels class summarizes the fit of a linear regression model
            that has been fitted with df_X and df_Y.
        polynomialMap: dict
            A dictionary that must contain exactly the column names of the
            DataFrame df_X (excluding 'Intercept') as keys, with their
            associated polynomial degrees as values. For instance, if the
            model being fitted takes the form
                y = a + b*x^2 + c*x^5
            and the columns of df_X that hold x^2 and x^5 are named
            'x_squared' and 'x_fifth', then the argument should be
                polynomialMap = {
                    'x_squared': 2,
                    'x_fifth': 5,
                }
        independent: str, default 'independent'
            A string describing the independent variable of the regression. This
            argument is only used (as the x-axis label) when plotting a
            polynomial regression without a term of degree 1.
        alpha: float
            The prediction and confidence intervals that are shown have a
            level of 1-alpha (e.g., alpha=0.05 corresponds to 95% confidence).
        width: float, default 8
            The width of the plot.
        height: float, default 3
            The height of the plot.
    """
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    assert df_Y.shape[1] == 1
    dependent = df_Y.columns[0]
    descriptiveColumns = df_X.columns.tolist()
    contains_intercept = 'Intercept' in descriptiveColumns
    if contains_intercept:
        descriptiveColumns.remove('Intercept')

    # Every predictor column needs a degree. Checking the keys (not just the
    # lengths) prevents columns and degrees from silently going out of step.
    unmapped_columns = [column for column in descriptiveColumns if column not in polynomialMap]
    assert (
        len(descriptiveColumns) >= 1
        and not unmapped_columns
        and len(descriptiveColumns) == len(polynomialMap)
    ), f'descriptiveColumns = {descriptiveColumns} and polynomialMap = {polynomialMap}'
    polynomialTuple = tuple(polynomialMap[column] for column in descriptiveColumns)
    first_degree_polynomial_term = 1 in polynomialTuple

    if first_degree_polynomial_term:
        # The degree-1 column is the independent variable itself.
        independent = descriptiveColumns[polynomialTuple.index(1)]
        sr_X_independent = df_X[independent]
    else:
        # Undo the exponent of the first column to get back to the x scale.
        sr_X_independent = df_X[descriptiveColumns[0]]**(1/polynomialTuple[0])

    number_of_points = df_X.shape[0]
    X_pred_independent = np.linspace(start=sr_X_independent.min(), stop=sr_X_independent.max(), num=number_of_points)

    design = {
        column: X_pred_independent**power
        for column, power in zip(descriptiveColumns, polynomialTuple)
    }
    if contains_intercept:
        design['Intercept'] = np.ones(number_of_points, dtype=int)

    # Keep the column order of df_X: the exog passed to the model is matched
    # to the fitted parameters by position, not by name.
    df_X_pred = pd.DataFrame(design, columns=df_X.columns.tolist())

    sr_Y_pred = fitted_model.predict(df_X_pred)

    # The interval computations are best effort: this specific AttributeError
    # is tolerated and the corresponding band is simply not drawn.
    sqrt_error = '\'float\' object has no attribute \'sqrt\''

    # get prediction intervals
    try:
        std_err_prediction, lower_pred_int, upper_pred_int = wls_prediction_std(fitted_model, exog=df_X_pred, alpha=alpha)
    except AttributeError as err:
        if sqrt_error not in str(err):
            raise

        std_err_prediction, lower_pred_int, upper_pred_int = None, None, None

    # get confidence intervals
    try:
        result = fitted_model.get_prediction(df_X_pred)
        conf_int = result.conf_int(alpha=alpha)
        lower_conf_int, upper_conf_int = conf_int[:, 0], conf_int[:, 1]
    except AttributeError as err:
        if sqrt_error not in str(err):
            raise

        lower_conf_int, upper_conf_int = None, None

    # Format with ':g' instead of int(): int() truncates, so e.g.
    # alpha=0.025 would be labelled 97% instead of 97.5%.
    level_pct = f"{(1 - alpha) * 100:g}"

    fig, ax = plt_func.subplots(constrained_layout=True, figsize=(width, height))
    _ = ax.scatter(sr_X_independent, df_Y, label='training data')
    _ = ax.plot(X_pred_independent, sr_Y_pred, '-', color='darkorchid', linewidth=2, label='prediction')
    if lower_conf_int is not None and upper_conf_int is not None:
        _ = ax.fill_between(X_pred_independent, lower_conf_int, upper_conf_int, color='#888888', alpha=0.4, label=f"confidence interval ({level_pct}%)")
    if lower_pred_int is not None and upper_pred_int is not None:
        _ = ax.fill_between(X_pred_independent, lower_pred_int, upper_pred_int, color='#888888', alpha=0.1, label=f"prediction interval ({level_pct}%)")

    _ = ax.legend()
    _ = ax.set_xlabel(independent)
    _ = ax.set_ylabel(dependent)
    _ = ax.set_title('regression of prediction vs training data')
    _ = ax.grid(True)

In [7]:
# def createLinearRegressionPlot(df_X, df_Y, fitted_model, independent='independent', alpha=0.05, df_independent=None, polynomialMap=None, width=8, height=3):
#     """
#     This function returns a scatter plot of the response and the predictor
#     of a simple linear regression or polynomial regression with one 
#     independent variable. Furthermore, the least squares regression line is 
#     shown with an associated confidence and prediction interval of 1-alpha.
    
#     Args:
#         df_X: pandas.core.frame.DataFrame
#             The DataFrame should hold the data of the independent variable
#             (and for polynomial regression the data should hold the data of
#             the polynomial variables, ie, x, x^2, etc). Each row in the 
#             DataFrame represents an individual sample point.
#         df_Y: pandas.core.frame.DataFrame
#             This should be a DataFrame of one column,
#             which holds the data of the dependent variable.            
#         fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
#             This statsmodels class summarizes the fit of a linear regression model
#             that has been fitted with df_X and df_Y.
#         alpha: float
#             The prediction and confidence intervals that are shown have a
#             level of 1-alpha (e.g., alpha=0.05 corresponds to 95% confidence).
#         polynomialMap: dict, default None
#             This argument is used when plotting a polynomial regression. It is a 
#             dictionary that must contain the column names of the DataFrame X 
#             (excluding the intercept if the model is fitted with an intercept) as
#             keys with their associated polynomial degrees as values. For instance,
#             let us imagine that the model being fitted takes the form
#                 y = a + b*x^2 + c*x^5
#             then the polynomialMap argument should be
#                 mapping_powers = {
#                     'b': 2,
#                     'c': 5,
#                 }
#         independent: str, default 'independent'
#             A string describing the independent variable of the regression. This 
#             argument is only used when plotting a polynomial regression without
#             a term of degree 1.
#         df_independent: pd.core.frame.DataFrame, default None
#             This argument should be given when plotting a simple linear regression 
#             with a transformed independent variable. This DataFrame should hold 
#             the data of the independent variable before the transformation, and 
#             will be used to plot the regression instead of the transformed 
#             variable in the DataFrame df_X.
#         width: float, default 8
#             The width of the plot.
#         height: float, default 3
#             The height of the plot.        
#     """ 
#     assert df_Y.shape[1] == 1
#     dependent = df_Y.columns[0]
#     descriptiveColumns = df_X.columns.tolist()
#     try:
#         descriptiveColumns.remove('Intercept')
#         contains_intercept = True
#     except ValueError as err:
#         if not 'list.remove(x): x not in list' in str(err):
#             raise

#         contains_intercept = False  

#     if len(descriptiveColumns) > 1:
#         assert len(descriptiveColumns) == len(polynomialMap), f'descriptiveColumns = {descriptiveColumns} and polynomialMap = {polynomialMap}'
#         sortedMap = [(key, polynomialMap[key]) for key in descriptiveColumns if key in polynomialMap]
#         _, polynomialTuple = zip(*sortedMap)
#         no_first_degree_polynomial_term = False if 1 in polynomialTuple else True
#     else:
#         no_first_degree_polynomial_term = False

#     if (contains_intercept == True and df_X.shape[1] == 2) or (contains_intercept == False and df_X.shape[1] == 1): # simple linear regression
#         independent = descriptiveColumns[0]
#         if contains_intercept == True:
#             df_X_pred = pd.DataFrame({
#                 'Intercept': np.ones(shape=(df_X.shape[0], ), dtype=int), 
#                 independent: np.linspace(start=df_X[independent].min(), stop=df_X[independent].max(), num=df_X.shape[0]),
#             }, columns = ['Intercept', independent])
#         else:
#             df_X_pred = pd.DataFrame({independent: np.linspace(start=df_X[independent].min(), stop=df_X[independent].max(), num=df_X.shape[0])})
#     else: # in case of a polynomial regression with one independent variable
#         if contains_intercept == True:
#             df_X_pred = pd.DataFrame({'Intercept': np.ones(df_X.shape[0], dtype=int)})
#         else:
#             df_X_pred = pd.DataFrame(index=range(0, df_X.shape[0]))

#         if no_first_degree_polynomial_term == False:
#             index_independent = polynomialTuple.index(1)
#             independent = descriptiveColumns[index_independent]
#             for index, power in enumerate(polynomialTuple):
#                 column = descriptiveColumns[index]
#                 if index == index_independent: 
#                     df_X_pred[independent] = np.linspace(start=df_X[independent].min(), stop=df_X[independent].max(), num=df_X.shape[0])
#                 else:
#                     df_X_pred[column] = df_X_pred[independent]**power
#         else:
#             for index, power in enumerate(polynomialTuple):
#                 column = descriptiveColumns[index]
#                 if index == 0:
#                     df_X_pred.head()
#                     sr_X_independent = df_X[column]**(1/power)
#                     df_X.shape[0]
#                     X_pred_independent = np.linspace(start=sr_X_independent.min(), stop=sr_X_independent.max(), num=df_X.shape[0])

#                 df_X_pred[column] = X_pred_independent**power

#     sr_Y_pred = fitted_model.predict(df_X_pred)

#     # get prediction intervals
#     try:
#         from statsmodels.sandbox.regression.predstd import wls_prediction_std
#         std_err_prediction, lower_pred_int, upper_pred_int = wls_prediction_std(fitted_model, exog=df_X_pred, alpha=alpha)
#     except AttributeError as err:
#         if not '\'float\' object has no attribute \'sqrt\'' in str(err):
#             raise

#         std_err_prediction, lower_pred_int, upper_pred_int = None, None, None

#     # get confidence intervals
#     try:
#         result = fitted_model.get_prediction(df_X_pred)
#         conf_int = result.conf_int(alpha=alpha)
#         lower_conf_int, upper_conf_int = conf_int[:, 0], conf_int[:, 1]
#     except AttributeError as err:
#         if not '\'float\' object has no attribute \'sqrt\'' in str(err):
#             raise   

#         lower_conf_int, upper_conf_int = None, None

#     if no_first_degree_polynomial_term == True:
#         df_X = df_X.copy()
#         df_X[independent] = sr_X_independent
#         df_X_pred[independent] = X_pred_independent
#     elif not df_independent is None:
#         assert isinstance(df_independent, pd.core.frame.DataFrame), 'df_independent must be a pandas.core.frame.DataFrame'
#         assert df_independent.shape[1] == 1
#         independent = df_independent.columns[0]
#         old_shape = df_X.shape[0]
#         df_X = pd.concat([df_X, df_independent], axis=1)
#         new_shape = df_X.shape[0]
#         assert old_shape == new_shape, f'old_shape = {old_shape} and new_shape = {new_shape}'
#         df_X_pred[independent] = np.linspace(start=df_X[independent].min(), stop=df_X[independent].max(), num=df_X.shape[0])
        
#     fig, ax = plt_func.subplots(constrained_layout=True, figsize=(width, height))
#     _ = ax.scatter(df_X[independent], df_Y, label='training data')
#     _ = ax.plot(df_X_pred[independent], sr_Y_pred, '-', color='darkorchid', linewidth=2, label='prediction')
#     if not lower_conf_int is None and not upper_conf_int is None:
#         _ = ax.fill_between(df_X_pred[independent], lower_conf_int, upper_conf_int, color='#888888', alpha=0.4, label="confidence interval")
#     if not lower_pred_int is None and not upper_pred_int is None:
#         _ = ax.fill_between(df_X_pred[independent], lower_pred_int, upper_pred_int, color='#888888', alpha=0.1, label="prediction interval")

#     _ = ax.legend()
#     _ = ax.set_xlabel(independent)
#     _ = ax.set_ylabel(dependent)
#     _ = ax.set_title(f'regression of prediction vs training data (with confidence and prediction intervals of {(1-alpha)*100:.2f}%)')
#     _ = ax.grid(True)

In [8]:
def createConfusionMatrixFromLogisticModel(fitted_model, threshold=0.5, binaryMap={0: 0, 1: 1}):
    """
    Build two confusion matrices, in terms of absolute numbers and
    row percentages respectively, based on in-sample data fitted with a
    logistic regression model.

    Args:
        fitted_model: statsmodels.discrete.discrete_model.LogitResults
            This statsmodels class summarizes the fit of a logistic
            regression model. Only its `pred_table` method is used.
        threshold: float, default 0.5
            Number between 0 and 1. Threshold above which a prediction
            is considered 1 and below which a prediction is considered 0.
            It is forwarded to `fitted_model.pred_table`.
        binaryMap: dictionary, default {0: 0, 1: 1}
            A mapping of the binary 0 and 1 quantitative values to their
            associated qualitative names, used as row/column labels. The
            mapping is only read, never modified.

    Returns:
        tuple: `(df_confusion, df_confusion_pct)`. Both are 2x2 DataFrames
        with observed classes as rows and predicted classes as columns.
        The first holds integer counts; the second holds each count as a
        percentage of its row total, i.e.
        [[TNR, FPR], [FNR, TPR]] * 100. A class without any observation
        yields a row of NaN in the percentage table.
    """

    # pred_table[i,j] refers to the number of times "i" was observed
    # and the model predicted "j". Correct predictions are along the diagonal.
    # The caller's threshold is passed through (not a fixed 0.5).
    confusion = fitted_model.pred_table(threshold=threshold).astype(int)

    labels = (binaryMap[0], binaryMap[1])
    index = pd.MultiIndex.from_tuples([('Observed', label) for label in labels])
    columns = pd.MultiIndex.from_tuples([('Predicted', label) for label in labels])
    df_confusion = pd.DataFrame(confusion, columns=columns, index=index)

    # Dividing every row by its total gives, per observed class, the
    # true negative / false positive rates (row 0) and the
    # false negative / true positive rates (row 1).
    observed_totals = confusion.sum(axis=1, keepdims=True)
    with np.errstate(divide='ignore', invalid='ignore'):
        confusion_pct = 100 * (confusion / observed_totals)

    index_pct = pd.MultiIndex.from_tuples([('Observed (%)', label) for label in labels])
    columns_pct = pd.MultiIndex.from_tuples([('Predicted (%)', label) for label in labels])
    df_confusion_pct = pd.DataFrame(confusion_pct, columns=columns_pct, index=index_pct)

    return df_confusion, df_confusion_pct

In [9]:
def createConfusionMatrixFromOutOfSampleData(df, binaryMap={0: 0, 1: 1}):
    """
    Build two confusion matrices from out-of-sample data: one with
    absolute counts and one with percentages per observed class.

    Args:
        df: pandas.core.frame.DataFrame
            The DataFrame used to produce the confusion matrices. It
            should have a column named 'Observed' and one column named
            'Predicted' which contain the binary values (0 or 1) of
            the observed and predicted data, respectively.
        binaryMap: dictionary, default {0: 0, 1: 1}
            A mapping of the binary 0 and 1 quantitative values to their
            associated qualitative names, used as row/column labels.

    Returns:
        tuple: `(df_confusion, df_confusion_pct)`, both 2x2 DataFrames
        with observed classes as rows and predicted classes as columns.
        The percentage table holds [[TNR, FPR], [FNR, TPR]] * 100.
    """
    observed = df['Observed']
    predicted = df['Predicted']

    # Cell [i, j] counts the rows where i was observed and j was predicted,
    # so the layout is [[TN, FP], [FN, TP]].
    confusion = np.array([
        [
            np.where((observed == observed_value) & (predicted == predicted_value), 1, 0).sum()
            for predicted_value in (0, 1)
        ]
        for observed_value in (0, 1)
    ])

    names = [binaryMap[0], binaryMap[1]]

    def labelled(prefix):
        # Two-level labels such as ('Observed', <name of class 0>).
        return pd.MultiIndex.from_tuples([(prefix, name) for name in names])

    df_confusion = pd.DataFrame(confusion, columns=labelled('Predicted'), index=labelled('Observed'))

    # Each row divided by its own total gives the rates per observed class:
    # row 0 -> true negative / false positive rate,
    # row 1 -> false negative / true positive rate.
    per_observed_total = confusion.sum(axis=1, keepdims=True)
    confusion_pct = 100 * (confusion / per_observed_total)
    df_confusion_pct = pd.DataFrame(
        confusion_pct,
        columns=labelled('Predicted (%)'),
        index=labelled('Observed (%)'),
    )

    return df_confusion, df_confusion_pct