In [1]:
import matplotlib.pyplot as plt_func

In [2]:
def removeColumnsFromList(df, columnsToRemove):
    """
    Return the column names of `df`, excluding the names listed in
    `columnsToRemove`.

    Names in `columnsToRemove` that are not columns of `df` are ignored.
    `df` itself is not modified. If `df` contains duplicated column names,
    each entry of `columnsToRemove` drops only the first matching name.

    Args:
        df: pandas.core.frame.DataFrame
            The DataFrame used to produce the list of column names.

        columnsToRemove: iterable
            An iterable object that has the names as elements that
            will be excluded from the returned list.
    Returns:
        list: The remaining column names, in their original order.
    """
    columns = df.columns.tolist()
    for column in columnsToRemove:
        # Test membership explicitly rather than parsing the text of the
        # ValueError raised by list.remove, which is an implementation detail.
        if column in columns:
            columns.remove(column)

    return columns

<p>We could easily reproduce these types of plots <a href="https://www.statsmodels.org/dev/examples/notebooks/generated/regression_plots.html">very closely</a>, with <a href="http://www.statsmodels.org/dev/examples/notebooks/generated/regression_diagnostics.html">additional regression diagnostics</a>, using the <code>statsmodels</code> library; however, I would like to have more control, so I decided to plot most of them manually using <code>matplotlib</code>.</p>

In [3]:
def createResidualPlots(X, Y, fitted_model, list_of_indices=[], width=7, height=3):
    """
    Draw four stacked diagnostic plots for a fitted linear regression.

    The plots are drawn on a new matplotlib figure; nothing is returned.

    1. Residuals vs the predictor (one predictor) or vs the fitted values
       (several predictors).
    2. Square root of the absolute standardized residuals, against the same
       x-axis as plot 1.
    3. Normal QQ plot: sorted standardized residuals vs the quantiles of the
       normal distribution, with the line y = x drawn for comparison (see also
       https://seankross.com/2016/02/29/A-Q-Q-Plot-Dissection-Kit.html).
    4. Standardized residuals vs leverage (the diagonal of the hat matrix
       X (X'X)^-1 X'). If X'X is singular the leverage cannot be computed
       and the panel only shows a note saying so.

    Note: "standardized" here means the residuals are centred and scaled to
    unit variance with `sklearn.preprocessing.scale`; they are not divided
    by sqrt(1 - leverage).

    Args:
        X: pandas.core.frame.DataFrame
            The DataFrame should hold the data of independent variables
            (including a column for the 'Intercept' set equal to one).
            Each row in the DataFrame represents an individual sample
            point, with the successive columns corresponding to the
            independent variables and their specific values for that
            sample point.
        Y: pandas.core.frame.DataFrame or pandas.core.series.Series
            This should be a pandas Series or a DataFrame of one column,
            which holds the data of the dependent variable.
        fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
            This statsmodels class summarizes the fit of a linear regression model.
        list_of_indices: list, default list()
            Positional (0-based row) indices of the data point(s) that should
            be colored differently to distinguish them from the rest of the
            data. The default list is never mutated.
        width: float, default 7
            The width of each subplot.
        height: float, default 3
            The height of each subplot.

    Raises:
        AssertionError: if X has no column besides 'Intercept'.
    """
    # Function-scope imports, matching how this notebook imports these modules.
    from matplotlib import colors
    from scipy import stats
    from sklearn import preprocessing

    # Despite its name, this list holds the predictor (independent variable)
    # columns: every column of X except the first 'Intercept' column.
    columns_dependent_var = X.columns.tolist()
    if 'Intercept' in columns_dependent_var:
        columns_dependent_var.remove('Intercept')

    assert len(columns_dependent_var) >= 1, f'columns_dependent_var = {columns_dependent_var}'
    approach = 'simple' if len(columns_dependent_var) == 1 else 'multivariable'

    Y_hat = fitted_model.predict(X)
    residual = np.squeeze(Y.to_numpy()) - Y_hat
    standardized_residual = np.asarray(preprocessing.scale(residual))

    # Leverage = diag(X (X'X)^-1 X'). Only the diagonal is needed, so compute
    # it row by row instead of materialising the full n x n hat matrix.
    X_np = X.to_numpy()
    try:
        gram_inverse = np.linalg.inv(X_np.transpose() @ X_np)
        leverage = np.sum((X_np @ gram_inverse) * X_np, axis=1)
    except np.linalg.LinAlgError as err:
        if not 'Singular matrix' in str(err):
            raise

        leverage = None

    numberOfSubplots = 4
    fig, axes = plt_func.subplots(numberOfSubplots, 1, constrained_layout=True, figsize=(width, height*numberOfSubplots))
    if approach == 'simple':
        descriptive = columns_dependent_var[0]
        X_plot = X[descriptive].to_numpy()
    else:
        descriptive = 'fitted values'
        X_plot = Y_hat

    # Boolean mask (in the row order of X) marking the points to highlight.
    mask_special_indices = np.zeros(residual.shape[0], dtype=bool)
    mask_special_indices[list_of_indices] = True

    # Two-colour map: False -> first default colour, True -> second one.
    # vmin/vmax are pinned so the mapping does not collapse when every point
    # (or no point) is highlighted.
    default_colors = plt_func.rcParams['axes.prop_cycle'].by_key()['color']
    cmap = colors.ListedColormap(default_colors[:2])
    color_kwargs = dict(cmap=cmap, vmin=0, vmax=1)

    axes[0].scatter(x=X_plot, y=residual, c=mask_special_indices, **color_kwargs)
    axes[0].set_xlabel(descriptive)
    axes[0].set_ylabel('residuals')
    axes[0].set_title('residual plot for the linear regression')

    axes[1].scatter(x=X_plot, y=np.absolute(standardized_residual)**0.5, c=mask_special_indices, **color_kwargs)
    axes[1].set_xlabel(descriptive)
    axes[1].set_ylabel(r'$\sqrt{\left|\mathrm{standardized \,\,\, residuals}\right|}$')
    axes[1].set_title(r'$\sqrt{\left|\mathrm{standardized \,\,\, residuals}\right|}$ for the linear regression')

    # Theoretical quantiles at probabilities k/(N+1), k = 1..N.
    n = Y_hat.shape[0] + 1
    q_list = np.linspace(start=1/n, stop=1, num=n)
    quantiles_theoretical = stats.norm.ppf(q_list)[:-1]  # remove infinity from array
    # Sort through argsort so the highlight mask is reordered together with
    # the residuals; otherwise the wrong points would be coloured.
    sort_order = np.argsort(standardized_residual)
    quantiles_data = standardized_residual[sort_order]
    axes[2].scatter(x=quantiles_theoretical, y=quantiles_data, c=mask_special_indices[sort_order], **color_kwargs)
    # Draw the reference line y = x over the union of both axis ranges; joining
    # the corners of the axes box is only a 45-degree line if the ranges agree.
    x_min, x_max = axes[2].get_xlim()
    y_min, y_max = axes[2].get_ylim()
    line_start, line_stop = min(x_min, y_min), max(x_max, y_max)
    axes[2].plot((line_start, line_stop), (line_start, line_stop), color='black', label='45-degree line')
    axes[2].set_xlabel('normal distribution quantiles')
    axes[2].set_ylabel('standardized residuals quantiles')
    axes[2].set_title('normal qq plot')
    axes[2].legend()

    if leverage is not None:
        axes[3].scatter(x=leverage, y=standardized_residual, c=mask_special_indices, **color_kwargs)
        axes[3].set_xlabel('leverage')
        axes[3].set_ylabel('standardized residuals')
        axes[3].set_title('standardized residuals vs leverage')
    else:
        # Say why the panel is empty instead of leaving a blank frame.
        axes[3].text(0.5, 0.5, 'leverage unavailable (singular design matrix)',
                     ha='center', va='center', transform=axes[3].transAxes)
        axes[3].set_axis_off()

In [4]:
def _recoverIndependentFromPowers(X, columns, powers, x_min, x_max):
    """
    Reconstruct the values of x from a column of X that stores x**power.

    Used when the design matrix has no degree-1 column. Returns a NumPy
    array, or None when the sign of x cannot be determined (only even powers
    are available and [x_min, x_max] contains both signs).
    """
    # An odd power keeps the sign of x, so its real root is unambiguous.
    for column, power in zip(columns, powers):
        if power % 2 == 1:
            values = X[column].to_numpy(dtype=float)
            return np.sign(values) * np.abs(values) ** (1.0 / power)

    # Other powers lose the sign; it is only known if the range is one-sided.
    if x_min >= 0 or x_max <= 0:
        for column, power in zip(columns, powers):
            if power != 0:
                magnitude = np.abs(X[column].to_numpy(dtype=float)) ** (1.0 / power)
                return magnitude if x_min >= 0 else -magnitude

    return None


def createLinearRegressionPlot(X, Y, fitted_model, independent='independent', dependent='dependent', alpha=0.05, width=8, height=3, polynomialLst=None, x_min=None, x_max=None):
    """
    Draw a scatter plot of the response vs the predictor of a simple linear
    regression or a polynomial regression with one independent variable,
    together with the least squares regression line and its confidence and
    prediction intervals of level 1-alpha.

    The plot is drawn on a new matplotlib figure; nothing is returned.

    Args:
        X: pandas.core.frame.DataFrame
            The DataFrame should hold the data of the independent variable
            (and for polynomial regression the data should hold the data of
            the polynomial variables, ie, x, x^2, etc). Each row in the
            DataFrame represents an individual sample point.
        Y: pandas.core.frame.DataFrame or pandas.core.series.Series
            This should be a pandas Series or a DataFrame of one column,
            which holds the data of the dependent variable.
        fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
            This statsmodels class summarizes the fit of a linear regression model.
        independent: str, default 'independent'
            A string describing the independent variable of the regression. This
            argument only has use when plotting a polynomial regression without
            a term of degree 1; otherwise the name of the degree-1 column of X
            is used.
        dependent: str, default 'dependent'
            A string describing the dependent variable of the regression.
        alpha: float, default 0.05
            The prediction and confidence intervals that are being shown are
            of level 1-alpha (e.g., alpha=0.05 corresponds to 95% confidence).
        width: float, default 8
            The width of the plot.
        height: float, default 3
            The height of the plot.
        polynomialLst: list, default None
            In case of a polynomial regression, the list must contain the
            powers (as integers) of the polynomial terms (ignoring the
            intercept term) that are used for the regression. For instance,
            if the fitted model takes the form
                y = a + b*x^2 + c*x^5 ,
            then the polynomialLst argument should be
                polynomialLst = [2, 5] .
            This list must be in the same order as the column names of the X
            DataFrame (ignoring the possible 'Intercept' column).
        x_min: float, default None
            This argument must be given in case of a polynomial that is
            fitted without the term of degree 1; it should equal the minimum
            value of the independent variable x. This variable is used
            for plotting the polynomial regression line.
        x_max: float, default None
            This argument must be given in case of a polynomial that is
            fitted without the term of degree 1; it should equal the maximum
            value of the independent variable x. This variable is used
            for plotting the polynomial regression line.

    Raises:
        AssertionError: in the polynomial case, if polynomialLst is missing
            or its length differs from the number of non-intercept columns,
            or if x_min/x_max are missing when there is no degree-1 term.
    """
    import warnings

    has_intercept = 'Intercept' in X.columns
    descriptiveColumns = X.columns.tolist()
    if has_intercept:
        descriptiveColumns.remove('Intercept')
    num_points = X.shape[0]

    if (has_intercept and X.shape[1] == 2) or (X.shape[1] == 1):  # simple linear regression
        independent = descriptiveColumns[0]
        x_train = X[independent]
        x_grid = np.linspace(start=x_train.min(), stop=x_train.max(), num=num_points)
        grid_columns = {independent: x_grid}
    else:  # polynomial regression with one independent variable
        assert not polynomialLst is None
        powers = list(polynomialLst)
        assert len(descriptiveColumns) == len(powers)

        if 1 in powers:
            # The degree-1 column holds x itself, wherever it sits in the list.
            independent = descriptiveColumns[powers.index(1)]
            x_train = X[independent]
            x_grid = np.linspace(start=x_train.min(), stop=x_train.max(), num=num_points)
        else:
            # No column holds x itself: the caller supplies its range and
            # `independent` is only used as the axis label.
            assert not x_min is None and not x_max is None
            x_grid = np.linspace(start=x_min, stop=x_max, num=num_points)
            x_train = _recoverIndependentFromPowers(X, descriptiveColumns, powers, x_min, x_max)
            if x_train is None:
                warnings.warn(
                    'The values of the independent variable cannot be recovered from '
                    'even-power columns when [x_min, x_max] contains both signs; '
                    'the training data is not plotted.'
                )

        # Every polynomial column is derived from the x grid directly, so the
        # position of the degree-1 term in polynomialLst does not matter.
        grid_columns = {column: x_grid**power for column, power in zip(descriptiveColumns, powers)}

    if has_intercept:
        grid_columns['Intercept'] = np.ones(num_points, dtype=int)
    # Keep the column order of X: a model fitted on a plain design matrix
    # matches the prediction columns by position, not by name.
    X_pred = pd.DataFrame(grid_columns, columns=X.columns.tolist())

    Y_pred = fitted_model.predict(X_pred)

    # get prediction intervals
    try:
        from statsmodels.sandbox.regression.predstd import wls_prediction_std
        std_err_prediction, lower_pred_int, upper_pred_int = wls_prediction_std(fitted_model, exog=X_pred, alpha=alpha)
    except AttributeError as err:
        # Best effort: this specific failure only disables the interval band.
        if not '\'float\' object has no attribute \'sqrt\'' in str(err):
            raise

        std_err_prediction, lower_pred_int, upper_pred_int = None, None, None

    # get confidence intervals
    try:
        result = fitted_model.get_prediction(X_pred)
        conf_int = result.conf_int(alpha=alpha)
        lower_conf_int, upper_conf_int = conf_int[:, 0], conf_int[:, 1]
    except AttributeError as err:
        if not '\'float\' object has no attribute \'sqrt\'' in str(err):
            raise

        lower_conf_int, upper_conf_int = None, None

    fig, ax = plt_func.subplots(constrained_layout=True, figsize=(width, height))
    if x_train is not None:
        ax.scatter(x_train, Y, label='training data')
    ax.plot(x_grid, Y_pred, '-', color='darkorchid', linewidth=2, label='prediction')
    if lower_conf_int is not None and upper_conf_int is not None:
        ax.fill_between(x_grid, lower_conf_int, upper_conf_int, color='#888888', alpha=0.4, label="confidence interval")
    if lower_pred_int is not None and upper_pred_int is not None:
        ax.fill_between(x_grid, lower_pred_int, upper_pred_int, color='#888888', alpha=0.1, label="prediction interval")
    ax.legend()
    ax.set_xlabel(independent)
    ax.set_ylabel(dependent)
    ax.set_title(f'regression of prediction vs training data (with confidence and prediction intervals of {(1-alpha)*100:.2f}%)')
    ax.grid(True)