In [1]:
from sklearn import preprocessing
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [2]:
def removeColumnsFromList(df, columnsToRemove):
    """
    Return the column names of `df`, excluding the names listed in
    `columnsToRemove`.
    
    The DataFrame itself is not modified; a new list is returned and the
    relative order of the remaining columns is preserved.
    
    Args:
        df: pandas.core.frame.DataFrame
            The DataFrame used to produce the list of column names. 
        
        columnsToRemove: iterable
            An iterable object that has the names as elements that
            will be excluded from the returned list. A name may appear
            more than once.
    Returns:
        list: The column names of `df` without the excluded names.
    Raises:
        ValueError: If a name in `columnsToRemove` is not a column of `df`.
    """
    columns = df.columns.tolist()
    # Materialize once: the iterable is scanned twice below and may be a
    # one-shot generator.
    excluded = list(columnsToRemove)
    
    # Keep failing loudly on unknown names (as `list.remove` did), but say
    # which names are the problem.
    missing = [name for name in excluded if name not in columns]
    if missing:
        raise ValueError(f'columns not found in the DataFrame: {missing}')
        
    return [name for name in columns if name not in excluded]

<p>We could easily reproduce these types of plots <a href="https://www.statsmodels.org/dev/examples/notebooks/generated/regression_plots.html">very closely</a>, with <a href="http://www.statsmodels.org/dev/examples/notebooks/generated/regression_diagnostics.html">additional regression diagnostics</a>, using the <code>statsmodels</code> library. However, I would like to have more control, so I decided to plot most of them manually using <code>matplotlib</code>.</p>

In [3]:
def createResidualPlots(X, Y, fitted_model, width=7, height=3):
    """
    Draw four diagnostic residual plots for the fitted model.
    
    The plots are stacked vertically in a single matplotlib figure:
    
    1. the residuals,
    2. the square root of the absolute standardized residuals,
    3. a QQ plot of the quantiles of the standardized residuals vs the 
       quantiles of the normal distribution, with a 45-degree reference 
       line for comparison (see also 
       https://seankross.com/2016/02/29/A-Q-Q-Plot-Dissection-Kit.html),
    4. a leverage plot of the standardized residuals.
    
    For simple linear regressions (exactly one column besides 
    'Intercept'), the first two plots use the predictor on the x-axis. 
    For the multiple regression fit, they use the fitted values instead.
    
    The function does not return anything; it only creates the figure. 
    It uses `np`, `plt`, `sm` and `preprocessing` from the notebook 
    namespace.
    
    Args:
        X: pandas.core.frame.DataFrame
            The DataFrame should hold the data of independent variables
            (including a column for the 'Intercept' set equal to one).
            Each row in the DataFrame represents an individual sample 
            point, with the successive columns corresponding to the 
            independent variables and their specific values for that 
            sample point. 
        Y: pandas.core.frame.DataFrame or pandas.core.series.Series
            This should be a pandas Series or DataFrame of one column,
            which holds the data of the dependent variable.
        fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
            This statsmodels class summarizes the fit of a linear regression model.
        width: float, default 7
            The width of each subplot.
        height: float, default 3
            The height of each subplot.
    Raises:
        ValueError: If `X` has no 'Intercept' column.
        AssertionError: If `X` has no column besides 'Intercept'.
    """ 
    predictor_columns = X.columns.tolist()
    if 'Intercept' not in predictor_columns:
        raise ValueError("X must contain an 'Intercept' column")
    predictor_columns.remove('Intercept')
    assert len(predictor_columns) >= 1, (
        f"X must contain at least one predictor column besides 'Intercept', got {predictor_columns}"
    )
    approach = 'simple' if len(predictor_columns) == 1 else 'multivariable'
    
    Y_hat = fitted_model.predict(X)
    residual = np.squeeze(Y.to_numpy()) - Y_hat
    # `preprocessing.scale` z-scores the residuals (zero mean, unit variance).
    # NOTE(review): this is not the leverage-adjusted (internally studentized)
    # residual e_i / (s * sqrt(1 - h_ii)); confirm that z-scores are intended.
    standardized_residual = preprocessing.scale(residual)
    
    # Leverage = diagonal of the hat matrix H = X X^+. Only the diagonal is
    # computed, so the n-by-n matrix is never built, and the pseudo-inverse
    # avoids explicitly inverting X'X (which fails for collinear columns).
    X_np = X.to_numpy(dtype=float)
    leverage = np.einsum('ij,ji->i', X_np, np.linalg.pinv(X_np))
    
    numberOfSubplots = 4
    _, axes = plt.subplots(numberOfSubplots, 1, constrained_layout=True, figsize=(width, height*numberOfSubplots))
    if approach == 'simple':
        descriptive = predictor_columns[0]
        X_plot = X[descriptive]
    else:
        descriptive = 'fitted values'
        X_plot = Y_hat
    
    axes[0].scatter(x=X_plot, y=residual)
    axes[0].set_xlabel(descriptive)
    axes[0].set_ylabel('residuals')
    axes[0].set_title('residual plot for the linear regression')
    
    axes[1].scatter(x=X_plot, y=np.absolute(standardized_residual)**0.5)
    axes[1].set_xlabel(descriptive)
    axes[1].set_ylabel(r'$\sqrt{\left|\mathrm{standardized \,\,\, residuals}\right|}$')
    axes[1].set_title(r'$\sqrt{\left|\mathrm{standardized \,\,\, residuals}\right|}$ for the linear regression')
    
    sm.qqplot(standardized_residual, line='45', c=None, ax=axes[2])
    axes[2].set_xlabel('theoretical quantiles')
    axes[2].set_ylabel('standardized residuals')
    axes[2].set_title('normal qq plot')
    
    axes[3].scatter(x=leverage, y=standardized_residual)
    axes[3].set_xlabel('leverage')
    axes[3].set_ylabel('standardized residuals')
    axes[3].set_title('standardized residuals vs leverage')

In [4]:
def createSimpleLinearRegressionPlot(X, Y, fitted_model, independent, dependent, alpha=0.05):
    """
    Draw a scatter plot of the response against the predictor, together 
    with the least squares regression line and its confidence and 
    prediction intervals of level 1-alpha.
    
    The regression line and both intervals are evaluated on an evenly 
    spaced grid that spans the observed range of the predictor and has 
    as many points as `X` has rows. The function does not return 
    anything; it only creates the figure. It uses `np`, `pd` and `plt` 
    from the notebook namespace.
    
    Args:
        X: pandas.core.frame.DataFrame
            The DataFrame should hold the data of the independent variable
            and a column for the 'Intercept' that is set equal to one.
            Each row in the DataFrame represents an individual sample 
            point of the independent variable. 
        Y: pandas.core.frame.DataFrame or pandas.core.series.Series
            This should be a pandas Series or DataFrame of one column,
            which holds the data of the dependent variable.
        fitted_model: statsmodels.regression.linear_model.RegressionResultsWrapper
            This statsmodels class summarizes the fit of a linear regression model.
        independent: str
            The name of the column in `X` that holds the independent 
            variable; it is also used as the x-axis label.
        dependent: str
            A string describing the dependent variable of the regression;
            it is used as the y-axis label.
        alpha: float, default 0.05
            The prediction and confidence intervals that are shown have 
            level 1-alpha, e.g. alpha=0.05 corresponds to 95% intervals.
    Raises:
        ValueError: If `alpha` does not lie strictly between 0 and 1.
    """ 
    if not 0 < alpha < 1:
        raise ValueError(f'alpha must lie strictly between 0 and 1, got {alpha}')
    
    x_grid = np.linspace(start=X[independent].min(), stop=X[independent].max(), num=X.shape[0])
    # Reuse the column order of X so that the design matrix handed to the
    # model matches the one it was fitted on.
    X_pred = pd.DataFrame({
        'Intercept': np.ones(x_grid.shape, dtype=int), 
        independent: x_grid
    }, columns=X.columns.tolist())
    Y_pred = fitted_model.predict(X_pred)
    Y_pred.name = f'predicted {dependent}'
    
    # Both intervals come from the same prediction object: obs=False (the
    # default) gives the confidence interval of the mean response, and
    # obs=True gives the prediction interval for a new observation.
    prediction = fitted_model.get_prediction(X_pred)
    conf_int = prediction.conf_int(alpha=alpha)
    pred_int = prediction.conf_int(obs=True, alpha=alpha)
    lower_conf_int, upper_conf_int = conf_int[:, 0], conf_int[:, 1]
    lower_pred_int, upper_pred_int = pred_int[:, 0], pred_int[:, 1]

    _, ax = plt.subplots(constrained_layout=True, figsize=(9, 6))
    ax.scatter(X[independent], Y, label='training data')
    ax.plot(X_pred[independent], Y_pred, '-', color='darkorchid', linewidth=2, label='prediction')
    ax.fill_between(X_pred[independent], lower_conf_int, upper_conf_int, color='#888888', alpha=0.4, label="confidence interval")
    ax.fill_between(X_pred[independent], lower_pred_int, upper_pred_int, color='#888888', alpha=0.1, label="prediction interval")
    ax.legend()
    ax.set_xlabel(independent)
    ax.set_ylabel(dependent)
    ax.set_title(f'regression of prediction vs training data (with confidence and prediction intervals of {(1-alpha)*100:.2f}%)')
    ax.grid(True)