In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tabulate
import scipy.stats as sc_stats

from IPython.display import display

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
# Reset every matplotlib setting to the library defaults first ...
plt.rcParams.update(plt.rcParamsDefault)
# ... then apply the notebook-wide overrides (font size and grid transparency).
plt.rcParams.update({"font.size": 12, "grid.alpha": 0.25})

## Function declarations



### Read `ods` to `DataFrame`

In [3]:
def read_ods(filename, columns=None):
  """
  Read an OpenDocument spreadsheet into a DataFrame.

  The file is read with `pd.read_excel(..., engine="odf", header=0)`, i.e. the
  first row is used as the header.

  filename : path (or whatever `pd.read_excel` accepts) of the `.ods` file
  columns (None, list or dict) :
    None -> every column is read;
    list -> forwarded unchanged as `usecols`;
    dict -> its keys are forwarded as `usecols` and the resulting columns are
            renamed with the mapping (old name -> new name).

  return (DataFrame) : the loaded (and possibly renamed) table
  """
  if columns is None:
    return pd.read_excel(filename, engine="odf", header=0)

  if isinstance(columns, list):
    return pd.read_excel(filename, engine="odf", header=0,
                         usecols=columns)

  # `usecols` is documented as list-like. The keys are materialised into a real
  # list because a one-shot generator can be exhausted after a single pass.
  selected = list(columns.keys())
  df = pd.read_excel(filename, engine="odf", header=0,
                     usecols=selected)
  return df.rename(columns=columns)

### Get multiple regression coefficients

In [4]:
def get_multiple_regress_coeff(df, n_vars=2):
  """
  Fit a linear regression of Y on the predictors and append the fitted values.

  df (DataFrame or ndarray) :
    DataFrame -> must contain column "Y" and columns "X_1" ... "X_<n_vars>";
    ndarray   -> column 0 is Y and every remaining column is a predictor.
  n_vars (int) : number of "X_i" columns to use. Only consulted for DataFrame
    input; for ndarray input all columns after the first are used.

  return (tuple) : (regress, data)
    regress : the fitted `LinearRegression` object
    data    : a copy of the DataFrame with an extra "hat_Y" column, or a new
              ndarray with the fitted values appended as the last column.
              The input itself is left untouched.

  raises TypeError : if df is neither a DataFrame nor an ndarray
  """
  if isinstance(df, pd.DataFrame):
    predictor_keys = [f"X_{i+1}" for i in range(n_vars)]
    Y = df["Y"].to_numpy()
    X_arr = df[predictor_keys].to_numpy(dtype=float)

    regress = LinearRegression().fit(X_arr, Y)

    # Work on a copy so the caller's frame is not modified as a side effect
    # (the ndarray branch below also returns a new object).
    df = df.copy()
    df["hat_Y"] = regress.intercept_ + X_arr.dot(regress.coef_)

  elif isinstance(df, np.ndarray):
    Y = df[:, 0]
    X_arr = df[:, 1:]

    regress = LinearRegression().fit(X_arr, Y)
    Y_hat = regress.intercept_ + X_arr.dot(regress.coef_)
    df = np.column_stack([df, Y_hat])

  else:
    # Without this branch the function fell through to `return` with
    # `regress` unbound and failed with an unrelated UnboundLocalError.
    raise TypeError(
      f"df must be a pandas DataFrame or a numpy ndarray, got {type(df).__name__}")

  return regress, df


### Get Minitab output correlation

In [5]:
def get_minitab_corr(df, n_vars=2, new_column=None):
  """
  Print and display a Minitab-style (lower-triangular) correlation table.

  df (DataFrame) : must contain column "Y" and columns "X_1" ... "X_<n_vars>"
  n_vars (int) : number of predictor columns to include (at least 1)
  new_column (sequence of str) : display labels, position 0 for "Y" followed by
    one label per "X_i". When omitted, the raw column names are used.

  return : None; the table is emitted through `print` and `display`

  raises ValueError : if n_vars < 1 or fewer than 1 + n_vars labels are given
  """
  if n_vars < 1:
    raise ValueError("n_vars must be at least 1")

  keys = ["Y"] + [f"X_{i+1}" for i in range(n_vars)]

  # Accept any sequence (list, tuple, ...) and fall back to the raw column
  # names; the list concatenation below needs a real list.
  labels = list(keys) if new_column is None else list(new_column)
  if len(labels) < 1 + n_vars:
    raise ValueError(
      f"new_column needs at least {1 + n_vars} labels, got {len(labels)}")

  data_struct = np.zeros((len(df), 1+n_vars))
  for col, key in enumerate(keys):
    data_struct[:, col] = df[key]

  # -- compute correlation matrix
  corr_matrix = np.corrcoef(data_struct, rowvar=False)
  # drop the first row and the last column so that row i holds the
  # correlations of X_{i+1} with Y, X_1, ..., X_{n_vars-1}
  corr_matrix = corr_matrix[1:, 0:-1]

  # -- create tabular form for correlation
  data = []
  for i in range(n_vars):
    # keep entries up to and including the sub-diagonal, blank out the rest
    cells = [f"{corr:.3f}" if j < i+1 else ""
             for j, corr in enumerate(corr_matrix[i, :])]
    data.append([labels[i+1]] + cells)

  # "g" is tabulate's default float format; it is a placeholder for the
  # (textual) label column. The previous "None" was not a valid format spec.
  table_corr = tabulate.tabulate(data, tablefmt='html',
    headers=[""] + labels[:n_vars],
    floatfmt=["g"] + [".3f"]*(n_vars))

  print(f"Correlations: {', '.join(labels)}")
  display(table_corr)


### Get stepwise regression table

In [12]:
def get_stepwise_regress_table(df, predictors=None, significance_lvl=0.05):
  """
  Regress Y on the given predictors and collect the statistics needed to
  judge one step of a stepwise regression.

  df (DataFrame) : must contain column "Y" and every column named in predictors
  predictors (str or sequence of str) : predictor column name(s), in the order
    in which the coefficients should be reported
  significance_lvl (float) : two-sided significance level used for the
    critical t-values

  return (dict) :
    "b_arr"     : [intercept, coefficient of predictors[0], ...]
    "l_u_bound" : [lower, upper] critical t-values at significance_lvl
    "t_scores"  : t statistic of each predictor coefficient (intercept excluded)
    "p_values"  : two-sided p-value of each predictor coefficient
    "s_yxs"     : standard error of the estimate
    "R_sq"      : coefficient of determination (SSR / SST)
    "adj_R_sq"  : R_sq adjusted for the number of predictors

  raises TypeError  : if predictors is None
  raises ValueError : if predictors is empty, or if there are not more
    samples than fitted parameters (no residual degrees of freedom)
  """
  if predictors is None:
    raise TypeError(
      "predictors must be a column name or a sequence of column names, not None")
  # A bare string would otherwise be iterated character by character.
  predictors = [predictors] if isinstance(predictors, str) else list(predictors)

  num_of_samples = len(df)
  num_of_vars = len(predictors)
  if num_of_vars == 0:
    raise ValueError("predictors must name at least one column")

  dof = num_of_samples - num_of_vars - 1          # residual degrees of freedom
  if dof <= 0:
    raise ValueError(
      f"need more than {num_of_vars + 1} samples to fit {num_of_vars} "
      f"predictor(s) plus an intercept, got {num_of_samples}")

  # column 0 holds Y, columns 1.. hold the predictors in the requested order
  data_struct = np.zeros((num_of_samples, 1+num_of_vars))
  data_struct[:, 0] = df["Y"]
  for i, predictor in enumerate(predictors):
    data_struct[:, i+1] = df[predictor]
  Y = data_struct[:, 0]
  X = data_struct[:, 1:]

  # -- compute regression coefficients
  regress, _ = get_multiple_regress_coeff(data_struct, n_vars=num_of_vars)
  b_arr = [regress.intercept_] + regress.coef_.tolist()

  # -- compute cofactor matrix (X'X)^-1 of the design matrix [1, X];
  #    scaled by s_yxs**2 it gives the coefficient variances used below
  design = np.ones_like(data_struct)
  design[:, 1:] = X
  cofactor_matrix = np.linalg.inv(design.transpose().dot(design))

  # -- compute standard error of the estimate
  hat_Y = b_arr[0] + X.dot(b_arr[1:])
  sumSq_Y_hat_Y = ((Y - hat_Y)**2).sum()
  s_yxs = np.sqrt(sumSq_Y_hat_Y/dof)

  # -- compute t-score and p-value for every coefficient at once
  SE_coef = s_yxs*np.sqrt(np.diag(cofactor_matrix))
  t_scores = np.asarray(b_arr) / SE_coef
  p_values = sc_stats.t.sf(np.abs(t_scores), dof) * 2   # .t.sf is the survival function (1 - cdf)

  t_scores = t_scores[1:]     # exclude t_scores for intercept
  p_values = p_values[1:]     # exclude p_values for intercept

  # -- compute t_score_min and t_score_max for given significance_lvl
  l_bound = sc_stats.t.ppf(significance_lvl/2, dof)
  u_bound = sc_stats.t.ppf(1-significance_lvl/2, dof)

  # -- compute R_sq and adj_R_sq
  meanY = Y.mean()
  SST = ((Y - meanY)**2).sum()
  SSR = ((hat_Y - meanY)**2).sum()
  R_sq = SSR/SST
  adj_R_sq = 1 - (1 - R_sq)*(num_of_samples - 1) / dof

  out_dict = {
    "b_arr": b_arr,
    "l_u_bound": [l_bound, u_bound],
    "t_scores": t_scores,
    "p_values": p_values,
    "s_yxs": s_yxs,
    "R_sq": R_sq,
    "adj_R_sq": adj_R_sq}

  return out_dict

## Load the data

In [7]:
# Spreadsheet holding the data set; the path is relative to the notebook's
# working directory.
filename = "06-table-13-zurenko-pharmaceutical-data.ods"

# No column selection: read_ods loads every column with the first row as header.
df = read_ods(filename)

df.head()


Unnamed: 0,One Month’s Sales (units),Aptitude Test Score,Age (years),Anxiety Test Score,Experience (years),High School GPA
0,44,10,22.1,4.9,0,2.4
1,47,19,22.5,3.0,1,2.6
2,60,27,23.1,1.5,0,2.8
3,71,31,24.0,0.6,3,2.7
4,61,64,22.6,1.8,2,2.0


## Observations from the correlation matrix

In [8]:
# Map the descriptive spreadsheet headers onto the generic names expected by
# the helper functions: the first column becomes "Y", the rest "X_1", "X_2", ...
from_column_names = df.columns.to_list()
print(f"header: {from_column_names}")
to_column_names = ["Y"] + [f"X_{k}" for k in range(1, len(from_column_names))]

# `rename` returns a new frame, so the original `df` keeps its headers.
df_minitab = df.rename(columns=dict(zip(from_column_names, to_column_names)))

df_minitab.head()

header: ['One Month’s Sales (units)', 'Aptitude Test Score', 'Age (years)', 'Anxiety Test Score', 'Experience (years)', 'High School GPA']


Unnamed: 0,Y,X_1,X_2,X_3,X_4,X_5
0,44,10,22.1,4.9,0,2.4
1,47,19,22.5,3.0,1,2.6
2,60,27,23.1,1.5,0,2.8
3,71,31,24.0,0.6,3,2.7
4,61,64,22.6,1.8,2,2.0


In [9]:
# Short display labels, in the same order as the columns Y, X_1, ..., X_5.
column_name = ["Sales", "Aptitude", "Age", "Anxiety", "Experience", "GPA"]
# NOTE(review): get_minitab_corr, as defined in this notebook, prints/displays
# the table and has no return statement, so `out_dict` is bound to None here.
out_dict = get_minitab_corr(df_minitab, n_vars=len(column_name)-1, 
  new_column=column_name)


Correlations: Sales, Aptitude, Age, Anxiety, Experience, GPA


Unnamed: 0,Sales,Aptitude,Age,Anxiety,Experience
Aptitude,0.676,,,,
Age,0.798,0.228,,,
Anxiety,-0.296,-0.222,-0.287,,
Experience,0.55,0.35,0.54,-0.279,
GPA,0.622,0.318,0.695,-0.244,0.312


The `Age` variable will enter the model first because
it has the largest correlation with sales ($r_{1,3} = .798$)
and will explain $63.7\%$ ($= .798^2$) of the variation in sales.

The `Aptitude` test score will probably enter the model
second because it is strongly related to `Sales` ($r_{1,2} = .676$)
but not highly related to the `Age` variable ($r_{2,3} = .228$)
already in the model.

The other variables will probably not qualify as good
predictor variables. The `Anxiety` test score will not be 
a good predictor because it is not well related to `Sales` 
($r_{1,4} = -.296$). The `Experience` and `GPA` variables
might have potential as good predictor variables 
($r_{1,5} = .550$ and $r_{1,6} = .622$, respectively).
However, both of these predictor variables have a potential
multicollinearity problem with the `Age` variable
($r_{3,5} = .540$ and $r_{3,6} = .695$, respectively).

## Stepwise regression

In [10]:
# Reminder of the label order: position i corresponds to X_i (position 0 is Y).
column_name

['Sales', 'Aptitude', 'Age', 'Anxiety', 'Experience', 'GPA']

### Step 1. Add (`Age` = `X_2`) predictor

In [13]:
# Step 1 model: Y regressed on X_2 alone.
stepwise_out_dict = get_stepwise_regress_table(df_minitab, predictors=["X_2"])

stepwise_out_dict

{'b_arr': [-100.8525512677303, 6.967985331000906],
 'l_u_bound': [-2.048407141795244, 2.048407141795244],
 't_scores': array([7.010065]),
 'p_values': array([1.26680515e-07]),
 's_yxs': 6.846830261718064,
 'R_sq': 0.6370283522612091,
 'adj_R_sq': 0.6240650791276809}

### Step 2. Add (`Aptitude` = `X_1`) predictor

In [14]:
# Step 2 model: Y regressed on X_2 and X_1; coefficients are reported in this order.
stepwise_out_dict = get_stepwise_regress_table(df_minitab, predictors=["X_2", "X_1"])
stepwise_out_dict

{'b_arr': [-86.79153589150668, 5.931446523233715, 0.19973462539954492],
 'l_u_bound': [-2.051830516480284, 2.0518305164802833],
 't_scores': array([10.59875045,  8.13401176]),
 'p_values': array([4.0191891e-11, 9.7567780e-09]),
 's_yxs': 3.753607274840936,
 'R_sq': 0.8948045448411508,
 'adj_R_sq': 0.8870122889034583}

### Step 3. Add the remaining predictors one at a time

**Add `X_3`** 

In [15]:
# Candidate model: X_2 and X_1 plus X_3; the last t-score/p-value belongs to X_3.
stepwise_out_dict = get_stepwise_regress_table(df_minitab, predictors=["X_2", "X_1", "X_3"])
stepwise_out_dict

{'b_arr': [-87.96559019781101,
  5.967394914312336,
  0.20078074412577457,
  0.11809607347611023],
 'l_u_bound': [-2.0555294386428713, 2.055529438642871],
 't_scores': array([10.14630787,  7.91894399,  0.24564281]),
 'p_values': array([1.56538358e-10, 2.13994676e-08, 8.07887465e-01]),
 's_yxs': 3.820680063406125,
 'R_sq': 0.8950481155270208,
 'adj_R_sq': 0.8829382827032155}

`t_score` of `X_3` is not significant because its absolute value is less than `u_bound` ($.246 < 2.056$)

**Add `X_4`**

In [16]:
# Candidate model: X_2 and X_1 plus X_4; the last t-score/p-value belongs to X_4.
stepwise_out_dict = get_stepwise_regress_table(df_minitab, predictors=["X_2", "X_1", "X_4"])
stepwise_out_dict


{'b_arr': [-86.66563588347951,
  5.925374777116987,
  0.1996032723723494,
  0.013497438664147612],
 'l_u_bound': [-2.0555294386428713, 2.055529438642871],
 't_scores': array([8.97280457, 7.66551432, 0.01823631]),
 'p_values': array([1.92610334e-09, 3.90272431e-08, 9.85589563e-01]),
 's_yxs': 3.8250865182071268,
 'R_sq': 0.8948058903669457,
 'adj_R_sq': 0.8826681084862087}

`t_score` of `X_4` is not significant because its absolute value is less than `u_bound` ($.018 < 2.056$)


**Add `X_5`**

In [17]:
# Candidate model: X_2 and X_1 plus X_5; the last t-score/p-value belongs to X_5.
stepwise_out_dict = get_stepwise_regress_table(df_minitab, predictors=["X_2", "X_1", "X_5"])
stepwise_out_dict

{'b_arr': [-89.41503185705032,
  6.115969323111187,
  0.2018115766766686,
  -0.5922575098269571],
 'l_u_bound': [-2.0555294386428713, 2.055529438642871],
 't_scores': array([ 7.94222859,  7.87189447, -0.35556039]),
 'p_values': array([2.02585848e-08, 2.39097951e-08, 7.25038367e-01]),
 's_yxs': 3.8158450904814796,
 'R_sq': 0.8953135753067099,
 'adj_R_sq': 0.883234372457484}

`t_score` of `X_5` is not significant because it is negative and greater than `l_bound` ($-2.056 < -0.356$)