# Examples 9 and 10

In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tabulate
import scipy.stats as sc_stats
import itertools

from IPython.display import display

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
# Start from matplotlib's default rcParams so that settings left over from an
# earlier run of this kernel do not leak into the figures below ...
plt.rcParams.update(plt.rcParamsDefault)
# ... then apply the notebook-wide overrides: base font size and grid
# transparency.
plt.rcParams.update({
  'font.size': 12,
  'grid.alpha': 0.25})

## Function declarations



### Read `ods` to `DataFrame`

In [3]:
def read_ods(filename, columns=None):
  """Load an ``.ods`` spreadsheet into a DataFrame using the ``odf`` engine.

  Parameters
  ----------
  filename : str or path-like
    Passed straight through to ``pd.read_excel``.
  columns : None, list/tuple, or mapping, optional
    * ``None`` -- read every column.
    * list or tuple -- read only these columns (forwarded as ``usecols``).
    * mapping ``{name_in_file: new_name}`` -- read only the keys and rename
      them to the corresponding values.

  Returns
  -------
  pandas.DataFrame
    The sheet contents, with row 0 used as the header.
  """
  if columns is None:
    return pd.read_excel(filename, engine="odf", header=0)

  if isinstance(columns, (list, tuple)):
    return pd.read_excel(filename, engine="odf", header=0,
                         usecols=list(columns))

  # Materialise the keys as a list: a one-shot generator can be exhausted by
  # pandas' own validation of `usecols` before the columns are selected.
  wanted = list(columns.keys())
  df = pd.read_excel(filename, engine="odf", header=0, usecols=wanted)
  return df.rename(columns=columns)

### Get multiple regression coefficients

In [34]:
def get_multiple_regress_coeff(df, n_vars=2):
  """Fit a linear regression of Y on the predictors and attach fitted values.

  Parameters
  ----------
  df : pandas.DataFrame or numpy.ndarray
    * DataFrame -- must contain a ``"Y"`` column and predictor columns named
      ``"X_1"`` ... ``"X_{n_vars}"``.
    * ndarray -- column 0 is Y, every remaining column is a predictor.
  n_vars : int, default 2
    Number of ``X_i`` columns to use. Only consulted for DataFrame input;
    for an ndarray all columns after the first are used.

  Returns
  -------
  regress : LinearRegression
    The fitted model.
  df : pandas.DataFrame or numpy.ndarray
    For a DataFrame, the very object that was passed in, now carrying a
    ``"hat_Y"`` column (written in place). For an ndarray, a new array with
    the fitted values appended as the last column.

  Raises
  ------
  TypeError
    If ``df`` is neither a DataFrame nor an ndarray.
  """
  if not isinstance(df, (pd.DataFrame, np.ndarray)):
    # Without this guard the function used to reach `return regress, df`
    # with `regress` never assigned.
    raise TypeError(
      "df must be a pandas DataFrame or a numpy ndarray, "
      f"got {type(df).__name__}")

  if isinstance(df, pd.DataFrame):
    Y = df["Y"].to_numpy()
    X_arr = np.zeros((len(df), n_vars))
    for i in range(n_vars):
      X_arr[:, i] = df[f"X_{i+1}"].to_numpy()

    regress = LinearRegression().fit(X_arr, Y)
    # NOTE: this adds the column to the caller's DataFrame, not to a copy.
    df["hat_Y"] = regress.intercept_ + X_arr.dot(regress.coef_)

  else:
    Y = df[:, 0]
    X_arr = df[:, 1:]

    regress = LinearRegression().fit(X_arr, Y)
    Y_hat = regress.intercept_ + X_arr.dot(regress.coef_)
    df = np.column_stack([df, Y_hat])

  return regress, df


### Get the sums of squares (SST, SSR, SSE) of $[Y, \mathbf{X}]$

In [35]:
def get_sumSq(data_struct, n_vars=2):
  """Return the total, regression and error sums of squares of a fit.

  `data_struct` is indexed as a 2-D array with Y in column 0 and the
  predictors in the remaining columns. `n_vars` is forwarded unchanged to
  `get_multiple_regress_coeff`.

  Returns the tuple ``(SST, SSR, SSE)``.
  """
  fitted_model = get_multiple_regress_coeff(data_struct, n_vars=n_vars)[0]

  observed = data_struct[:, 0]
  predictors = data_struct[:, 1:]
  fitted = fitted_model.intercept_ + predictors.dot(fitted_model.coef_)
  grand_mean = observed.mean()

  def _sum_of_squares(deviation):
    return (deviation**2).sum()

  return (
    _sum_of_squares(observed - grand_mean),   # SST: total
    _sum_of_squares(fitted - grand_mean),     # SSR: explained by regression
    _sum_of_squares(observed - fitted),       # SSE: residual
  )

### Get all possible combinations of predictors

In [32]:
def get_all_possible_predictors(df):
  """List every subset of predictor names, ordered by subset size.

  All columns of `df` but one are counted as predictors and are referred to
  as ``X_1`` ... ``X_k``. The first entry of the result is the placeholder
  ``[""]`` for the model without predictors, followed by all 1-element
  subsets, then all 2-element subsets, and so on.
  """
  n_predictors = len(df.columns) - 1
  names = [f"X_{idx}" for idx in range(1, n_predictors + 1)]

  subsets = [[""]]
  for size in range(1, n_predictors + 1):
    subsets.extend(list(combo) for combo in itertools.combinations(names, size))
  return subsets

### Get coefficient of determination $R^2$

In [49]:
def get_R_sq(df):
  """Compute R^2 for every predictor subset of `df`.

  The subsets come from `get_all_possible_predictors(df)`. Entry 0 of the
  returned array belongs to the no-predictor model and is left at 0; entry
  ``k`` holds SSR/SST for the k-th subset.
  """
  subsets = get_all_possible_predictors(df)
  n_rows = len(df)
  r_squared = np.zeros(len(subsets))

  # Index 0 is the empty model, so fitting starts at index 1.
  for position in range(1, len(subsets)):
    chosen = subsets[position]

    # Column 0 holds Y, columns 1.. hold the chosen predictors.
    design = np.zeros([n_rows, 1 + len(chosen)])
    design[:, 0] = df["Y"]
    for col, name in enumerate(chosen, start=1):
      design[:, col] = df[name]

    total_ss, regression_ss, _ = get_sumSq(design, n_vars=len(chosen))
    r_squared[position] = regression_ss / total_ss

  return r_squared


## Load the data

In [5]:
# Name of the spreadsheet holding the data set (given as a relative path).
filename = "06-table-13-zurenko-pharmaceutical-data.ods"

# No `columns` argument is given, so read_ods loads every column.
df = read_ods(filename)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,One Month’s Sales (units),Aptitude Test Score,Age (years),Anxiety Test Score,Experience (years),High School GPA
0,44,10,22.1,4.9,0,2.4
1,47,19,22.5,3.0,1,2.6
2,60,27,23.1,1.5,0,2.8
3,71,31,24.0,0.6,3,2.7
4,61,64,22.6,1.8,2,2.0


## $R^2$ for all possible combinations of predictors

In [7]:
# Work on a copy so the frame with the descriptive column labels stays intact.
df_generic = df.copy()
from_column_names = df_generic.columns.tolist()
print(f"header: {from_column_names}")

# The first column becomes the response Y, the others become X_1 ... X_k.
to_column_names = ["Y"]
to_column_names += [f"X_{idx}" for idx in range(1, len(from_column_names))]

df_generic = df_generic.rename(
  columns=dict(zip(from_column_names, to_column_names)))

df_generic.head()

header: ['One Month’s Sales (units)', 'Aptitude Test Score', 'Age (years)', 'Anxiety Test Score', 'Experience (years)', 'High School GPA']


Unnamed: 0,Y,X_1,X_2,X_3,X_4,X_5
0,44,10,22.1,4.9,0,2.4
1,47,19,22.5,3.0,1,2.6
2,60,27,23.1,1.5,0,2.8
3,71,31,24.0,0.6,3,2.7
4,61,64,22.6,1.8,2,2.0


In [71]:
def get_table_all_regressions(df):
  """Display and return R^2 for every possible predictor subset of `df`.

  One row is produced per subset returned by `get_all_possible_predictors`:
  the predictor names, the number of parameters (predictors plus the
  intercept), the error degrees of freedom (samples minus parameters) and
  the R^2 from `get_R_sq`. The rows are rendered as an HTML table through
  `tabulate` and `display`, and also returned as a DataFrame with the
  columns ``predictor``, ``numOfParams``, ``d.o.f`` and ``R_sq``.
  """
  predictor_sets = get_all_possible_predictors(df)
  r_squared = get_R_sq(df)
  n_obs = len(df)

  # First row: the intercept-only model (one parameter, no predictors).
  rows = [["None", 1, n_obs - 1, r_squared[0]]]
  for position, names in enumerate(predictor_sets[1:], start=1):
    n_params = len(names) + 1   # slopes plus the intercept
    rows.append(
      [", ".join(names), n_params, n_obs - n_params, r_squared[position]])

  html_table = tabulate.tabulate(
    rows,
    headers=["Independent Variables Used", "Number of Parameters",
      "Error Degrees of Freedom", "R^2"],
    tablefmt='html',
    floatfmt=["s", "d", "d", ".4f"])
  display(html_table)

  # Transpose the rows so the DataFrame is built one column at a time.
  column_names = ["predictor", "numOfParams", "d.o.f", "R_sq"]
  df_R_sq = pd.DataFrame()
  for name, values in zip(column_names, zip(*rows)):
    df_R_sq[name] = list(values)

  return df_R_sq

With no predictor variables the model has a single parameter, the constant in $\hat{Y} = b_0$.
This constant is the mean of $Y$, so the model explains none of the variation and $R^2 = 0$.

In [72]:
df_R_sq = get_table_all_regressions(df_generic)

Independent Variables Used,Number of Parameters,Error Degrees of Freedom,R^2
,1,29,0.0
X_1,2,28,0.4571
X_2,2,28,0.637
X_3,2,28,0.0875
X_4,2,28,0.3023
X_5,2,28,0.3866
"X_1, X_2",3,27,0.8948
"X_1, X_3",3,27,0.4795
"X_1, X_4",3,27,0.5691
"X_1, X_5",3,27,0.6413


## "Best" regression equations

In [79]:
# Print the R^2 values grouped by model size (number of parameters).
# The model sizes are read from `df_R_sq` itself: in the cells above
# `num_of_vars` only exists as a local variable inside helper functions, so
# looping over `range(num_of_vars)` here fails on a freshly started kernel.
for num_of_params in sorted(df_R_sq["numOfParams"].unique()):
  row_df = df_R_sq.loc[df_R_sq["numOfParams"] == num_of_params, "R_sq"]
  print(row_df)

  predictor  numOfParams  d.o.f  R_sq
0      None            1     29   0.0
  predictor  numOfParams  d.o.f      R_sq
1       X_1            2     28  0.457139
2       X_2            2     28  0.637028
3       X_3            2     28  0.087533
4       X_4            2     28  0.302317
5       X_5            2     28  0.386615
   predictor  numOfParams  d.o.f      R_sq
6   X_1, X_2            3     27  0.894805
7   X_1, X_3            3     27  0.479489
8   X_1, X_4            3     27  0.569063
9   X_1, X_5            3     27  0.641332
10  X_2, X_3            3     27  0.641914
11  X_2, X_4            3     27  0.657067
12  X_2, X_5            3     27  0.645811
13  X_3, X_4            3     27  0.324373
14  X_3, X_5            3     27  0.408640
15  X_4, X_5            3     27  0.526840
        predictor  numOfParams  d.o.f      R_sq
16  X_1, X_2, X_3            4     26  0.895048
17  X_1, X_2, X_4            4     26  0.894806
18  X_1, X_2, X_5            4     26  0.895314
19  X_1