In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tabulate
import scipy.stats as sc_stats

from IPython.display import display

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [None]:
# Reset every matplotlib setting to its default so re-running this cell
# always starts from the same state, then apply the notebook-wide overrides.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams["font.size"] = 12
plt.rcParams["grid.alpha"] = 0.25

## Function declarations



### Read `ods` to `DataFrame`

In [3]:
def read_ods(filename, columns=None):
  """
  Read an OpenDocument spreadsheet into a DataFrame.

  The file is parsed by ``pd.read_excel`` with the ``odf`` engine, using
  the first row as the header (``header=0``).

  Parameters
  ----------
  filename : str or path-like
    Forwarded unchanged to ``pd.read_excel``.
  columns : None, list or dict, optional
    * ``None`` -- load every column.
    * dict-like (anything exposing ``.keys()``) -- load only the columns
      named by the keys, then rename them ``key -> value``.
    * anything else (typically a list) -- forwarded unchanged as
      ``usecols``.

  Returns
  -------
  pd.DataFrame
    The loaded (and, for a dict-like ``columns``, renamed) data.
  """
  if columns is None:
    return pd.read_excel(filename, engine="odf", header=0)

  if hasattr(columns, "keys"):
    # Hand pandas a real list rather than a generator: ``usecols`` is
    # documented as list-like, and a one-shot generator can be exhausted
    # by the first pass pandas makes over it.
    selected = list(columns.keys())
    df = pd.read_excel(filename, engine="odf", header=0,
                       usecols=selected)
    return df.rename(columns=columns)

  return pd.read_excel(filename, engine="odf", header=0,
                       usecols=columns)

### Get Minitab output correlation

In [41]:
def get_minitab_corr(df, n_vars=2, new_column=None):
  """
  Print and display a Minitab-style lower-triangular correlation table.

  Correlation coefficients are computed with ``np.corrcoef`` between the
  columns ``Y``, ``X_1`` ... ``X_{n_vars}`` of ``df``. Only the part below
  the diagonal is shown: one row per ``X_i`` and one column for each
  variable that precedes it.

  Parameters
  ----------
  df : pd.DataFrame
    Must contain the columns ``"Y"`` and ``"X_1"`` ... ``"X_{n_vars}"``.
  n_vars : int
    Number of ``X_i`` columns to include (at least 1).
  new_column : sequence of str, optional
    Display names in the order ``[Y, X_1, ..., X_{n_vars}]``.
    When omitted, the column keys themselves are used as labels.

  Returns
  -------
  dict
    ``"names"``       -- list of the display names used,
    ``"corr_matrix"`` -- ``(n_vars, n_vars)`` array; row ``i`` holds the
                         correlations of ``X_{i+1}`` with ``Y, X_1, ...``
                         (only entries with column index <= row index are
                         shown in the table),
    ``"table"``       -- the object returned by ``tabulate.tabulate``.

  Raises
  ------
  ValueError
    If ``n_vars < 1`` or ``new_column`` has fewer than ``n_vars + 1`` names.
  """
  if n_vars < 1:
    raise ValueError("n_vars must be at least 1")

  keys = ["Y"] + [f"X_{i+1}" for i in range(n_vars)]

  if new_column is None:
    names = list(keys)
  else:
    # list(...) so tuples and other sequences work with the list
    # concatenation used for the headers below
    names = list(new_column)[:len(keys)]
    if len(names) < len(keys):
      raise ValueError(
        f"new_column needs {len(keys)} names (Y plus {n_vars} predictors), "
        f"got {len(names)}")

  # -- observations in rows, variables in columns: Y first, then X_1..X_n
  data_struct = df[keys].to_numpy(dtype=float)

  # -- compute correlation matrix
  corr_matrix = np.corrcoef(data_struct, rowvar=False)
  # drop the first row (Y) and the last column so that only the part of
  # the matrix below the diagonal can be displayed
  corr_matrix = corr_matrix[1:, 0:-1]

  # -- create tabular form for correlation
  data = []
  for i in range(n_vars):
    # cells to the right of column i would repeat the diagonal or the
    # upper triangle, so they are left blank
    cells = [f"{corr_matrix[i, j]:.3f}" if j <= i else "" for j in range(n_vars)]
    data.append([names[i+1]] + cells)

  # first column holds the row labels; "g" is a valid spec should a label
  # ever be parsed as a number (the previous "None" was not a format spec)
  table_corr = tabulate.tabulate(data, tablefmt='html',
    headers=[""] + names[:n_vars],
    floatfmt=["g"] + [".3f"]*(n_vars))

  print(f"Correlations: {', '.join(names)}")
  display(table_corr)

  return {"names": names, "corr_matrix": corr_matrix, "table": table_corr}


## Load the data

In [38]:
# Spreadsheet to load; the relative path is resolved against the
# notebook's working directory.
filename = "06-table-13-zurenko-pharmaceutical-data.ods"

# No `columns` argument: every column is loaded under its original header.
df = read_ods(filename)

# Preview the first rows.
df.head()


Unnamed: 0,One Month’s Sales (units),Aptitude Test Score,Age (years),Anxiety Test Score,Experience (years),High School GPA
0,44,10,22.1,4.9,0,2.4
1,47,19,22.5,3.0,1,2.6
2,60,27,23.1,1.5,0,2.8
3,71,31,24.0,0.6,3,2.7
4,61,64,22.6,1.8,2,2.0


## Observations on the correlation matrix

In [39]:
# Map the descriptive headers onto the generic names the correlation helper
# expects: the first column becomes "Y", the remaining ones "X_1", "X_2", ...
from_column_names = df.columns.to_list()
print(f"header: {from_column_names}")

to_column_names = ["Y"]
to_column_names.extend(f"X_{idx}" for idx in range(1, len(from_column_names)))

# rename() returns a new frame, so `df` keeps its original headers
df_minitab = df.rename(columns=dict(zip(from_column_names, to_column_names)))

df_minitab.head()

header: ['One Month’s Sales (units)', 'Aptitude Test Score', 'Age (years)', 'Anxiety Test Score', 'Experience (years)', 'High School GPA']


Unnamed: 0,Y,X_1,X_2,X_3,X_4,X_5
0,44,10,22.1,4.9,0,2.4
1,47,19,22.5,3.0,1,2.6
2,60,27,23.1,1.5,0,2.8
3,71,31,24.0,0.6,3,2.7
4,61,64,22.6,1.8,2,2.0


In [42]:
# Short display names, listed in the same order as the columns
# Y, X_1, ..., X_5 of df_minitab.
column_name = ["Sales", "Aptitude", "Age", "Anxiety", "Experience", "GPA"]
# n_vars counts the predictors only, hence len(column_name) - 1 (Y excluded).
# The correlation table is printed/displayed as a side effect of the call.
out_dict = get_minitab_corr(df_minitab, n_vars=len(column_name)-1, 
  new_column=column_name)


Correlations: Sales, Aptitude, Age, Anxiety, Experience, GPA


Unnamed: 0,Sales,Aptitude,Age,Anxiety,Experience
Aptitude,0.676,,,,
Age,0.798,0.228,,,
Anxiety,-0.296,-0.222,-0.287,,
Experience,0.55,0.35,0.54,-0.279,
GPA,0.622,0.318,0.695,-0.244,0.312


Examination of the correlation matrix above reveals that
the selling aptitude test score, age, experience, and GPA
are positively related to sales ability and have potential
as good predictor variables.
The anxiety score shows a low negative correlation with sales,
and it is probably not an important predictor.
Further analysis indicates that age is moderately correlated
with both GPA and experience. It is the presence of these
interrelationships that must be dealt with in attempting
to find the best possible set of explanatory variables.