Partial least squares (PLS) regression is a statistical method that bears some relation to principal components regression; 
instead of finding hyperplanes of maximum variance between the response and independent variables, 
it finds a linear regression model by projecting the predicted variables (Y) and the observable variables (X) 
to a new space. Because both the X and Y data are projected to new spaces, the PLS family of methods 
is known as bilinear factor models. Partial least squares discriminant analysis (PLS-DA) is a variant 
used when Y is categorical (for example, binary).

In [1]:
# import required packages
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression

In [3]:
# fetch data
# Only CSV columns at positions 1..22 are loaded; the column at position 0 is skipped.
# NOTE(review): what column 0 holds is not visible here — presumably an index/timestamp; confirm.
data = pd.read_csv('kamyr-digester.csv', usecols = range(1,23))

In [5]:
# summarize the raw frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Y-Kappa          301 non-null    float64
 1   ChipRate         297 non-null    float64
 2   BF-CMratio       287 non-null    float64
 3   BlowFlow         288 non-null    float64
 4   ChipLevel4       300 non-null    float64
 5   T-upperExt-2     300 non-null    float64
 6   T-lowerExt-2     300 non-null    float64
 7   UCZAA            277 non-null    float64
 8   WhiteFlow-4      300 non-null    float64
 9   AAWhiteSt-4      160 non-null    float64
 10  AA-Wood-4        300 non-null    float64
 11  ChipMoisture-4   300 non-null    float64
 12  SteamFlow-4      300 non-null    float64
 13  Lower-HeatT-3    300 non-null    float64
 14  Upper-HeatT-3    300 non-null    float64
 15  ChipMass-4       300 non-null    float64
 16  WeakLiquorF      300 non-null    float64
 17  BlackFlow-2     

In [6]:
# count the missing (NaN) values in every column
na_counts = data.isnull().sum()
na_counts

Y-Kappa              0
ChipRate             4
BF-CMratio          14
BlowFlow            13
ChipLevel4           1
T-upperExt-2         1
T-lowerExt-2         1
UCZAA               24
WhiteFlow-4          1
AAWhiteSt-4        141
AA-Wood-4            1
ChipMoisture-4       1
SteamFlow-4          1
Lower-HeatT-3        1
Upper-HeatT-3        1
ChipMass-4           1
WeakLiquorF          1
BlackFlow-2          1
WeakWashF            1
SteamHeatF-3         1
T-Top-Chips-4        1
SulphidityL-4      141
dtype: int64

In [7]:
# remove columns that have a lot of nan entries
# (the NaN counts above show 141 missing values in each of these two columns)
# NOTE: the trailing space inside each label is intentional — drop() matched these
# labels (22 -> 20 columns), so the CSV header presumably carries that space; do not strip it.
data_cleaned = data.drop(columns = ['AAWhiteSt-4 ','SulphidityL-4 '])
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Y-Kappa          301 non-null    float64
 1   ChipRate         297 non-null    float64
 2   BF-CMratio       287 non-null    float64
 3   BlowFlow         288 non-null    float64
 4   ChipLevel4       300 non-null    float64
 5   T-upperExt-2     300 non-null    float64
 6   T-lowerExt-2     300 non-null    float64
 7   UCZAA            277 non-null    float64
 8   WhiteFlow-4      300 non-null    float64
 9   AA-Wood-4        300 non-null    float64
 10  ChipMoisture-4   300 non-null    float64
 11  SteamFlow-4      300 non-null    float64
 12  Lower-HeatT-3    300 non-null    float64
 13  Upper-HeatT-3    300 non-null    float64
 14  ChipMass-4       300 non-null    float64
 15  WeakLiquorF      300 non-null    float64
 16  BlackFlow-2      300 non-null    float64
 17  WeakWashF       

In [8]:
# discard every row that still contains at least one NaN value
data_cleaned = data_cleaned.dropna(axis = 'index', how = 'any')
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 0 to 297
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Y-Kappa          263 non-null    float64
 1   ChipRate         263 non-null    float64
 2   BF-CMratio       263 non-null    float64
 3   BlowFlow         263 non-null    float64
 4   ChipLevel4       263 non-null    float64
 5   T-upperExt-2     263 non-null    float64
 6   T-lowerExt-2     263 non-null    float64
 7   UCZAA            263 non-null    float64
 8   WhiteFlow-4      263 non-null    float64
 9   AA-Wood-4        263 non-null    float64
 10  ChipMoisture-4   263 non-null    float64
 11  SteamFlow-4      263 non-null    float64
 12  Lower-HeatT-3    263 non-null    float64
 13  Upper-HeatT-3    263 non-null    float64
 14  ChipMass-4       263 non-null    float64
 15  WeakLiquorF      263 non-null    float64
 16  BlackFlow-2      263 non-null    float64
 17  WeakWashF        263 

In [9]:
# split the cleaned frame into the response (first column) and the predictors (remaining columns)
# selecting with a list keeps y two-dimensional, which StandardScaler requires
y = data_cleaned.iloc[:, [0]].to_numpy()
X = data_cleaned.iloc[:, 1:].to_numpy()

In [10]:
# report how many complete rows remain after cleaning
print('Number of samples left: ', len(X))

Number of samples left:  263


In [11]:
# separate training and test data
# 20% of the samples are held out for testing; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [12]:
# standardize predictors and response; the scalers learn their statistics
# from the training split only and are then applied unchanged to the test split
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
X_train_normal = X_scaler.fit_transform(X_train)
X_test_normal = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train_normal = y_scaler.fit_transform(y_train)
y_test_normal = y_scaler.transform(y_test)

In [13]:
# PLS model
# fit a PLS regression with 9 latent components on the standardized training data
# NOTE(review): the choice of 9 components is hard-coded here; its justification is not shown in this cell
pls = PLSRegression(n_components = 9)
pls.fit(X_train_normal, y_train_normal)

In [14]:
# Training vs test predictions
# Both inputs are standardized and the model was fitted on the standardized response,
# so these predictions are in standardized y units.
y_train_normal_predict = pls.predict(X_train_normal)
y_test_normal_predict = pls.predict(X_test_normal)

In [15]:
# PLSRegression.score returns the coefficient of determination (R^2) of the
# prediction, not a classification accuracy, so the values are labelled as R^2.
print('R^2 over training data: ', pls.score(X_train_normal, y_train_normal))
print('R^2 over test data: ', pls.score(X_test_normal, y_test_normal))

Accuracy over training data:  0.6615034210369084
Accuracy over test data:  0.6815746388199424


In [16]:
# import required packages
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Evaluate every candidate number of PLS components with 10-fold
# cross-validation on the training split; the test split is not used here.
scaler = StandardScaler()
fit_MSE = []       # mean fitting MSE over the folds, one entry per component count
validate_MSE = []  # mean validation MSE over the folds, one entry per component count

# Largest candidate = number of predictor columns (replaces the hard-coded 20).
max_components = X_train.shape[1]

# With an integer random_state every split() call yields the same folds, so a
# single splitter is shared by all component counts instead of being rebuilt.
kfold = KFold(n_splits = 10, shuffle = True, random_state = 100)

for n_comp in range(1, max_components + 1):

    local_fit_MSE = [] # store MSE for each fold
    local_validate_MSE = []

    for fit_index, validate_index in kfold.split(y_train):

        # Scaling statistics are learned from the fitting part of the fold only.
        # `scaler` is shared between X and y: it is re-fitted on y only after the
        # X validation rows have been transformed, so this order must be kept.
        X_fit_normal = scaler.fit_transform(X_train[fit_index])
        X_validate_normal = scaler.transform(X_train[validate_index])

        y_fit_normal = scaler.fit_transform(y_train[fit_index])
        y_validate_normal = scaler.transform(y_train[validate_index])

        # Use a dedicated name so the model stored in `pls` by the earlier
        # cell is not silently replaced by a per-fold model.
        pls_cv = PLSRegression(n_components = n_comp)
        pls_cv.fit(X_fit_normal, y_fit_normal)

        local_fit_MSE.append(mean_squared_error(y_fit_normal, pls_cv.predict(X_fit_normal)))
        local_validate_MSE.append(mean_squared_error(y_validate_normal, pls_cv.predict(X_validate_normal)))

    fit_MSE.append(np.mean(local_fit_MSE))
    validate_MSE.append(np.mean(local_validate_MSE))