# Use generated data to compare PCR and PLSR

## 1. Preparation

In [6]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from source.scale import scale
from source.sim import sim
from model.pcr_class import PCR
from model.plsr_class import PLSR

In [7]:
# Report, for each model, the best number of components and the matching minimum error
def best_result_report(pcr_error, plsr_error):
    """Summarise the lowest-error row of the PCR and PLSR error tables.

    Only the first column of each table is examined.

    Parameters
    ----------
    pcr_error, plsr_error : pandas.DataFrame
        Error tables; the first column holds the error values to compare.

    Returns
    -------
    pandas.DataFrame
        Two rows indexed 'pcr' and 'plsr' with columns '# of components'
        (index label of the minimum, plus one) and 'err_mean' (the minimum).
    """
    def _best(error_table):
        errors = error_table.iloc[:, 0]
        # The label of the minimum is shifted by one to give a component count;
        # this assumes rows are labelled 0, 1, 2, ... -- TODO confirm with all_error().
        return errors.idxmin() + 1, errors.min()

    pcr_n, pcr_min = _best(pcr_error)
    plsr_n, plsr_min = _best(plsr_error)

    summary = {
        '# of components': [pcr_n, plsr_n],
        'err_mean': [pcr_min, plsr_min],
    }
    return pd.DataFrame(summary, index=['pcr', 'plsr'])

In [8]:
def comparison(beta0, beta1, p=10, rho=0.5, n_train=1000, n_test=100, seed=None):
    """Fit PCR and PLSR on simulated data and compare their test errors.

    Parameters
    ----------
    beta0, beta1
        Passed unchanged to ``sim`` as its last two arguments.
    p : int
        Number of features; also the length of the mean vector ``mu`` and of
        the feature-name list handed to both models.
    rho : float
        Passed unchanged to ``sim``.
    n_train, n_test : int
        Sample sizes for the training and test simulations. The defaults
        reproduce the previously hard-coded 1000 / 100.
    seed : int or None
        When given, NumPy's global RNG is seeded before any random draw.
        ``norm.rvs`` uses that global state by default; ``sim`` is assumed to
        do the same -- TODO confirm. ``None`` leaves the RNG untouched.

    Returns
    -------
    tuple
        ``(output, pcr_error, plsr_error)``: the summary from
        ``best_result_report`` plus the two error tables returned by each
        model's ``all_error`` on the test data.
    """
    if seed is not None:
        np.random.seed(seed)

    # One mean vector is drawn and shared by the training and the test sample.
    mu = norm.rvs(size=p, scale=1)
    x_train, y_train = sim(n_train, p, rho, mu, beta0, beta1)
    x_test, y_test = sim(n_test, p, rho, mu, beta0, beta1)
    names = list(range(p))

    # PCR: fit on the training data, then evaluate on the test data
    pcr_model = PCR(x_train, y_train, names, is_scale=True, is_var_exp=True)
    pcr_model.pcr()
    pcr_error = pcr_model.all_error(x_test, y_test)

    # PLSR: same procedure
    plsr_model = PLSR(x_train, y_train, names, is_scale=True, is_var_exp=True)
    plsr_model.plsr()
    plsr_error = plsr_model.all_error(x_test, y_test)

    # Summarise the best setting for each model
    output = best_result_report(pcr_error, plsr_error)

    return output, pcr_error, plsr_error

## 2. Model comparison under changing parameters

### 2.1 Change correlation

In [9]:
# 20 features, ρ = 0.2, β = 0.5
beta0 = 0.5
beta1 = np.full(20, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=20, rho=0.2)
compare

Unnamed: 0,# of components,err_mean
pcr,19,0.914148
plsr,4,0.913902


In [10]:
# 20 features, ρ = 0.5, β = 0.5
beta0 = 0.5
beta1 = np.full(20, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=20, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,17,0.888086
plsr,3,0.901222


In [11]:
# 10 features (not 20 like the other runs in this section), ρ = 0.8, β = 0.5
# TODO(review): confirm whether p=20 was intended; the recorded output below comes from p=10
beta0 = 0.5
beta1 = np.full(10, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.8)
compare

Unnamed: 0,# of components,err_mean
pcr,5,1.010118
plsr,2,1.002697


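Conclusion: it seems that as the **correlation** between variables increases, the number of selected components decreases. (Note that the ρ = 0.8 run above uses 10 features rather than 20, so part of the drop may come from the smaller number of features.)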

### 2.2 Fixed correlation, change p (# of variables)

In [12]:
# 20 features, ρ = 0.5, β = 0.5
beta0 = 0.5
beta1 = np.full(20, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=20, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,8,1.077221
plsr,2,1.089707


In [13]:
# 50 features, ρ = 0.5, β = 0.5
beta0 = 0.5
beta1 = np.full(50, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=50, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,34,0.800413
plsr,2,0.815886


In [14]:
# 100 features, ρ = 0.5, β = 0.5
beta0 = 0.5
beta1 = np.full(100, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=100, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,64,0.738692
plsr,4,0.773934


Conclusion: as the number of variables grows, the number of components selected by PCR increases significantly, while the number selected by PLSR stays small
- With 20 variables, PLSR selected 2 components and PCR selected 8
- With 100 variables, PLSR still stays below 10 components (4 here), while PCR selected 64

### 2.3 Change β

In [45]:
# 10 features, ρ = 0.5, β = 0.01
beta0 = 1
beta1 = np.full(10, 0.01)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,10,0.86889
plsr,5,0.868676


In [15]:
# 10 features, ρ = 0.5, β = 0.1
beta0 = 1
beta1 = np.full(10, 0.1)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,6,1.013254
plsr,1,1.01874


In [16]:
# 10 features, ρ = 0.5, β = 0.5
beta0 = 1
beta1 = np.full(10, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,8,0.910174
plsr,4,0.913461


In [17]:
# 10 features, ρ = 0.5, β = 1
beta0 = 1
beta1 = np.ones(10, dtype=float)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,5,0.830095
plsr,2,0.832555


> Conclusion: the change of β has little effect on the number of components