# 用模拟数据比较PCR和PLSR

## 1. 准备工作

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm

from source.scale import scale
from source.sim import sim
from model.pcr_class import PCR
from model.plsr_class import PLSR

In [2]:
def best_result_report(pcr_error, plsr_error):
    """Report the best component count and the smallest error per model.

    Only the first column of each argument is inspected. The reported
    component count is the index label of the smallest entry plus one.

    Parameters
    ----------
    pcr_error, plsr_error : pandas.DataFrame
        Error tables for PCR and PLSR.
        NOTE(review): presumably row ``i`` holds the error obtained with
        ``i + 1`` components (0-based integer index) -- confirm against
        ``all_error``.

    Returns
    -------
    pandas.DataFrame
        Rows ``'pcr'`` and ``'plsr'``; columns ``'# of components'`` and
        ``'err_mean'``.
    """
    model_labels = ['pcr', 'plsr']
    first_columns = [pcr_error.iloc[:, 0], plsr_error.iloc[:, 0]]

    # idxmin() yields the index label of the minimum; +1 turns it into a count.
    component_counts = [(column.idxmin() + 1) for column in first_columns]
    smallest_errors = [column.min() for column in first_columns]

    summary = {
        '# of components': component_counts,
        'err_mean': smallest_errors,
    }
    return pd.DataFrame(summary, index=model_labels)

In [6]:
def comparison(beta0, beta1, p=10, rho=0.5, n_train=1000, n_test=100):
    """Fit PCR and PLSR on simulated data and compare their test errors.

    One mean vector ``mu`` of length ``p`` is drawn with ``norm.rvs`` and
    reused for both the training and the test simulation, so both data sets
    come from ``sim`` called with identical parameters apart from the
    sample size.

    Parameters
    ----------
    beta0 : forwarded unchanged to ``sim`` (the notebook passes a scalar).
    beta1 : forwarded unchanged to ``sim`` (the notebook passes an array of
        length ``p``).
    p : int
        Number of variables; also the length of ``mu`` and of the variable
        name list handed to the models.
    rho : forwarded unchanged to ``sim``; the notebook describes it as the
        correlation coefficient between variables.
    n_train : int
        Number of training observations requested from ``sim``
        (default 1000, the previously hard-coded value).
    n_test : int
        Number of test observations requested from ``sim``
        (default 100, the previously hard-coded value).

    Returns
    -------
    tuple
        ``(output, pcr_error, plsr_error)`` where ``output`` is the summary
        built by ``best_result_report`` and the other two are whatever
        ``all_error`` returns for the PCR and the PLSR model respectively.
    """
    # Shared mean vector for the simulated predictors.
    mu = norm.rvs(size=p, scale=1)

    # Simulated training and test data (same parameters, different sizes).
    x_train, y_train = sim(n_train, p, rho, mu, beta0, beta1)
    x_test, y_test = sim(n_test, p, rho, mu, beta0, beta1)

    # Variables are simply named 0 .. p-1.
    names = list(range(p))

    # PCR error on the test set
    pcr1 = PCR(x_train, y_train, names, is_scale=True, is_var_exp=True)
    pcr1.pcr()
    pcr_error = pcr1.all_error(x_test, y_test)

    # PLSR error on the test set
    plsr1 = PLSR(x_train, y_train, names, is_scale=True, is_var_exp=True)
    plsr1.plsr()
    plsr_error = plsr1.all_error(x_test, y_test)

    # Summarise the best component count and error for each model.
    output = best_result_report(pcr_error, plsr_error)

    return output, pcr_error, plsr_error

## 2. 模型比较

### 2.1 改变ρ的大小

In [23]:
# 20 variables, rho = 0.2, every slope coefficient equal to 0.5
beta0 = 0.5
beta1 = np.full(20, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=20, rho=0.2)
compare

Unnamed: 0,# of components,err_mean
pcr,20,1.169064
plsr,11,1.169064


In [25]:
# 20 variables, rho = 0.5, every slope coefficient equal to 0.5
beta0 = 0.5
beta1 = np.full(20, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=20, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,20,1.036938
plsr,9,1.036869


In [26]:
# 10 variables, rho = 0.8, beta = 0.5
# NOTE(review): the two preceding cells in this section use p=20, whereas this
# cell uses p=10, so its component counts are not directly comparable with
# theirs -- confirm whether p=20 was intended here.
beta0, beta1 = 0.5, 0.5 * np.ones(10, dtype=float)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.8)
compare

Unnamed: 0,# of components,err_mean
pcr,10,0.901291
plsr,9,0.901291


结论：随着变量之间的**相关系数增大**，所选出的成分数量越来越少

### 2.2 改变p的大小（变量个数）

In [42]:
# 20 variables, rho = 0.5, every slope coefficient equal to 0.5
beta0 = 0.5
beta1 = np.full(20, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=20, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,7,1.5223
plsr,2,1.55351


In [43]:
# 50 variables, rho = 0.5, every slope coefficient equal to 0.5
beta0 = 0.5
beta1 = np.full(50, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=50, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,17,0.762844
plsr,3,0.784744


In [44]:
# 100 variables, rho = 0.5, every slope coefficient equal to 0.5
beta0 = 0.5
beta1 = np.full(100, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=100, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,73,0.879095
plsr,4,0.898751


结论：随着变量数增多，主成分回归选出的成分数开始显著多于偏最小二乘回归
- 当变量为20个时，偏最小二乘回归选出了2个成分，而主成分回归选出了7个
- 当变量达到100个时，偏最小二乘回归仍稳定在10个以内（4个），主成分回归却达到了73个

### 2.3 改变β的大小

In [45]:
# 10 variables, rho = 0.5, every slope coefficient equal to 0.01
beta0 = 1
beta1 = np.full(10, 0.01)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,10,0.86889
plsr,5,0.868676


In [46]:
# 10 variables, rho = 0.5, every slope coefficient equal to 0.1
beta0 = 1
beta1 = np.full(10, 0.1)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,4,0.735293
plsr,1,0.741542


In [48]:
# 10 variables, rho = 0.5, every slope coefficient equal to 0.5
beta0 = 1
beta1 = np.full(10, 0.5)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,6,0.793092
plsr,4,0.799774


In [49]:
# 10 variables, rho = 0.5, every slope coefficient equal to 1
beta0 = 1
beta1 = np.ones(10, dtype=float)
compare, pcr_error, plsr_error = comparison(beta0, beta1, p=10, rho=0.5)
compare

Unnamed: 0,# of components,err_mean
pcr,8,1.110821
plsr,4,1.138068


> 结论：改变β对成分选取个数的影响并不大