In [1]:
from numpy import *

In [2]:
def loadDataSet(fileName, delim='\t'):
    '''
    Load a delimited text file of numbers into a NumPy matrix.

    Parameters:
        fileName: path of the text file; every non-blank line is one row.
        delim:    field separator used inside a line (default: a tab).

    Returns:
        A numpy matrix of floats with one row per non-blank line.

    Raises:
        OSError if the file cannot be opened; ValueError if a field
        cannot be parsed by float().
    '''
    # The context manager closes the file even if parsing fails later.
    with open(fileName) as fr:
        stripped = (line.strip() for line in fr)
        # Blank lines (e.g. a trailing newline at end of file) would otherwise
        # yield [''] and make float('') raise, so they are skipped.
        stringArr = [line.split(delim) for line in stripped if line]
    datArr = [list(map(float, fields)) for fields in stringArr]
    # asmatrix is the function `mat` used to alias; `mat` was removed in NumPy 2.0.
    return asmatrix(datArr)

## PCA
* 通过沿着数据最大方差方向旋转坐标轴来实现
    * 选择方差最大的方向作为第一条坐标轴
    * 后续坐标轴选择与前面坐标轴正交，且方差最大的方向

In [3]:
def pca(dataMat, topNfeat=9999999):
    '''
    Reduce the dimensionality of a data set with principal component analysis.

    Parameters:
        dataMat:  data with one sample per row and one feature per column
                  (a numpy matrix, or anything asmatrix() accepts).
        topNfeat: number of principal components to keep; the default is
                  large enough to keep every component.

    Returns:
        lowDDataMat: the mean-centred data projected onto the kept components.
        reconMat:    the data reconstructed from lowDDataMat in the original
                     feature space (projection mapped back, mean added again).
    '''
    dataMat = asmatrix(dataMat)                  # so that `*` below is a matrix product
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    covMat = cov(meanRemoved, rowvar=0)          # rowvar=0: columns are the variables
    # A covariance matrix is symmetric, so use the symmetric solver: eigh
    # returns real eigenvalues and orthonormal eigenvectors, whereas the general
    # solver eig may return complex values for (near-)degenerate eigenvalues.
    eigVals, eigVects = linalg.eigh(asmatrix(covMat))
    eigVects = asmatrix(eigVects)
    eigValInd = argsort(eigVals)                 # indices of eigenvalues, ascending
    eigValInd = eigValInd[:-(topNfeat + 1):-1]   # walk backwards: the topNfeat largest first
    redEigVects = eigVects[:, eigValInd]         # columns are the kept eigenvectors
    lowDDataMat = meanRemoved * redEigVects      # project the data into the new space
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

In [4]:
def replaceNanWithMean(fileName='data/secom.data', delim=' '):
    '''
    Load a data file and replace every NaN with the mean of its column.

    Parameters:
        fileName: path of the data file passed to loadDataSet
                  (default: 'data/secom.data').
        delim:    field separator passed to loadDataSet (default: a space).

    Returns:
        The loaded matrix with NaN entries overwritten by the mean of the
        non-NaN entries of the same column. A column that contains only NaN
        has no such mean and is left unchanged.
    '''
    datMat = loadDataSet(fileName, delim)
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        nanMask = isnan(datMat[:, i].A)
        nanRows = nonzero(nanMask)[0]            # row indices holding NaN in column i
        validRows = nonzero(~nanMask)[0]         # row indices holding a number in column i
        # Nothing to fill, or nothing to average: skip the column. Skipping the
        # all-NaN case avoids a "mean of empty slice" warning; values stay NaN.
        if nanRows.size == 0 or validRows.size == 0:
            continue
        datMat[nanRows, i] = mean(datMat[validRows, i])
    return datMat

In [9]:
# Repeat the first steps of pca() by hand to look at the eigen-decomposition.
datMat = replaceNanWithMean()
meanVals = mean(datMat, axis=0)
meanRemoved = datMat - meanVals
covMat = cov(meanRemoved, rowvar=0)
# asmatrix is the function `mat` used to alias; `mat` was removed in NumPy 2.0.
eigVals, eigVects = linalg.eig(asmatrix(covMat))
print(eigVects)                              # inspect the eigenvectors (one per column)
print(len(eigVects))                         # number of rows of the eigenvector matrix

[[-6.39070760e-04 -1.20314234e-04  1.22460363e-04 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 2.35722934e-05 -6.60163227e-04  1.71369126e-03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 2.36801459e-04  1.58026311e-04  3.28185512e-04 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 2.61329351e-08 -6.06233975e-09  1.09328336e-09 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 5.62597732e-09  5.96647587e-09  8.83024927e-09 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 3.89298443e-04 -2.32070657e-04  7.13534990e-04 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
590
