In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
import jieba
import numpy as np
from sklearn.impute import SimpleImputer

# 1 DictVectorizer示例：将字典列表转换为特征矩阵

In [7]:
# DictVectorizer示例：将字典列表转换为特征矩阵

# 创建一个字典列表，每个字典代表一个样本
data = [
    {'城市': '北京', '温度': 100, '湿度': 60, '风力': '强'},
    {'城市': '上海', '温度': 90, '湿度': 70, '风力': '中'},
    {'城市': '深圳', '温度': 110, '湿度': 80, '风力': '弱'}
]

# 初始化DictVectorizer，设置sparse=False返回密集矩阵而非稀疏矩阵
#Sparse – Whether transform should produce scipy. sparse matrices
# if Sparse is False, This returns a scipy sparse matrix (saves memory for large data).
dict_vec = DictVectorizer(sparse=False)

# 转换数据
#fit_transform 's parameters: X – Dict(s) or Mapping(s) from feature names (arbitrary Python objects) to feature values (strings or convertible to dtype).
"""
fit_transform(X) 
.fit(data) → Look at all the keys in your dicts to learn what features exist.

.transform(data) → Actually convert the dicts into a numeric matrix.
"""
feature_matrix = dict_vec.fit_transform(data)

# 查看特征名称
# get_feature_names_out() – Get output feature names for transformation
feature_names = dict_vec.get_feature_names_out()

print("特征名称:", feature_names)
print("特征矩阵:\n", feature_matrix)
print("特征矩阵形状:", feature_matrix.shape)
print('特征类型',type(feature_matrix))

# 将特征矩阵转换回字典形式
# inverse_transform:Transform array or sparse matrix X back to feature mappings
original_data = dict_vec.inverse_transform(feature_matrix)
print("\n转换回的字典数据:")
for item in original_data:
    print(item)


特征名称: ['城市=上海' '城市=北京' '城市=深圳' '温度' '湿度' '风力=中' '风力=弱' '风力=强']
特征矩阵:
 [[  0.   1.   0. 100.  60.   0.   0.   1.]
 [  1.   0.   0.  90.  70.   1.   0.   0.]
 [  0.   0.   1. 110.  80.   0.   1.   0.]]
特征矩阵形状: (3, 8)
特征类型 <class 'numpy.ndarray'>

转换回的字典数据:
{'城市=北京': 1.0, '温度': 100.0, '湿度': 60.0, '风力=强': 1.0}
{'城市=上海': 1.0, '温度': 90.0, '湿度': 70.0, '风力=中': 1.0}
{'城市=深圳': 1.0, '温度': 110.0, '湿度': 80.0, '风力=弱': 1.0}


# 2CountVectorizer英文示例：将英文文本转换为词频矩阵

In [8]:
# CountVectorizer英文示例：将英文文本转换为词频矩阵

# 准备一些简单的英文文本数据
english_texts = [
    "Machine learning is a branch of artificial intelligence",
    "Deep learning is a method of machine learning",
    "Natural language processing is an important application of artificial intelligence"
]

# 初始化CountVectorizer,空格，标点符号，都认为是分隔符，单个字母，认为没有语义
english_count_vec = CountVectorizer()
print(english_count_vec )
# 转换文本数据为词频矩阵
# fit_transform : will execute both fit and transform on the data. fit will learn and split it into vocabulary; transform: For each sentence, counts how many times each vocabulary word appears
english_X = english_count_vec.fit_transform(english_texts)

# 获取特征名称（词汇表）
# get_feature_names_out() – Get output feature names for transformation
english_vocabulary = english_count_vec.get_feature_names_out()

# 将稀疏矩阵转换为密集矩阵以便于显示
#toarray make us able to see the matrix; otherwise, it's an output with two columns:
# Coords(coordinate: row and column) and Values( how many times the word appears in the document)
english_X_dense = english_X.toarray()

print("英文词汇表:", english_vocabulary)
print("英文词频矩阵:\n", english_X_dense)
print("英文矩阵形状:", english_X_dense.shape)

# 分析结果
print("\n文档-词条矩阵解释:")
# enumerate will give us both the index and the value of each item in the list
for i, doc in enumerate(english_texts):
    print(f"文档 {i+1}: {doc}")
    print("包含的词条:")
    for j, term in enumerate(english_vocabulary):
        if english_X_dense[i, j] > 0:
            print(f"  - '{term}' 出现 {english_X_dense[i, j]} 次")
    print()


CountVectorizer()
英文词汇表: ['an' 'application' 'artificial' 'branch' 'deep' 'important'
 'intelligence' 'is' 'language' 'learning' 'machine' 'method' 'natural'
 'of' 'processing']
英文词频矩阵:
 [[0 0 1 1 0 0 1 1 0 1 1 0 0 1 0]
 [0 0 0 0 1 0 0 1 0 2 1 1 0 1 0]
 [1 1 1 0 0 1 1 1 1 0 0 0 1 1 1]]
英文矩阵形状: (3, 15)

文档-词条矩阵解释:
文档 1: Machine learning is a branch of artificial intelligence
包含的词条:
  - 'artificial' 出现 1 次
  - 'branch' 出现 1 次
  - 'intelligence' 出现 1 次
  - 'is' 出现 1 次
  - 'learning' 出现 1 次
  - 'machine' 出现 1 次
  - 'of' 出现 1 次

文档 2: Deep learning is a method of machine learning
包含的词条:
  - 'deep' 出现 1 次
  - 'is' 出现 1 次
  - 'learning' 出现 2 次
  - 'machine' 出现 1 次
  - 'method' 出现 1 次
  - 'of' 出现 1 次

文档 3: Natural language processing is an important application of artificial intelligence
包含的词条:
  - 'an' 出现 1 次
  - 'application' 出现 1 次
  - 'artificial' 出现 1 次
  - 'important' 出现 1 次
  - 'intelligence' 出现 1 次
  - 'is' 出现 1 次
  - 'language' 出现 1 次
  - 'natural' 出现 1 次
  - 'of' 出现 1 次
  - 'process

In [9]:
# jieba分词示例

# 准备一些中文文本
text = "我爱北京天安门，天安门上太阳升。"

# 默认分词
print("默认分词:") #jieba分词内部有很多分词算法
#
"""The main function that segments an entire sentence that contains Chinese characters into separated words.
Parameter:
sentence: The str(unicode) to be segmented.
cut_all: Model type. True for full pattern, False for accurate pattern."""
seg_list = jieba.cut(text, cut_all=False) #返回的是迭代器
print("默认模式: " + " ".join(seg_list))

# # 全模式分词
# print("\n全模式分词:")
# seg_list = jieba.cut(text, cut_all=True)
# print("全模式: " + "/ ".join(seg_list))

# # 搜索引擎模式
# print("\n搜索引擎模式:")
# seg_list = jieba.cut_for_search(text)
# print("搜索引擎模式: " + "/ ".join(seg_list))

# # 添加自定义词典
# print("\n添加自定义词典:")
# jieba.add_word("天安门上")
# seg_list = jieba.cut(text, cut_all=False)
# print("添加自定义词典后: " + "/ ".join(seg_list))

# # 词性标注
# print("\n词性标注:")
# import jieba.posseg as pseg
# words = pseg.cut(text)
# print("词性标注结果:")
# for word, flag in words:
#     print(f"{word} ({flag})")


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/nt/z2_pm9xj1bl1zz8q88mshrpc0000gn/T/jieba.cache


默认分词:


Loading model cost 0.374 seconds.
Prefix dict has been built successfully.


默认模式: 我 爱 北京 天安门 ， 天安门 上 太阳升 。


In [22]:
# CountVectorizer示例：将文本转换为词频矩阵

# 准备一些简单的中文文本数据
texts = [
    "机器学习是人工智能的一个分支",
    "深度学习是机器学习的一种方法",
    "自然语言处理是人工智能的重要应用"
]

segmented_texts = [" ".join(jieba.cut(text, cut_all=False)) for text in texts]

print(segmented_texts)
# 初始化CountVectorizer
count_vec = CountVectorizer()
# 转换文本数据为词频矩阵
X = count_vec.fit_transform(texts)
# 获取特征名称（词汇表）
vocabulary = count_vec.get_feature_names_out()
# 将稀疏矩阵转换为密集矩阵以便于显示
X_dense = X.toarray()

#using CountVectorizer without applying jieba segmentation first. 
# That means it treats each entire sentence as one word (token), 
# because Chinese does not have spaces like English — so no natural token boundaries.
# each full sentence is treated as a single token (like one long English word).

print("词汇表:", vocabulary)
print('\n',X)
print("词频矩阵:\n", X_dense)
print("矩阵形状:", X_dense.shape)


['机器 学习 是 人工智能 的 一个 分支', '深度 学习 是 机器 学习 的 一种 方法', '自然语言 处理 是 人工智能 的 重要 应用']
词汇表: ['机器学习是人工智能的一个分支' '深度学习是机器学习的一种方法' '自然语言处理是人工智能的重要应用']

 <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3 stored elements and shape (3, 3)>
  Coords	Values
  (0, 0)	1
  (1, 1)	1
  (2, 2)	1
词频矩阵:
 [[1 0 0]
 [0 1 0]
 [0 0 1]]
矩阵形状: (3, 3)


In [16]:
# 使用jieba分词器处理中文
print("\n使用jieba分词的CountVectorizer:")
#analyze function： If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input.
#min_df= minimum document frequency: When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. 
# this means all terms that appear in less than 2 documents(2 documents rather than 2 words in a document)will be ignored.

count_vec_jieba = CountVectorizer(analyzer=lambda x: jieba.cut(x),min_df=2)
# fit_transform Returns ------- X : array of shape (n_samples, n_features) Document-term matrix. so the returned matrix will be passed tp CountVectorizer as input.

"""
# In .fit_transform(), the method first calls _count_vocab()
# Inside _count_vocab(), it calls self.build_analyzer()
# self.build_analyzer() returns a function (_analyze with partial arguments)
# _analyze() checks if an analyzer is provided (not None)
in  _analyze(), we can see:
if analyzer is not None:
    doc = analyzer(doc)

# The processed doc is then returned
return doc

  # Since analyzer is a lambda function, it gets called here
    # The input doc (from texts in fit_transform) is passed into the lambda
    # The lambda function processes the text (e.g., using jieba.cut) and returns the result
"""
X_jieba = count_vec_jieba.fit_transform(texts)

vocabulary_jieba = count_vec_jieba.get_feature_names_out()
#['机器 学习 是 人工智能 的 一个 分支', '深度 学习 是 机器 学习 的 一种 方法', '自然语言 处理 是 人工智能 的 重要 应用']

print("jieba分词词汇表:", vocabulary_jieba)
print("jieba分词词频矩阵:\n", X_jieba.toarray())
print("jieba分词矩阵形状:", X_jieba.toarray().shape)


使用jieba分词的CountVectorizer:
jieba分词词汇表: ['人工智能' '学习' '是' '机器' '的']
jieba分词词频矩阵:
 [[1 1 1 1 1]
 [0 2 1 1 1]
 [1 0 1 0 1]]
jieba分词矩阵形状: (3, 5)


In [33]:
def my_analyzer(x):
    print("当前处理：", x)
    return jieba.cut(x)

vec = CountVectorizer(analyzer=my_analyzer)
vec.fit_transform(texts)

"""
# In .fit_transform(), the method first calls _count_vocab()
# Inside _count_vocab(), it calls self.build_analyzer()
# self.build_analyzer() returns a function (_analyze with partial arguments)
# _analyze() checks if an analyzer is provided (not None)
# in _analyze(), we can see:
if analyzer is not None:
    doc = analyzer(doc)

# The processed doc is then returned
return doc

# Since analyzer is my_analyzer function, it gets called here
# The input doc (from texts in fit_transform) is passed into the my_analyzer
# The my_analyzer function processes the text (e.g., using jieba.cut) and returns the result
"""


当前处理： 机器学习是人工智能的一个分支
当前处理： 深度学习是机器学习的一种方法
当前处理： 自然语言处理是人工智能的重要应用


'\n# In .fit_transform(), the method first calls _count_vocab()\n# Inside _count_vocab(), it calls self.build_analyzer()\n# self.build_analyzer() returns a function (_analyze with partial arguments)\n# _analyze() checks if an analyzer is provided (not None)\nin  _analyze(), we can see:\nif analyzer is not None:\n    doc = analyzer(doc)\n\n# The processed doc is then returned\nreturn doc\n\n  # Since analyzer is a lambda function, it gets called here\n    # The input doc (from texts in fit_transform) is passed into the lambda\n    # The lambda function processes the text (e.g., using jieba.cut) and returns the result\n'

TypeError: say_hello() missing 1 required positional argument: 'name'

# 3 tfidf

In [38]:
#TF-IDF 主要目的：找出词频很高，但又具有独特性的单词（没有在很多文档中出现，仅在部分文档中出现达到了很高词频）
# TF-IDF处理示例：将文本转换为TF-IDF特征矩阵
# 使用jieba分词器进行TF-IDF处理
print("\n使用jieba分词的TF-IDF处理:")
tfidf_vec_jieba = TfidfVectorizer(analyzer=lambda x: jieba.cut(x))
X_tfidf_jieba = tfidf_vec_jieba.fit_transform(texts)
#Get output feature names for transformation.
tfidf_vocabulary_jieba = tfidf_vec_jieba.get_feature_names_out()

print("jieba分词词汇表:", tfidf_vocabulary_jieba)
# print('TF-IDF特征矩阵没有toarray',X_tfidf_jieba)
"""
texts=['机器学习是人工智能的一个分支', '深度学习是机器学习的一种方法', '自然语言处理是人工智能的重要应用']
"""
print("jieba分词TF-IDF特征矩阵:\n", X_tfidf_jieba.toarray())
print("jieba分词矩阵形状:", X_tfidf_jieba.toarray().shape)


使用jieba分词的TF-IDF处理:
jieba分词词汇表: ['一个' '一种' '人工智能' '分支' '处理' '学习' '应用' '方法' '是' '机器' '深度' '的' '自然语言' '重要']
jieba分词TF-IDF特征矩阵:
 [[0.47496141 0.         0.3612204  0.47496141 0.         0.3612204
  0.         0.         0.28051986 0.3612204  0.         0.28051986
  0.         0.        ]
 [0.         0.38955498 0.         0.         0.         0.5925332
  0.         0.38955498 0.23007745 0.2962666  0.38955498 0.23007745
  0.         0.        ]
 [0.         0.         0.3311001  0.         0.43535684 0.
  0.43535684 0.         0.25712876 0.         0.         0.25712876
  0.43535684 0.43535684]]
jieba分词矩阵形状: (3, 14)


# 归一化

In [13]:
# 导入归一化相关的库
import numpy as np

# 创建一个简单的数据矩阵作为示例
print("归一化示例:")
X = np.array([
    [1, -1, 2],
    [2, 0, 0],
    [0, 1, -1],
    [5, 2, 0]
])
print("原始数据矩阵:\n", X)
print("数据矩阵形状:", X.shape)

# MinMaxScaler归一化：将数据缩放到[0,1]区间
min_max_scaler = MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X)
print("\nMinMaxScaler归一化后的数据(缩放到[0,1]):\n", X_minmax)

# 使用MinMaxScaler(feature_range=(-1, 1)），也可以缩放到其他区间，例如[-1,1]
min_max_scaler_custom = MinMaxScaler(feature_range=(-1, 1))
X_minmax_custom = min_max_scaler_custom.fit_transform(X)
print("MinMaxScaler归一化后的数据(缩放到[-1,1]):\n", X_minmax_custom)



归一化示例:
原始数据矩阵:
 [[ 1 -1  2]
 [ 2  0  0]
 [ 0  1 -1]
 [ 5  2  0]]
数据矩阵形状: (4, 3)

MinMaxScaler归一化后的数据(缩放到[0,1]):
 [[0.2        0.         1.        ]
 [0.4        0.33333333 0.33333333]
 [0.         0.66666667 0.        ]
 [1.         1.         0.33333333]]
MinMaxScaler归一化后的数据(缩放到[-1,1]):
 [[-0.6        -1.          1.        ]
 [-0.2        -0.33333333 -0.33333333]
 [-1.          0.33333333 -1.        ]
 [ 1.          1.         -0.33333333]]


# 标准化

In [14]:
# StandardScaler标准化：将数据转换为均值为0，标准差为1的分布
print("\nStandardScaler标准化示例:")
X_example = np.array([[1., -1., 3.],
                      [2., 4., 2.],
                      [4., 6., -1.]])
print("原始数据矩阵:\n", X_example)

# 应用StandardScaler进行标准化
std_scaler = StandardScaler()
X_std = std_scaler.fit_transform(X_example)
print("StandardScaler标准化后的数据:\n", X_std)

# 查看标准化后的均值和标准差
print("标准化后的均值:", X_std.mean(axis=0)) #无限接近0
print("标准化后的标准差:", X_std.std(axis=0))

# 也可以查看原始数据的均值和标准差
print("\n原始数据的均值:", X_example.mean(axis=0))
print("原始数据的标准差:", X_example.std(axis=0))



StandardScaler标准化示例:
原始数据矩阵:
 [[ 1. -1.  3.]
 [ 2.  4.  2.]
 [ 4.  6. -1.]]
StandardScaler标准化后的数据:
 [[-1.06904497 -1.35873244  0.98058068]
 [-0.26726124  0.33968311  0.39223227]
 [ 1.33630621  1.01904933 -1.37281295]]
标准化后的均值: [-1.48029737e-16  7.40148683e-17  7.40148683e-17]
标准化后的标准差: [1. 1. 1.]

原始数据的均值: [2.33333333 3.         1.33333333]
原始数据的标准差: [1.24721913 2.94392029 1.69967317]


In [15]:
# SimpleImputer用于处理缺失值
print("\nSimpleImputer示例:")
# 创建一个包含缺失值的数据矩阵
X_missing = np.array([
    [np.nan, 2, 3],
    [4, np.nan, 6],
    [7, 8, np.nan],
    [np.nan, np.nan, 6]
])
print("包含缺失值的原始数据矩阵:\n", X_missing)

# missing_values : default=np. nan
# 使用均值策略填充缺失值
mean_imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
X_imputed_mean = mean_imputer.fit_transform(X_missing)
print("\n使用均值填充后的数据矩阵:\n", X_imputed_mean)

# 使用中位数策略填充缺失值
median_imputer = SimpleImputer(strategy='median')
X_imputed_median = median_imputer.fit_transform(X_missing)
print("\n使用中位数填充后的数据矩阵:\n", X_imputed_median)

# 使用常数值填充缺失值
constant_imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed_constant = constant_imputer.fit_transform(X_missing)
print("\n使用常数0填充后的数据矩阵:\n", X_imputed_constant)

# 使用最频繁值填充缺失值,众数
most_frequent_imputer = SimpleImputer(strategy='most_frequent')
X_imputed_most_frequent = most_frequent_imputer.fit_transform(X_missing)
print("\n使用最频繁值填充后的数据矩阵:\n", X_imputed_most_frequent)



SimpleImputer示例:
包含缺失值的原始数据矩阵:
 [[nan  2.  3.]
 [ 4. nan  6.]
 [ 7.  8. nan]
 [nan nan  6.]]

使用均值填充后的数据矩阵:
 [[5.5 2.  3. ]
 [4.  5.  6. ]
 [7.  8.  5. ]
 [5.5 5.  6. ]]

使用中位数填充后的数据矩阵:
 [[5.5 2.  3. ]
 [4.  5.  6. ]
 [7.  8.  6. ]
 [5.5 5.  6. ]]

使用常数0填充后的数据矩阵:
 [[0. 2. 3.]
 [4. 0. 6.]
 [7. 8. 0.]
 [0. 0. 6.]]

使用最频繁值填充后的数据矩阵:
 [[4. 2. 3.]
 [4. 2. 6.]
 [7. 8. 6.]
 [4. 2. 6.]]


# 方差阈值

In [45]:
# VarianceThreshold用于特征选择，去除方差低于阈值的特征
print("\nVarianceThreshold示例:")

# 创建一个示例数据矩阵
X_variance = np.array([
    [0, 2, 0, 3],
    [0, 1, 4, 3],
    [0, 1, 1, 3]
])
print("原始数据矩阵:\n", X_variance)

# 计算每个特征的方差
# The default is to compute the variance of the flattened array. 
feature_variances = np.var(X_variance, axis=0)
print("\n各特征的方差:", feature_variances)

# 使用方差阈值为0.8进行特征选择# 这将移除方差小于0.8的特征
selector = VarianceThreshold(threshold=0.8)
X_selected = selector.fit_transform(X_variance)
# 显示保留的特征索引
print("\n保留的特征索引:", selector.get_support(indices=True))
print("保留的特征方差:", feature_variances[selector.get_support()])
print("\n特征选择后的数据矩阵:\n", X_selected)
# 尝试不同的阈值
selector_low = VarianceThreshold(threshold=0.1)
X_selected_low = selector_low.fit_transform(X_variance)
print("\n阈值为0.1时保留的特征索引:", selector_low.get_support(indices=True))
print("阈值为0.1时特征选择后的数据矩阵:\n", X_selected_low)



VarianceThreshold示例:
原始数据矩阵:
 [[0 2 0 3]
 [0 1 4 3]
 [0 1 1 3]]

各特征的方差: [0.         0.22222222 2.88888889 0.        ]

保留的特征索引: [2]
保留的特征方差: [2.88888889]

特征选择后的数据矩阵:
 [[0]
 [4]
 [1]]

阈值为0.1时保留的特征索引: [1 2]
阈值为0.1时特征选择后的数据矩阵:
 [[2 0]
 [1 4]
 [1 1]]


# PCA

In [46]:
# 定义一个理解PCA的示例函数
def understand_pca(data, n_components=None):
    """
    通过示例理解PCA主成分分析
    
    参数:
    data: 输入数据矩阵
    n_components: 要保留的主成分比例，如果是0到1之间的浮点数，表示保留的主成分解释的方差比例。如果是整数，表示保留的列数量。
    
    返回:
    pca模型、转换后的数据、解释方差比例
    """
    import numpy as np
    import matplotlib.pyplot as plt
    
    # 创建PCA模型
    pca = PCA(n_components=n_components)
    
    # 拟合数据并转换
    transformed_data = pca.fit_transform(data)
    
    # 输出原始数据和结果
    print("原始数据:\n", data)
    print("\n数据形状:", data.shape)
    
    # 输出PCA的结果
    print("\nPCA转换后的数据:\n", transformed_data)
    print("\n转换后数据形状:", transformed_data.shape)
    #输出transformed_data的方差
    print("\n转换后数据方差:", np.var(transformed_data, axis=0))
    #输出transformed_data的方差和
    print("\n转换后数据方差和:", np.var(transformed_data, axis=0).sum())
    
    # 主成分解释的方差比例
    print("\n各主成分解释的方差比例:", pca.explained_variance_ratio_)
    print("累计解释的方差比例:", np.sum(pca.explained_variance_ratio_))
    
    
    return pca, transformed_data, pca.explained_variance_ratio_

# 使用上下文中的数据
X = np.array([[2, 8, 4, 5],
              [6, 3, 0, 8],
              [5, 4, 9, 1]])

#输出X的方差
print("\n原始数据方差:", np.var(X, axis=0))
#输出X的方差和
print("\n原始数据方差和:", np.var(X, axis=0).sum())

# 调用函数演示PCA
print("完整PCA示例:")
pca_full, data_full, var_ratio_full = understand_pca(X)

# 降维到2个主成分
print("\n\n降维到2个主成分的PCA示例:")
pca_2d, data_2d, var_ratio_2d = understand_pca(X, n_components=0.9)




原始数据方差: [ 2.88888889  4.66666667 13.55555556  8.22222222]

原始数据方差和: 29.333333333333336
完整PCA示例:
原始数据:
 [[2 8 4 5]
 [6 3 0 8]
 [5 4 9 1]]

数据形状: (3, 4)

PCA转换后的数据:
 [[-9.33473422e-16  3.82970843e+00  3.58835645e-16]
 [-5.74456265e+00 -1.91485422e+00  3.58835645e-16]
 [ 5.74456265e+00 -1.91485422e+00  3.58835645e-16]]

转换后数据形状: (3, 3)

转换后数据方差: [2.20000000e+01 7.33333333e+00 1.62057690e-63]

转换后数据方差和: 29.333333333333332

各主成分解释的方差比例: [7.50000000e-01 2.50000000e-01 4.38964841e-33]
累计解释的方差比例: 1.0


降维到2个主成分的PCA示例:
原始数据:
 [[2 8 4 5]
 [6 3 0 8]
 [5 4 9 1]]

数据形状: (3, 4)

PCA转换后的数据:
 [[-9.33473422e-16  3.82970843e+00]
 [-5.74456265e+00 -1.91485422e+00]
 [ 5.74456265e+00 -1.91485422e+00]]

转换后数据形状: (3, 2)

转换后数据方差: [22.          7.33333333]

转换后数据方差和: 29.333333333333332

各主成分解释的方差比例: [0.75 0.25]
累计解释的方差比例: 1.0
