In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short English sentences shared by the vectorizer examples.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Naive tokenization: str.split() with no argument cuts on whitespace only,
# so punctuation stays attached to the tokens and letter case is preserved.
tokenized = [doc.split() for doc in corpus]

# enumerate() pairs each token list with its 0-based position; the label
# printed for the reader is 1-based.
for i, words in enumerate(tokenized):
    print(f"文档 {i+1} 分词结果: {words}")

# CountVectorizer turns raw text into term-frequency vectors: each entry is
# the number of times a vocabulary word occurs in a document.
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and encode every document in one call.
X = vectorizer.fit_transform(corpus)

# Pull out the pieces to display: the learned vocabulary and a dense copy of
# the counts (X itself is a sparse matrix; toarray() expands it so the
# individual values can be read).
vocabulary = vectorizer.get_feature_names_out()
dense_counts = X.toarray()

print("词汇表：", vocabulary)
print("转换后的矩阵：\n", dense_counts)
print('-' * 50)
# Printing the sparse matrix directly lists only its stored entries.
print(X)


文档 1 分词结果: ['This', 'is', 'the', 'first', 'document.']
文档 2 分词结果: ['This', 'document', 'is', 'the', 'second', 'document.']
文档 3 分词结果: ['And', 'this', 'is', 'the', 'third', 'one.']
文档 4 分词结果: ['Is', 'this', 'the', 'first', 'document?']
词汇表： ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
转换后的矩阵：
 [[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
--------------------------------------------------
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 21 stored elements and shape (4, 9)>
  Coords	Values
  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer turns raw text into TF-IDF feature vectors; TF-IDF is a
# statistic that rates how important a word is to a document within a corpus.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and weights, then encode every document in one call.
# NOTE: `corpus` is defined in the earlier CountVectorizer cell, so that cell
# must be run first.
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Pull out the pieces to display: the learned vocabulary and a dense copy of
# the weight matrix.
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
dense_tfidf = X_tfidf.toarray()

print("TF-IDF词汇表：", tfidf_vocabulary)
print("TF-IDF矩阵：\n", dense_tfidf)
print('-' * 50)
# Printing the sparse matrix directly lists only its stored entries.
print(X_tfidf)


TF-IDF词汇表： ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
TF-IDF矩阵：
 [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
--------------------------------------------------
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21 stored elements and shape (4, 9)>
  Coords	Values
  (0, 8)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 1)	0.6876235979836938
  (1, 5)	0.5386476208856763
  (2, 8)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 6)	0.26710