## 词频矩阵

In [19]:
# Toy corpus: three short sentences used as the input documents for the
# vectorizer / tokenizer cells that follow. The strings are data, so their
# spelling is reproduced exactly as written.
good_words = [
    "Dr.Liu always smlies, like a Angel.",
    "Dr.Liu always encourages us.",
    "Dr.Liu study harduous.",
]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a bag-of-words vectorizer with scikit-learn's default settings and
# learn its vocabulary from the corpus. The fit call stays as the cell's last
# expression so the notebook still echoes the fitted estimator.
vect = CountVectorizer()
vect.fit(raw_documents=good_words)

CountVectorizer()

In [12]:
# Show the learned token -> feature-index mapping rendered as a string.
str(vect.vocabulary_)

"{'dr': 1, 'liu': 4, 'always': 0, 'smlies': 5, 'encourages': 2, 'us': 7, 'study': 6, 'harduous': 3}"

In [5]:
# Encode every document as token counts using the vocabulary fitted above.
bag_of_words = vect.transform(raw_documents=good_words)

In [6]:
# Print the count matrix as a dense array so every entry is visible.
print(f"Dense representation of bag_of_words:\n{bag_of_words.toarray()}")

Dense representation of bag_of_words:
[[1 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 1 0 1 0]
 [0 1 0 0 1 1 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1]
 [0 1 1 1 0 0 1 0 0 0 0 1 0 1 0 1 0 0 1 0 1]]


In [None]:
## 词向量

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

In [22]:
# Step 1: let the Keras Tokenizer scan the corpus and collect word statistics.
# num_words is the tokenizer's vocabulary cap (10000 is far above this corpus).
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts=good_words)

In [23]:
# Show the word -> integer-id dictionary the tokenizer has built.
word_index = tokenizer.word_index
print(f"Word Index:\n {word_index}")

Word Index:
 {'dr': 1, 'liu': 2, 'always': 3, 'smlies': 4, 'like': 5, 'a': 6, 'angel': 7, 'encourages': 8, 'us': 9, 'study': 10, 'harduous': 11}


In [24]:
# Step 2: replace every text by the list of integer ids of its words.
X_seq = tokenizer.texts_to_sequences(good_words)
for text_no, id_list in enumerate(X_seq, start=1):
    print(f"Text {text_no}: {id_list}")

Text 1: [1, 2, 3, 4, 5, 6, 7]
Text 2: [1, 2, 3, 8, 9]
Text 3: [1, 2, 10, 11]


In [25]:
# Step 3: pad the id lists so that every row has the same length (10).
X_padded = pad_sequences(X_seq, maxlen=10)
for text_no, padded_row in enumerate(X_padded, start=1):
    print(f"Text {text_no}: {padded_row}")

Text 1: [0 0 0 1 2 3 4 5 6 7]
Text 2: [0 0 0 0 0 1 2 3 8 9]
Text 3: [ 0  0  0  0  0  0  1  2 10 11]


In [26]:
# Step 4: build a small embedding lookup table.
# Imported here so this cell runs on its own after the existing import cell.
from tensorflow.keras.initializers import RandomUniform

embedding_dim = 8  # length of each word vector
vocab_size = len(word_index) + 1  # +1 because id 0 is used for padding, word ids start at 1

# The layer is never trained in this notebook, so the vectors printed below are
# only its initial weights and carry no semantic meaning yet. A seeded
# initializer (same range as the Keras default "uniform") makes the printed
# numbers identical on every Restart & Run All instead of changing per run.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=RandomUniform(minval=-0.05, maxval=0.05, seed=42),
)

# Step 5: look up one vector per id in the padded batch.
# Every padding position (id 0) maps to the same row of the table.
word_vectors = embedding_layer(X_padded)
print("\nWord Vectors:\n", word_vectors.numpy())


Word Vectors:
 [[[ 0.01336945 -0.011541    0.03951642 -0.00440258 -0.04758036
    0.02876765  0.02557026  0.025446  ]
  [ 0.01336945 -0.011541    0.03951642 -0.00440258 -0.04758036
    0.02876765  0.02557026  0.025446  ]
  [ 0.01336945 -0.011541    0.03951642 -0.00440258 -0.04758036
    0.02876765  0.02557026  0.025446  ]
  [-0.00128492  0.01469815  0.03787588  0.0033685   0.00185642
    0.03059173  0.00985556  0.02063062]
  [-0.0035697   0.01458437  0.00620545 -0.03541188 -0.00327229
    0.00602468  0.02485589 -0.03000002]
  [-0.0206798  -0.00884556  0.03863028  0.03695312 -0.02044791
    0.04315678 -0.01211143  0.04339038]
  [-0.02693781  0.00016207  0.00751741 -0.03176061 -0.00227679
   -0.0307631   0.00023896 -0.04539588]
  [-0.01712164  0.00401653 -0.00774818  0.00575099  0.0158647
    0.00116154  0.02828337  0.04447753]
  [-0.04150128 -0.02030344 -0.00447232 -0.01492323  0.01567868
   -0.01110536 -0.00017142  0.0133341 ]
  [-0.02256172  0.00382816  0.03481703 -0.01615246  0.0074

词向量将词转换为低维的数值表示，能够捕捉词之间的语义关系。例如，“国王”与“王后”的词向量距离较近，反映了它们的相似性。

In [16]:
import numpy as np
from gensim.models import Word2Vec

In [17]:
# Corpus for the Word2Vec example.
# NOTE: this rebinds `good_words`, replacing the three-sentence corpus defined
# earlier in the notebook; cells above must be re-run to get the old value back.
good_words = [
    "Liu Laoshi plays Honor of Kings like a pro.",
    "Liu Laoshi is an inspiring teacher who motivates every student.",
    "Every lesson with Liu Laoshi feels insightful and engaging.",
    "Liu Laoshi explains complex topics with clarity and patience.",
    "Liu Laoshi provides excellent feedback, helping us improve constantly."
]

# Punctuation trimmed from both ends of each token. Without this, a plain
# whitespace split keeps "pro.", "student." and "feedback," as vocabulary
# entries distinct from the bare words.
EDGE_PUNCTUATION = ".,;:!?\"'()"

# Split each sentence into lowercase word tokens, dropping any token that
# consisted of punctuation only.
sentences = [
    [
        token
        for token in (raw.strip(EDGE_PUNCTUATION) for raw in sentence.lower().split())
        if token
    ]
    for sentence in good_words
]

In [18]:
# Train a Word2Vec model on the tokenized sentences.
# A fixed seed together with a single worker thread removes the run-to-run
# variation caused by random initialization and thread scheduling, so the
# vectors printed below can be reproduced. (Across interpreter launches gensim
# additionally requires PYTHONHASHSEED to be set for full determinism.)
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

In [19]:
# Print the learned vector of every word in the model's vocabulary.
for word in model.wv.key_to_index.keys():
    print("Word: {}, Vector: {}".format(word, model.wv[word]))

Word: liu, Vector: [-5.4152746e-04  2.4861563e-04  5.1052310e-03  9.0103801e-03
 -9.2982231e-03 -7.1136281e-03  6.4664804e-03  8.9803375e-03
 -5.0147907e-03 -3.7611199e-03  7.3831533e-03 -1.5409441e-03
 -4.5366776e-03  6.5523218e-03 -4.8643127e-03 -1.8112848e-03
  2.8807358e-03  9.8713813e-04 -8.2992222e-03 -9.4646905e-03
  7.3194434e-03  5.0821393e-03  6.7641940e-03  7.6786842e-04
  6.3343514e-03 -3.3976692e-03 -9.4715215e-04  5.7539861e-03
 -7.5227031e-03 -3.9408500e-03 -7.5083165e-03 -9.3771110e-04
  9.5429718e-03 -7.3200152e-03 -2.3362017e-03 -1.9355882e-03
  8.0793714e-03 -5.9339995e-03  3.3638858e-05 -4.7603222e-03
 -9.6035441e-03  5.0112470e-03 -8.7615205e-03 -4.3904181e-03
 -3.3265238e-05 -2.9940644e-04 -7.6628155e-03  9.6026901e-03
  4.9884995e-03  9.2392452e-03 -8.1669223e-03  4.4845799e-03
 -4.1428441e-03  8.2110753e-04  8.5058417e-03 -4.4635222e-03
  4.5257551e-03 -6.7900373e-03 -3.5461441e-03  9.3959663e-03
 -1.5791378e-03  3.1225846e-04 -4.1300301e-03 -7.6802974e-03
 -1.5