In [1]:
from google.colab import drive
drive.mount('./mount')
# './mount/My Drive/Colab Notebooks/BH/1-2/DL/3/chinese_corpus/'

Mounted at ./mount


In [2]:
# import

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
import numpy as np

import os
import math
import jieba
from collections import Counter


定义函数

In [3]:

def isfloat(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def readcorpus(folder_path):
    sentences = []

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            corpus = file.read()

        # by word
        o_word_tokens = jieba.lcut(corpus)

        # remove irrelevant content
        word_tokens = []
        for word_token in o_word_tokens:
            if len(word_token) == 1 or word_token in ['\n', '。', '？', '！', '，', '；', '：'] or word_token.isdigit() or isfloat(word_token):
                continue
            else:
                word_tokens.append(word_token)

        sentences.append(word_tokens)

    return sentences


main

In [4]:

if __name__ == '__main__':
    # Folder path containing text files
    folder_path = './mount/My Drive/Colab Notebooks/BH/1-2/DL/3/chinese_corpus/'

    # Preprocess files
    sentences = readcorpus(folder_path)

    # 训练Word2Vec模型
    model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

    # 打印词汇表，查看是否包含特定词
    vocab = list(model.wv.key_to_index.keys())
    print("Vocabulary size:", len(vocab))
    print("First 100 words in vocabulary:", vocab[:100])
    print("Is '韦小宝' in vocabulary?", '韦小宝' in vocab)
    print("Is '周牧' in vocabulary?", '周牧' in vocab)

    # 验证词向量的有效性
    if '韦小宝' in vocab and '周牧' in vocab:
        # 1. 计算词向量之间的语义距离
        print("Distance between '韦小宝' and '周牧':", model.wv.distance('韦小宝', '周牧'))

    # 2. 某一类词语的聚类
    word_vectors = model.wv
    word_vectors_list = [word_vectors[word] for word in word_vectors.index_to_key]
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(word_vectors_list)
    print("Top terms per cluster:")
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = word_vectors.index_to_key
    for i in range(5):
        print("Cluster %d:" % i)
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind])

    # 3. 某些段落之间的语义关联（这里示例简化为直接相似度计算）
    paragraph1 = "这时丘处机逝世已久"
    paragraph2 = "她一生爱穿白衣"

    # 分词
    paragraph1_tokens = jieba.lcut(paragraph1)
    paragraph2_tokens = jieba.lcut(paragraph2)

    # 计算段落向量
    paragraph1_vector = np.mean([word_vectors[word] for word in paragraph1_tokens if word in word_vectors.key_to_index], axis=0)
    paragraph2_vector = np.mean([word_vectors[word] for word in paragraph2_tokens if word in word_vectors.key_to_index], axis=0)

    # 计算相似度
    similarity = np.dot(paragraph1_vector, paragraph2_vector) / (np.linalg.norm(paragraph1_vector) * np.linalg.norm(paragraph2_vector))
    print("Similarity between paragraph 1 and paragraph 2:", similarity)


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.905 seconds.
DEBUG:jieba:Loading model cost 0.905 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


Vocabulary size: 155321
First 100 words in vocabulary: ['说道', '自己', '韦小宝', '一个', '咱们', '一声', '武功', '什么', '不是', '心中', '甚么', '他们', '师父', '不知', '知道', '出来', '令狐冲', '心想', '只见', '张无忌', '杨过', '我们', '两人', '只是', '突然', '之中', '不敢', '如此', '众人', '弟子', '不能', '如何', '原来', '你们', '二人', '便是', '郭靖', '这里', '兄弟', '起来', '当下', '这时', '身子', '袁承志', '可是', '脸上', '今日', '见到', '姑娘', '身上', '虽然', '问道', '怎么', '两个', '不过', '功夫', '左手', '伸手', '这个', '却是', '还是', '说话', '倘若', '登时', '眼见', '黄蓉', '喝道', '爹爹', '这些', '之后', '当真', '有人', '正是', '出去', '教主', '不可', '长剑', '跟着', '不会', '过去', '就是', '没有', '这么', '声音', '地下', '之下', '性命', '双手', '手中', '右手', '一招', '陈家洛', '皇帝', '一人', '难道', '自然', '这位', '只怕', '一阵', '一般']
Is '韦小宝' in vocabulary? True
Is '周牧' in vocabulary? True
Distance between '韦小宝' and '周牧': 0.05890929698944092




Top terms per cluster:
Cluster 0:
 什么
 双手
 难道
 一招
 说话
 正是
 声音
 之后
 不能
 爹爹
Cluster 1:
 什么
 双手
 难道
 一招
 正是
 声音
 之后
 说话
 爹爹
 不能
Cluster 2:
 什么
 双手
 难道
 一招
 正是
 声音
 之后
 说话
 爹爹
 不能
Cluster 3:
 什么
 双手
 难道
 一招
 正是
 之后
 声音
 说话
 爹爹
 不能
Cluster 4:
 什么
 双手
 难道
 一招
 正是
 声音
 之后
 说话
 爹爹
 不能
Similarity between paragraph 1 and paragraph 2: 0.9848795
