In [None]:
# Train the word2vec embeddings
import os
import re

import numpy as np
import pandas as pd
import argparse
from gensim.models import word2vec

# Import the logging module to produce nicely formatted output messages
import logging

from hanziconv import HanziConv

# Configure the root logger to emit timestamped messages at INFO level and above.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)


def load_training_data(filepath):
    """Read a labelled sentence-pair file and split each sentence into characters.

    The file is parsed as tab-separated text with pandas' default header
    handling, so its first line is consumed as the header row. The columns are
    then renamed to ``sentence1``, ``sentence2`` and ``label``, which means the
    file must contain exactly three columns.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        A tuple ``(x1, x2, y)``. ``x1`` and ``x2`` are lists holding one list
        of characters per row (surrounding whitespace stripped first); ``y``
        is the flat list of values from the label column.
    """
    frame = pd.read_csv(filepath, sep="\t")
    frame.columns = ['sentence1', 'sentence2', 'label']

    first = [list(text.strip()) for text in frame['sentence1']]
    second = [list(text.strip()) for text in frame['sentence2']]
    labels = list(frame['label'])

    # Preview of (at most) the first 100 character-split sentences.
    print(first[:100])
    return first, second, labels


def load_testing_data(path="../data/LCQMC_test.csv"):
    """Read an unlabelled sentence-pair file and split each sentence into characters.

    The previous implementation ignored ``path`` and always read
    ``../data/LCQMC_test.csv``. The argument is now honoured, and the default
    is set to that file so calls without arguments keep reading the same data.

    The file is parsed as tab-separated text with pandas' default header
    handling (the first line is consumed as the header row). Malformed rows
    are dropped with a warning instead of aborting the load. The columns are
    renamed to ``sentence1`` and ``sentence2``, so the parsed file must
    contain exactly two columns.

    Args:
        path: Path of the tab-separated file to read.

    Returns:
        A tuple ``(x1, x2)`` of lists holding one list of characters per row
        (surrounding whitespace stripped first).
    """
    try:
        # `on_bad_lines` replaced `error_bad_lines` in pandas 1.3; the old
        # keyword was removed in pandas 2.0. "warn" matches the old
        # behaviour of error_bad_lines=False (drop the row, emit a warning).
        frame = pd.read_csv(path, sep="\t", on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; fall back to the old keyword.
        frame = pd.read_csv(path, sep="\t", error_bad_lines=False)
    frame.columns = ['sentence1', 'sentence2']

    x1 = [list(text.strip()) for text in frame['sentence1']]
    x2 = [list(text.strip()) for text in frame['sentence2']]
    return x1, x2

def get_word_list(query):
    """Split a query into tokens: one token per Chinese character, one per other word.

    The query is stripped, converted from Traditional to Simplified Chinese
    with ``HanziConv.toSimplified`` and lower-cased. It is then cut on every
    run of non-word characters (whitespace, punctuation, ...), and each
    resulting chunk is further split so that every character in the range
    U+4E00-U+9FA5 becomes its own token, while runs of other word characters
    (Latin letters, digits, underscore) stay together.

    Args:
        query: The sentence to tokenise.

    Returns:
        The list of non-blank tokens in their original order.
    """
    # Runs of anything that is not a word character act as separators.
    separator = re.compile(r'[\W]+')
    # The capturing group makes split() keep each matched Chinese character
    # as an item of its own in the result.
    han_char = re.compile(r'([\u4e00-\u9fa5])')

    simplified = HanziConv.toSimplified(query.strip())
    pieces = []
    for chunk in separator.split(simplified.lower()):
        # re.split always returns a list (never None), so no fallback branch
        # is needed here.
        pieces.extend(han_char.split(chunk))
    # Splitting leaves empty strings between adjacent matches; drop them.
    return [piece for piece in pieces if piece.strip()]

# path_prefix = "../model/"
def train_word2vec(x):
    """Train a word2vec embedding on the given corpus and return the model.

    The model uses 300-dimensional vectors, a context window of 5, ignores
    tokens seen fewer than 5 times, trains for 10 epochs on a single worker
    thread, and is created with ``sg=1`` (skip-gram in gensim).

    Args:
        x: Corpus handed straight to gensim's ``Word2Vec``; the callers in
            this file pass a list of per-sentence character lists.

    Returns:
        The ``Word2Vec`` model built from ``x``.
    """
    settings = dict(
        vector_size=300,
        window=5,
        min_count=5,
        workers=1,
        epochs=10,  # called `iter` in gensim versions before 4.0
        sg=1,
    )
    return word2vec.Word2Vec(x, **settings)

# Train and save the embedding only when no saved model exists yet.
# The check must look at the same path the model is saved to; the earlier
# check for "w2v_all.model" in the current directory never matched the saved
# file, so the model was retrained on every run.
W2V_MODEL_PATH = "models/w2v_all300.model"
if not os.path.isfile(W2V_MODEL_PATH):
    print("加载训练数据")
    train_x1, train_x2, train_y = load_training_data("../data/LCQMC_train.csv")
    dev_x1, dev_x2, dev_y = load_training_data("../data/LCQMC_dev.csv")

    print("加载测试数据集")
    test_x1, test_x2 = load_testing_data("../data/LCQMC_test.csv")
    # Embedding training does not use labels, so labelled and unlabelled
    # sentences are all fed in: the more text, the better.
    model = train_word2vec(train_x1 + train_x2 + dev_x1 + dev_x2 + test_x1 + test_x2)
    print("保存模型")
    # Make sure the target directory exists before saving into it.
    os.makedirs(os.path.dirname(W2V_MODEL_PATH), exist_ok=True)
    model.save(W2V_MODEL_PATH)

加载训练数据


2022-01-19 22:32:10,292 : INFO : collecting all words and their counts
2022-01-19 22:32:10,292 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-01-19 22:32:10,303 : INFO : PROGRESS: at sentence #10000, processed 107311 words, keeping 2875 word types
2022-01-19 22:32:10,315 : INFO : PROGRESS: at sentence #20000, processed 213005 words, keeping 3293 word types
2022-01-19 22:32:10,327 : INFO : PROGRESS: at sentence #30000, processed 319467 words, keeping 3570 word types
2022-01-19 22:32:10,339 : INFO : PROGRESS: at sentence #40000, processed 425997 words, keeping 3749 word types


[['我', '手', '机', '丢', '了', '，', '我', '想', '换', '个', '手', '机'], ['大', '家', '觉', '得', '她', '好', '看', '吗'], ['求', '秋', '色', '之', '空', '漫', '画', '全', '集'], ['晚', '上', '睡', '觉', '带', '着', '耳', '机', '听', '音', '乐', '有', '什', '么', '害', '处', '吗', '？'], ['学', '日', '语', '软', '件', '手', '机', '上', '的'], ['打', '印', '机', '和', '电', '脑', '怎', '样', '连', '接', '，', '该', '如', '何', '设', '置'], ['侠', '盗', '飞', '车', '罪', '恶', '都', '市', '怎', '样', '改', '车'], ['什', '么', '花', '一', '年', '四', '季', '都', '开'], ['看', '图', '猜', '一', '电', '影', '名'], ['这', '上', '面', '写', '的', '是', '什', '么', '？'], ['建', '议', '您', '重', '新', '注', '册', '，', '辛', '苦', '您', '了', '。'], ['小', '草', '有', '什', '么', '的', '特', '点', ',', '可', '以', '象', '征', '什', '么', '?'], ['校', '验', '失', '败', '了', '，'], ['尼', '玛', '什', '么', '意', '思'], ['自', '找', '苦', '吃', '的', '地', '方', '是', '哪', '儿', '？'], ['尾', '号', '4', '位', '多', '少'], ['谢', '文', '东', '能', '在', '哪', '里', '看'], ['新', '概', '念', '英', '语', '第', '二', '册', '练', '习', '册', '4', '1', '课', '答', '案'], ['过', '年

2022-01-19 22:32:10,355 : INFO : PROGRESS: at sentence #50000, processed 532237 words, keeping 3887 word types
2022-01-19 22:32:10,368 : INFO : PROGRESS: at sentence #60000, processed 639394 words, keeping 3997 word types
2022-01-19 22:32:10,380 : INFO : PROGRESS: at sentence #70000, processed 746367 words, keeping 4102 word types
2022-01-19 22:32:10,393 : INFO : PROGRESS: at sentence #80000, processed 852661 words, keeping 4165 word types
2022-01-19 22:32:10,406 : INFO : PROGRESS: at sentence #90000, processed 959552 words, keeping 4247 word types
2022-01-19 22:32:10,418 : INFO : PROGRESS: at sentence #100000, processed 1065777 words, keeping 4312 word types
2022-01-19 22:32:10,431 : INFO : PROGRESS: at sentence #110000, processed 1172764 words, keeping 4371 word types
2022-01-19 22:32:10,443 : INFO : PROGRESS: at sentence #120000, processed 1279441 words, keeping 4411 word types
2022-01-19 22:32:10,456 : INFO : PROGRESS: at sentence #130000, processed 1385826 words, keeping 4456 word

In [None]:
## Test the trained embedding
from gensim.models import Word2Vec
# Load the saved word2vec model and list its vocabulary.
w2v_path = "models/w2v_all300.model"

embedding = Word2Vec.load(w2v_path)
print(type(embedding))
print(embedding)
# Iterating the key_to_index mapping yields the vocabulary tokens themselves;
# no positional index is needed.
for word in embedding.wv.key_to_index:
    print(word)