# 数据处理模块

## 目录

* 词向量导入
* 数据集加载
* 构建word2id并pad成相同长度
* 求词向量均值和方差
* 生成词向量
* 生成训练集、验证集和测试集

In [1]:
from torch.utils import data
import os
import random
import numpy as np
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

In [2]:
# 词向量导入
wvmodel = KeyedVectors.load_word2vec_format("../GoogleNews-vectors-negative300.bin.gz",binary=True)
wvmodel.get_vector("good")

FileNotFoundError: [Errno 2] No such file or directory: '../GoogleNews-vectors-negative300.bin.gz'

In [3]:
# 数据集加载
pos_samples = open("./data/MR/rt-polarity.pos",errors="ignore").readlines()
neg_samples = open("./data/MR/rt-polarity.neg",errors="ignore").readlines()
datas = pos_samples+neg_samples
datas = [data.split() for data in datas]
labels = [1]*len(pos_samples)+[0]*len(neg_samples)
print (len(datas),len(labels))

10662 10662


In [4]:
pos_samples[:5]

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n',
 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . \n',
 'effective but too-tepid biopic\n',
 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start . \n',
 "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . \n"]

In [5]:
neg_samples[:5]

['simplistic , silly and tedious . \n',
 "it's so laddish and juvenile , only teenage boys could possibly find it funny . \n",
 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . \n',
 '[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . \n',
 'a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . \n']

In [6]:
np.array(datas[:5])

  np.array(datas[:5])


array([list(['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', "century's", 'new', '"', 'conan', '"', 'and', 'that', "he's", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.']),
       list(['the', 'gorgeously', 'elaborate', 'continuation', 'of', '"', 'the', 'lord', 'of', 'the', 'rings', '"', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co-writer/director', 'peter', "jackson's", 'expanded', 'vision', 'of', 'j', '.', 'r', '.', 'r', '.', "tolkien's", 'middle-earth', '.']),
       list(['effective', 'but', 'too-tepid', 'biopic']),
       list(['if', 'you', 'sometimes', 'like', 'to', 'go', 'to', 'the', 'movies', 'to', 'have', 'fun', ',', 'wasabi', 'is', 'a', 'good', 'place', 'to', 'start', '.']),
       list(['emerges', 'as', 'something', 'rare', ',', 'an', 'issue', 'movie', "that's", 'so', 'honest', 'and', 'keenly

In [7]:
# 构建word2id并pad成相同长度
max_sample_length = max([len(sample) for sample in datas])
word2id = {"<pad>":0}
for i,data in enumerate(datas):
    for j,word in enumerate(data):
        if word2id.get(word)==None:
            word2id[word] = len(word2id)
        datas[i][j] = word2id[word]
    datas[i] = datas[i]+[0]*(max_sample_length-len(datas[i])) #将所有句子pad成max_sample_length的长度
    #datas[i] = datas[i][0:max_sample_length]+[0]*(max_sample_length-len(datas[i]))  #包含截断的写法

In [8]:
max_sample_length

59

In [9]:
datas[0]

[1,
 2,
 3,
 4,
 5,
 6,
 1,
 7,
 8,
 9,
 10,
 11,
 10,
 12,
 13,
 14,
 15,
 5,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [10]:
# 求词向量均值和方差
tmp = []
for word, index in word2id.items():
    try:
        tmp.append(wvmodel.get_vector(word))
    except:
        pass
mean = np.mean(np.array(tmp))
std = np.std(np.array(tmp))
print (mean,std)

nan nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [11]:
# 生成词向量
vocab_size = len(word2id)
embed_size = 300
#embedding_weights = np.random.normal(-0.0016728516,0.17756976,[vocab_size,embed_size])
embedding_weights = np.random.normal(mean,std,[vocab_size,embed_size])
for word, index in word2id.items():
    try:
        embedding_weights[index, :] = wvmodel.get_vector(word)
    except:
        pass

In [12]:
embedding_weights.shape

(21384, 300)

In [13]:
# 打乱数据集
c = list(zip(datas,labels))
random.seed(1)
random.shuffle(c)
datas[:],labels[:] = zip(*c)

In [14]:
datas[0]

[17,
 86,
 189,
 19734,
 293,
 670,
 9120,
 91,
 1,
 5099,
 35,
 17,
 19735,
 12,
 3833,
 2562,
 2321,
 24,
 640,
 155,
 154,
 168,
 2683,
 35,
 1,
 749,
 3,
 5,
 6,
 19736,
 31,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [15]:
# 生成训练集、验证集和测试集
k = 0
# ；k=3 0,1,2+4-9

In [16]:
train_datas = datas[:int(k * len(datas) / 10)] + datas[int((k + 1) * len(datas) / 10):]
train_labels = labels[:int(k * len(datas) / 10)] + labels[int((k + 1) * len(labels) / 10):]


In [17]:
valid_datas = np.array(train_datas[int(0.9 * len(train_datas)):])
valid_labels = np.array(train_labels[int(0.9 * len(train_labels)):])

In [18]:
print (valid_datas.shape,valid_labels.shape)

(960, 59) (960,)


In [19]:
train_datas = np.array(train_datas[0:int(0.9*len(train_datas))])
train_labels = np.array(train_labels[0:int(0.9*len(train_labels))])

In [20]:
print (train_datas.shape,train_labels.shape)

(8636, 59) (8636,)


In [21]:
test_datas = np.array(datas[int(k * len(datas) / 10):int((k + 1) * len(datas) / 10)])
test_labels = np.array(labels[int(k * len(datas) / 10):int((k + 1) * len(datas) / 10)])

In [22]:
print (test_datas.shape,test_labels.shape)

(1066, 59) (1066,)
