# 数据处理模块

## 目录

* 词向量导入
* 数据集加载
* 构建word2id并pad成相同长度
* 求词向量均值和方差
* 生成词向量
* 打乱数据集
* 生成训练集、验证集和测试集

In [1]:
from torch.utils import data
import os
import random
import numpy as np
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

In [2]:
# Load the pretrained word vectors (binary word2vec format).
word2vec_path = "../GoogleNews-vectors-negative300.bin.gz"
wvmodel = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# Sanity check: display the vector of one word.
wvmodel.get_vector("good")

array([ 0.04052734,  0.0625    , -0.01745605,  0.07861328,  0.03271484,
       -0.01263428,  0.00964355,  0.12353516, -0.02148438,  0.15234375,
       -0.05834961, -0.10644531,  0.02124023,  0.13574219, -0.13183594,
        0.17675781,  0.27148438,  0.13769531, -0.17382812, -0.14160156,
       -0.03076172,  0.19628906, -0.03295898,  0.125     ,  0.25390625,
        0.12695312, -0.15234375,  0.03198242,  0.01135254, -0.01361084,
       -0.12890625,  0.01019287,  0.23925781, -0.08447266,  0.140625  ,
        0.13085938, -0.04516602,  0.06494141,  0.02539062,  0.05615234,
        0.24609375, -0.20507812,  0.23632812, -0.00860596, -0.02294922,
        0.05078125,  0.10644531, -0.03564453,  0.08740234, -0.05712891,
        0.08496094,  0.23535156, -0.10107422, -0.03564453, -0.04736328,
        0.04736328, -0.14550781, -0.10986328,  0.14746094, -0.23242188,
       -0.07275391,  0.19628906, -0.37890625, -0.07226562,  0.04833984,
        0.11914062,  0.06103516, -0.12109375, -0.27929688,  0.05

In [3]:
# Load the dataset: one review per line, positive and negative in separate files.
# `with` closes each file handle deterministically instead of leaking it.
with open("./data/MR/rt-polarity.pos", errors="ignore") as pos_file:
    pos_samples = pos_file.readlines()
with open("./data/MR/rt-polarity.neg", errors="ignore") as neg_file:
    neg_samples = neg_file.readlines()
# Whitespace-tokenise every review; positives come first, then negatives.
# The loop variable is not called `data`, which is the name of the imported
# torch.utils.data module.
datas = [line.split() for line in pos_samples + neg_samples]
# Labels follow the same order: 1 = positive, 0 = negative.
labels = [1] * len(pos_samples) + [0] * len(neg_samples)
print(len(datas), len(labels))

10662 10662


In [4]:
# Preview the first five raw positive lines (still untokenised strings).
pos_samples[:5]

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n',
 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . \n',
 'effective but too-tepid biopic\n',
 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start . \n',
 "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . \n"]

In [5]:
# Preview the first five raw negative lines (still untokenised strings).
neg_samples[:5]

['simplistic , silly and tedious . \n',
 "it's so laddish and juvenile , only teenage boys could possibly find it funny . \n",
 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . \n',
 '[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . \n',
 'a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . \n']

In [7]:
# Preview the first five tokenised samples. The token lists have different
# lengths, so dtype=object is required: NumPy >= 1.24 raises ValueError when
# asked to build an array from ragged nested sequences without it.
np.array(datas[:5], dtype=object)

array([list(['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', "century's", 'new', '"', 'conan', '"', 'and', 'that', "he's", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.']),
       list(['the', 'gorgeously', 'elaborate', 'continuation', 'of', '"', 'the', 'lord', 'of', 'the', 'rings', '"', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co-writer/director', 'peter', "jackson's", 'expanded', 'vision', 'of', 'j', '.', 'r', '.', 'r', '.', "tolkien's", 'middle-earth', '.']),
       list(['effective', 'but', 'too-tepid', 'biopic']),
       list(['if', 'you', 'sometimes', 'like', 'to', 'go', 'to', 'the', 'movies', 'to', 'have', 'fun', ',', 'wasabi', 'is', 'a', 'good', 'place', 'to', 'start', '.']),
       list(['emerges', 'as', 'something', 'rare', ',', 'an', 'issue', 'movie', "that's", 'so', 'honest', 'and', 'keenly

In [8]:
# Build the word -> id vocabulary and pad every sample to one common length.
# Id 0 is reserved for the "<pad>" token; real words get ids in order of
# first appearance.
# default=0 keeps an empty dataset from crashing max().
max_sample_length = max((len(sample) for sample in datas), default=0)
word2id = {"<pad>": 0}
# The loop variable is named `sample`, not `data`, so the imported
# torch.utils.data module is not overwritten at module level.
for i, sample in enumerate(datas):
    ids = []
    for word in sample:
        if word not in word2id:
            word2id[word] = len(word2id)
        ids.append(word2id[word])
    # Right-pad with the <pad> id up to max_sample_length. No sample is longer
    # than that here; to also truncate longer inputs, pad
    # ids[:max_sample_length] instead.
    datas[i] = ids + [0] * (max_sample_length - len(ids))

In [9]:
# Display the length (in tokens) of the longest sample, i.e. the padded length.
max_sample_length

59

In [10]:
# Inspect the first sample after id mapping and padding (trailing 0s are <pad>).
datas[0]

[1,
 2,
 3,
 4,
 5,
 6,
 1,
 7,
 8,
 9,
 10,
 11,
 10,
 12,
 13,
 14,
 15,
 5,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [11]:
# Compute the mean and standard deviation over all components of the
# pretrained vectors of the vocabulary words the pretrained model knows.
pretrained_vectors = []
for word in word2id:
    try:
        pretrained_vectors.append(wvmodel.get_vector(word))
    except KeyError:
        # Word is absent from the pretrained model: leave it out of the
        # statistics. Only KeyError is caught, so other failures still surface.
        continue
# Stack once and reuse for both statistics.
pretrained_vectors = np.array(pretrained_vectors)
mean = np.mean(pretrained_vectors)
std = np.std(pretrained_vectors)
print(mean, std)

-0.001389387 0.17722417


In [12]:
# Build the embedding matrix: one row per vocabulary id.
vocab_size = len(word2id)
# Take the width from the loaded model rather than hard-coding it, so the
# matrix always matches the pretrained vectors.
embed_size = wvmodel.vector_size
# Every row starts as N(mean, std) noise, so words missing from the pretrained
# model get vectors on the same scale as the pretrained ones.
embedding_weights = np.random.normal(mean, std, [vocab_size, embed_size])
# NOTE(review): the "<pad>" row (id 0) also keeps its random initialisation
# unless the pretrained model contains that token - zero it here if the
# downstream model expects an all-zero padding vector.
for word, index in word2id.items():
    try:
        embedding_weights[index, :] = wvmodel.get_vector(word)
    except KeyError:
        # Word is absent from the pretrained model: keep the random row.
        continue

In [13]:
# Display the matrix shape: (vocab_size, embed_size).
embedding_weights.shape

(21402, 300)

In [14]:
# Shuffle samples and labels together so each sample keeps its own label.
# The fixed seed makes the resulting order reproducible.
random.seed(1)
paired = list(zip(datas, labels))
random.shuffle(paired)
shuffled_datas, shuffled_labels = zip(*paired)
# Slice assignment updates the existing list objects in place.
datas[:] = shuffled_datas
labels[:] = shuffled_labels

In [15]:
# Inspect the sample that ends up first after shuffling.
datas[0]

[17,
 86,
 189,
 19748,
 293,
 670,
 9126,
 91,
 1,
 5101,
 35,
 17,
 19749,
 12,
 3834,
 2562,
 2321,
 24,
 640,
 155,
 154,
 168,
 2683,
 35,
 1,
 749,
 3,
 5,
 6,
 19750,
 31,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [16]:
# Build the training, validation and test sets.
k = 0
# k selects which tenth of the data is held out as the test set,
# e.g. k=3 -> train on tenths 0,1,2 + 4-9.

In [17]:
# Training portion: everything outside the k-th tenth of the data.
# The bounds are computed once from len(datas) and shared by both slices, so
# samples and labels are always cut at exactly the same positions.
fold_start = int(k * len(datas) / 10)
fold_end = int((k + 1) * len(datas) / 10)
train_datas = datas[:fold_start] + datas[fold_end:]
train_labels = labels[:fold_start] + labels[fold_end:]


In [18]:
# Validation set: the last 10% of the training portion.
valid_data_start = int(0.9 * len(train_datas))
valid_label_start = int(0.9 * len(train_labels))
valid_datas = np.array(train_datas[valid_data_start:])
valid_labels = np.array(train_labels[valid_label_start:])

In [19]:
# Check the validation set shapes.
print (valid_datas.shape,valid_labels.shape)

(960, 59) (960,)


In [20]:
# Keep only the first 90% of the training portion as the final training set.
# Run this after the validation split: it overwrites train_datas and
# train_labels.
train_data_end = int(0.9 * len(train_datas))
train_label_end = int(0.9 * len(train_labels))
train_datas = np.array(train_datas[:train_data_end])
train_labels = np.array(train_labels[:train_label_end])

In [21]:
# Check the training set shapes.
print (train_datas.shape,train_labels.shape)

(8636, 59) (8636,)


In [22]:
# Test set: the k-th tenth of the data, i.e. the part excluded from training.
test_start = int(k * len(datas) / 10)
test_end = int((k + 1) * len(datas) / 10)
test_datas = np.array(datas[test_start:test_end])
test_labels = np.array(labels[test_start:test_end])

In [23]:
# Check the test set shapes.
print (test_datas.shape,test_labels.shape)

(1066, 59) (1066,)
