In [1]:
import gensim
import json
import numpy as np
import pandas as pd
import torch
from torch import nn


In [2]:
def get_data(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    locale encoding, so non-ASCII content loads identically on every OS.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def get_words(data):
    """Collect the ``'words'`` entry of every record in ``data``, in order."""
    words_per_record = []
    for record in data:
        words_per_record.append(record['words'])
    return words_per_record
def word_to_numpy(words_list):
    """Convert every element of ``words_list`` to a NumPy array.

    Returns a plain Python list holding one array per input element.
    """
    return [np.array(words) for words in words_list]
def seq_pad(sent_list, seq_size=150, rng=None):
    """Bring every sequence to exactly ``seq_size`` elements and stack them.

    Each sequence is repeated whole as many times as fits into ``seq_size``;
    the remaining slots are filled with elements drawn at random (with
    replacement) from that same sequence.

    NOTE(review): a sequence longer than ``seq_size`` fits zero whole copies,
    so its entire output row is a random draw and the original element order
    is lost. Confirm this is intended; truncation may be what is wanted.

    Args:
        sent_list: Iterable of non-empty 1-D array-likes.
        seq_size: Length of every output row.
        rng: Optional random source exposing ``choice`` (for example
            ``np.random.default_rng(seed)``) for reproducible filler. When
            omitted, the global ``np.random`` state is used, as before.

    Returns:
        ``np.ndarray`` of shape ``(len(sent_list), seq_size)``.

    Raises:
        ValueError: If any sequence is empty (there is nothing to repeat or
            sample from), or if ``sent_list`` is empty (raised by
            ``np.stack``).
    """
    sampler = np.random if rng is None else rng
    padded = []
    for position, sent in enumerate(sent_list):
        if len(sent) == 0:
            raise ValueError(
                "seq_pad: sequence at position %d is empty and cannot be padded" % position
            )
        whole_copies, remainder = divmod(seq_size, len(sent))
        repeated = np.tile(sent, whole_copies)
        # `remainder` may be 0, in which case the filler is an empty array.
        filler = sampler.choice(sent, remainder)
        padded.append(np.concatenate([repeated, filler], axis=0))
    return np.stack(padded)
def get_w2v_model(words_list):
    """Fit and return a gensim ``Word2Vec`` model on ``words_list``."""
    # Training settings kept in one place: 100-dimensional vectors, no
    # minimum-frequency cut-off (min_count=1), and sg=1.
    w2v_params = dict(vector_size=100, min_count=1, sg=1)
    return gensim.models.Word2Vec(words_list, **w2v_params)
def word_to_vec(words_np, w2v_model):
    """Turn each word sequence into an ``int32`` matrix of scaled word vectors.

    Every word is looked up in ``w2v_model.wv``; the stacked vectors of a
    sequence are multiplied by 400 and cast to ``int32``.

    Args:
        words_np: Iterable of word sequences.
        w2v_model: Object whose ``wv`` attribute maps a word to its vector.

    Returns:
        A list with one ``int32`` array per input sequence.
    """
    scaled_sequences = []
    for sequence in words_np:
        tokens = list(sequence)
        stacked = np.array([w2v_model.wv[token] for token in tokens])
        scaled_sequences.append((stacked * 400).astype(np.int32))
    return scaled_sequences

In [3]:
def get_word_dict(file_path):
    """Load the JSON mapping at ``file_path`` and return it with keys and values swapped.

    When several keys share a value, the key encountered last wins.
    """
    return {val: key for key, val in get_data(file_path).items()}
def word_to_num(words_list, word_to_ix: dict):
    """Replace every word by its entry in ``word_to_ix``.

    Returns a list with one NumPy array per sentence. A word that is missing
    from ``word_to_ix`` raises ``KeyError``.
    """
    return [
        np.array([word_to_ix[token] for token in sentence])
        for sentence in words_list
    ]
def get_batch(_data, _label, batch_size):
    """Split padded sequences and their labels into aligned, sequence-first batches.

    Fix over the previous version: it iterated over one full batch fewer than
    available and then appended the last ``batch_size`` rows, so whenever the
    row count was not a multiple of ``batch_size`` the rows in between were
    silently dropped. Every row now appears in at least one batch.

    Args:
        _data: ``np.ndarray`` of shape ``(n, seq_len)``.
        _label: ``np.ndarray`` holding one label per row of ``_data``.
        batch_size: Positive number of rows per batch.

    Returns:
        ``(res_data, res_label)``: two lists of equal length. Each data batch
        has shape ``(seq_len, b, 1)``; each label batch is a float tensor with
        a trailing singleton axis (``(b, 1)`` for 1-D labels). When ``n`` is
        not a multiple of ``batch_size`` the final batch consists of the last
        ``batch_size`` rows, overlapping the previous batch so that every batch
        stays full-size; if ``n < batch_size`` the single batch holds all
        ``n`` rows. Empty input yields two empty lists.

    Raises:
        ValueError: If ``batch_size`` is not positive or the number of labels
            differs from the number of data rows.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    data = torch.from_numpy(_data)
    label = torch.from_numpy(_label)
    if len(data) != len(label):
        raise ValueError("data and label must contain the same number of rows")

    n_rows = len(data)
    # Start/stop offsets of every full batch.
    bounds = [
        (start, start + batch_size)
        for start in range(0, n_rows - batch_size + 1, batch_size)
    ]
    if n_rows % batch_size:
        # Cover the leftover rows with one more batch anchored at the end.
        bounds.append((max(n_rows - batch_size, 0), n_rows))

    # (b, seq) -> (b, seq, 1) -> (seq, b, 1)
    res_data = [data[lo:hi].unsqueeze(-1).permute(1, 0, 2) for lo, hi in bounds]
    res_label = [label[lo:hi].unsqueeze(-1).float() for lo, hi in bounds]
    return res_data, res_label
def get_labels(data):
    """Gather the ``'label'`` entry of every record in ``data`` into a NumPy array."""
    collected = []
    for record in data:
        collected.append(record['label'])
    return np.array(collected)

In [4]:
# Load the raw records and split them into labels and word sequences.
file_path = '../data/data.json'
data = get_data(file_path)
labels = get_labels(data)
words_list = get_words(data)
# Mapping used by word_to_num to turn each word into a number.
word_to_ix = get_data('../data/word_to_ix.json')
# Disabled alternative: keep the words as NumPy string arrays.
# word_np = word_to_numpy(words_list)
# Encode the words as numbers, then pad every sequence to seq_pad's default length.
word_num = word_to_num(words_list,word_to_ix)
word_num_padded = seq_pad(word_num)
# Train Word2Vec on the raw word sequences.
w2v_model = get_w2v_model(words_list)
# Disabled alternative: feed integer-scaled Word2Vec vectors directly.
# word_vector = word_to_vec(word_np,w2v_model)
# word_vector
# Inverse of the mapping loaded above (its values become the keys).
word_dict = get_word_dict('../data/word_to_ix.json')
# NOTE(review): a bare expression in the middle of a cell is not displayed;
# only the last expression of the cell is shown.
word_dict
# Batches of 5 rows each; see get_batch for the tensor layout.
data_set = get_batch(word_num_padded,labels,batch_size=5)
data_set

([tensor([[[ 0],
           [19],
           [24],
           [39],
           [24]],
  
          [[ 1],
           [20],
           [25],
           [ 9],
           [25]],
  
          [[ 2],
           [21],
           [26],
           [20],
           [26]],
  
          [[ 3],
           [ 9],
           [27],
           [40],
           [27]],
  
          [[ 4],
           [16],
           [28],
           [41],
           [28]],
  
          [[ 5],
           [22],
           [29],
           [42],
           [29]],
  
          [[ 6],
           [23],
           [30],
           [43],
           [30]],
  
          [[ 7],
           [19],
           [24],
           [44],
           [24]],
  
          [[ 8],
           [20],
           [31],
           [39],
           [45]],
  
          [[ 9],
           [21],
           [32],
           [ 9],
           [32]],
  
          [[10],
           [ 9],
           [33],
           [20],
           [33]],
  
          [[11],
    

In [12]:
class lstm(nn.Module):
    """Two-layer bidirectional LSTM classifier over sequences of token indices.

    Pipeline: embedding lookup -> 2-layer bidirectional LSTM -> concatenation
    of the last two entries of the final hidden state -> dropout -> linear
    layer with one output -> sigmoid.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, w2v_model, words_dict):
        """Create the layers.

        Args:
            vocab_size: Number of rows of the embedding table; every token
                index passed to ``forward`` must be smaller than this.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Hidden size of the LSTM, per direction.
            w2v_model: Stored on the instance as ``self.w2v_model``; not used
                by ``forward``.
            words_dict: Stored on the instance as ``self.words_dict``; not
                used by ``forward``.
        """
        super().__init__()

        # Word embedding: token index => [embedding_dim]
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.words_dict = words_dict
        self.w2v_model = w2v_model
        # [embedding_dim] => [hidden_dim] per direction
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                           bidirectional=True, dropout=0.5)
        # [hidden_dim*2] => [1]
        self.fc = nn.Linear(hidden_dim*2, 1)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid output per batch element.

        Args:
            x: Integer tensor of token indices, either ``[seq_len, b]`` or
                ``[seq_len, b, 1]``. The trailing singleton axis is removed
                before the embedding lookup; otherwise the lookup would yield
                a 4-D tensor, which ``nn.LSTM`` does not accept.

        Returns:
            Tensor of shape ``[b, 1]``.
        """
        # [seq, b, 1] => [seq, b]
        if x.dim() == 3 and x.size(-1) == 1:
            x = x.squeeze(-1)

        # [seq, b] => [seq, b, embedding_dim]
        embedding = self.embedding(x)

        # hidden: [num_layers*2, b, hid_dim]; the per-step outputs and the
        # cell state are not needed.
        _, (hidden, _) = self.rnn(embedding)

        # last two entries of hidden, each [b, hid_dim] => [b, hid_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)

        # [b, hid_dim*2] => [b, 1]
        hidden = self.dropout(hidden)
        return self.sigmoid(self.fc(hidden))


In [17]:
def get_all_word(word_to_ix: dict):
    """Return the keys of ``word_to_ix`` as a list, in the mapping's own order.

    The previous version assigned the keys to a misspelled local name and
    then returned ``words_list``, a name that is not defined inside the
    function, so the argument was ignored.
    """
    return list(word_to_ix.keys())

In [21]:
def load_pretrained_embedding(word_dict, w2v_model, embedding_dim=100):
    """Build an embedding matrix whose row ``i`` is the vector of the word with index ``i``.

    The previous version looked up the *keys* of ``word_dict`` in
    ``w2v_model.wv`` and wrote the results in enumeration order. For an
    index -> word mapping those keys are integers, and gensim 4's
    ``KeyedVectors`` treats an integer key as a position in its own
    vocabulary, so the rows did not line up with the mapping. Vectors are now
    always looked up by word and stored at the word's index.

    Args:
        word_dict: Mapping between words and integer indices, in either
            direction (index -> word or word -> index). The direction is
            detected per item: a ``str`` key is taken to be the word.
        w2v_model: Object whose ``wv`` attribute maps a word to a NumPy
            vector of length ``embedding_dim``.
        embedding_dim: Number of columns of the returned matrix.

    Returns:
        Float tensor of shape ``(max_index + 1, embedding_dim)``, or
        ``(0, embedding_dim)`` for an empty mapping. Rows whose word is
        missing from ``w2v_model.wv``, and indices absent from ``word_dict``,
        stay zero.
    """
    indexed_words = []
    for key, value in word_dict.items():
        if isinstance(key, str):
            word, index = key, value
        else:
            index, word = key, value
        indexed_words.append((int(index), word))

    num_rows = max((index for index, _ in indexed_words), default=-1) + 1
    emb = torch.zeros([num_rows, embedding_dim])
    for index, word in indexed_words:
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            # No pretrained vector for this word: keep the zero row.
            continue
        emb[index, :] = torch.from_numpy(vec)
    return emb


In [22]:
# Build the pretrained matrix first so the model can be sized from it.
pre_emb = load_pretrained_embedding(word_dict,w2v_model)
# vocab_size and embedding_dim are taken from the matrix: the previous
# hard-coded vocab_size of 2000 was smaller than the vocabulary, which makes
# the embedding lookup fail for the higher token indices.
model = lstm(pre_emb.size(0),pre_emb.size(1),1,w2v_model,word_dict)
# Initialise the embedding layer with the Word2Vec vectors. The in-place copy
# into a parameter has to run with autograd disabled.
with torch.no_grad():
    model.embedding.weight.copy_(pre_emb)

In [23]:
# Sanity check of the pretrained matrix: (number of rows, embedding_dim).
pre_emb.size()

torch.Size([2858, 100])

In [1]:
import json
# Stand-alone check of how many records the dataset holds. The file is decoded
# explicitly as UTF-8 so the result does not depend on the platform's locale
# encoding.
with open('../data/data.json','r',encoding='utf-8') as f:
    data = json.load(f)
len(data)

1116