In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

torch.manual_seed(1)

import json
import os
from itertools import chain

In [4]:
def prepare_sequence(seq, to_ix):
    """Map every item of ``seq`` to its index and return a 1-D long tensor.

    Args:
        seq: iterable of keys (e.g. tokens) to look up.
        to_ix: mapping from key to integer index.

    Returns:
        ``torch.long`` tensor holding ``to_ix[item]`` for each item, in order.

    Raises:
        KeyError: if an item is missing from ``to_ix`` (plain-dict lookup).
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

In [18]:
def prepare_sequence_batch(data, max_len, word_to_ix, tag_to_ix, pad_token='[PAD]'):
    """Encode (tokens, tags) samples as two fixed-width index tensors.

    For every sample the token sequence is flattened with
    ``chain.from_iterable`` (a multi-character string token therefore
    contributes one entry per character). The flattened tokens and the tags
    are then each truncated to ``max_len`` entries and right-padded with
    ``pad_token`` so that every row has exactly ``max_len`` entries.

    Args:
        data: iterable of samples; ``sample[0]`` is the token sequence and
            ``sample[1]`` is the tag sequence.
        max_len: width of every output row.
        word_to_ix: mapping token -> index; must contain ``pad_token``
            whenever padding is needed.
        tag_to_ix: mapping tag -> index; must contain ``pad_token``
            whenever padding is needed.
        pad_token: symbol used for padding both tokens and tags.

    Returns:
        ``(idxs_pad, tags_pad)``: two ``torch.long`` tensors of shape
        ``(number_of_samples, max_len)``.

    Raises:
        KeyError: if a token or tag is missing from its mapping.
    """
    def _fit(items):
        # Truncate first, then pad: the result always has exactly max_len
        # entries regardless of the input length.
        fitted = list(items)[:max_len]
        return fitted + [pad_token] * (max_len - len(fitted))

    token_rows = []
    tag_rows = []
    for sample in data:
        # Measure length AFTER flattening; that is the sequence actually encoded.
        tokens = _fit(chain.from_iterable(sample[0]))
        # Tags are fitted independently so a length mismatch with the tokens
        # can never produce ragged rows.
        tags = _fit(sample[1])
        token_rows.append([word_to_ix[tok] for tok in tokens])
        tag_rows.append([tag_to_ix[tag] for tag in tags])

    # reshape keeps the (n, max_len) contract even when there are no samples.
    idxs_pad = torch.tensor(token_rows, dtype=torch.long).reshape(len(token_rows), max_len)
    tags_pad = torch.tensor(tag_rows, dtype=torch.long).reshape(len(tag_rows), max_len)
    return idxs_pad, tags_pad

In [9]:
# Notebook-wide configuration. Only max_len is referenced by the cells visible
# here (it is passed to prepare_sequence_batch); the other values are
# presumably consumed by later model/training cells — TODO confirm.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
# NOTE(review): prepare_sequence_batch and the label-loading cell both use the
# literal "[PAD]", not this "<PAD>" value — confirm which spelling is intended.
PAD_TAG = "<PAD>"
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
epoch = 10
bs = 100
# Fixed row width used when padding/truncating sequences.
max_len = 100
# NOTE(review): empty and not referenced in the visible cells.
datafile = ""

In [10]:
# Load the annotated corpus. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale encoding.
with open("./People's_Daily/wdata.json", encoding="utf-8") as f:
    data = json.load(f)

# Positional 80/10/10 split (no shuffling is done here): the first 80% is
# train, the next 10% is test and the final 10% is validation.
n_samples = len(data)
train_end = int(0.8 * n_samples)
test_end = int(0.9 * n_samples)

train_data = data[:train_end]
test_data = data[train_end:test_end]
valid_data = data[test_end:]

In [11]:
# Load the vocabulary. The encoding is given explicitly so that decoding does
# not depend on the platform's default locale encoding.
with open("./People's_Daily/vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)

In [13]:
# Load the tag inventory. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale encoding.
with open("./People's_Daily/label.json", encoding="utf-8") as f:
    label = json.load(f)

# prepare_sequence_batch pads tag sequences with "[PAD]", so that symbol needs
# an id. The guard avoids a duplicate entry if the file already lists it.
if "[PAD]" not in label:
    label.append("[PAD]")

In [14]:
# Two-way lookup between vocabulary entries and their position in `vocab`.
# If an entry occurs more than once, word2id keeps its last position.
id2word = dict(enumerate(vocab))
word2id = {token: idx for idx, token in id2word.items()}

In [15]:
# Two-way lookup between tag names and their position in `label`.
# If a tag occurs more than once, label2id keeps its last position.
label2id = dict(zip(label, range(len(label))))
id2label = dict(enumerate(label))

In [19]:
# Encode every split with the shared vocabulary/tag tables and cache the
# resulting tensors on disk. One loop replaces three copy-pasted build/save
# cell pairs; the output file names are unchanged.
# NOTE(review): the files are written to the working directory, while a later
# cell loads "./People's_Daily/train_data.pt" — confirm the files are moved
# there or align the two paths.
encoded_splits = {}
for split_name, split_samples in (
    ("train", train_data),
    ("test", test_data),
    ("valid", valid_data),
):
    idxs_pad, tags_pad = prepare_sequence_batch(split_samples, max_len, word2id, label2id)
    # Keep each split addressable by name instead of only the last one built.
    encoded_splits[split_name] = (idxs_pad, tags_pad)
    torch.save((idxs_pad, tags_pad), f"./{split_name}_data.pt")

In [3]:
# Reload cached training tensors and unpack them into token ids and tag ids
# (presumably the tuple written by the torch.save cell — TODO confirm).
# NOTE(review): this reads from "./People's_Daily/", whereas the save cell
# writes "./train_data.pt" to the working directory — confirm the file was
# moved, or align the paths.
# NOTE: torch.load is pickle-based; only load files you produced yourself.
idxs_pad, tags_pad = torch.load("./People's_Daily/train_data.pt")

tensor([[1940,  339,   91,  ...,    0,    0,    0],
        [ 872, 1535,  872,  ...,    0,    0,    0],
        [4151, 3567, 3580,  ...,    0,    0,    0],
        ...,
        [ 316, 3472, 2893,  ..., 4418, 3285, 1570],
        [ 316, 3472, 1985,  ..., 4239, 4029, 3567],
        [ 316, 3472, 1007,  ...,  316, 3472, 1007]])