In [1]:
# 自动加载修改过的文件
%load_ext autoreload
%autoreload 2
import os
import sys
import warnings
sys.path.append('../')  # |返回notebook的上一级目录
warnings.filterwarnings("ignore")

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")


In [2]:
from utils.config_gpu import config_gpu
config_gpu()

1 Physical GPUs, 1 Logical GPUs


In [3]:
from utils.config import *
from utils.saveLoader import load_text, get_text, load_dataset

In [4]:
# train_seg, test_seg = load_dataset(TRAIN_SEG, TEST_SEG)
# proc_text = load_text(PROC_TEXT)
# raw_text = load_text(RAW_TEXT)

# **1.损失函数的改进**
- 原本return tf.reduce_mean(loss_)改成tf.reduce_sum(loss_)/tf.reduce_sum(mask)
- 改成了coverage loss
- 

In [5]:
from utils.config import *
from utils.saveLoader import load_train_dataset, load_vocab
import numpy as np
import tensorflow as tf

In [6]:
# 载入数据
train_x,train_y,test_x = load_train_dataset()  # 数据集
vocab,vocab_reversed = load_vocab(VOCAB_PAD)  # vocab
embedding_matrix = np.loadtxt(EMBEDDING_MATRIX_PAD)  # 预训练层

## **1.1原损失函数**

In [7]:
optimizer = tf.keras.optimizers.Adam(name='Adam',learning_rate=0.001)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

pad_index=vocab['<PAD>']
unk_index=vocab['<UNK>']

def loss_function(real, pred):
    pad_mask = tf.math.equal(real, pad_index)
    unk_mask = tf.math.equal(real, unk_index)
    # <PAD> 和 <UNK> 的损失都不算
    mask = tf.math.logical_not(tf.math.logical_or(pad_mask,unk_mask))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    #return tf.reduce_mean(loss_)
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

## **1.2模型测试**

In [8]:
params = {}
params["vocab_size"] = len(vocab)
params["max_enc_len"] = train_x.shape[1]  # 260
params["max_dec_len"] = train_y.shape[1]  # 33
params["embed_size"] = embedding_matrix.shape[1]
params["enc_units"] = 256
params["attn_units"] = 10
params["dec_units"] = params["enc_units"]
params["batch_size"] = 32
params["epochs"] = 2
print(params)

{'vocab_size': 32247, 'max_enc_len': 460, 'max_dec_len': 52, 'embed_size': 300, 'enc_units': 256, 'attn_units': 10, 'dec_units': 256, 'batch_size': 32, 'epochs': 2}


### 创建数据集

In [9]:
sample_num = train_x.shape[0]
dataset = tf.data.Dataset.from_tensor_slices((train_x[:sample_num], train_y[:sample_num])).shuffle(params["batch_size"]*2+1)
dataset = dataset.batch(params["batch_size"], drop_remainder=True)
steps_per_epoch = sample_num//params["batch_size"]

In [10]:
inp, targ = next(iter(dataset))

In [11]:
# input shape (batch_size, enc_len)
# targ shape (batch_size, dec_len)
inp.shape, targ.shape

(TensorShape([32, 460]), TensorShape([32, 52]))

### 进行encoder的运算

In [12]:
from seq2seq import Seq2Seq
model = Seq2Seq(params)

In [13]:
enc_output, enc_hidden = model.call_encoder(inp)
# enc_output shape (batch_size, enc_len, enc_unit) 所有时间步的输出
# enc_hidden shape (batch_size, enc_unit) 最后一个时间步的输出
enc_output.shape, enc_hidden.shape

(TensorShape([32, 460, 256]), TensorShape([32, 256]))

In [16]:
dec_hidden = enc_hidden

### 生成第一个decoder的输入

In [17]:
# dec_input shape (batch_size, 1) 一个词一个词输入decoder
dec_input = tf.expand_dims([vocab['<START>']] * params["batch_size"], 1)
dec_input.shape

TensorShape([32, 1])

### 使用模型预测

In [21]:
# predictions shape (batch_size, dec_len - 1, vocab_size)
# dec_hidden shape (batch_size, dec_unit) same as enc_hidden
# attentions 51个元素的列表 每个元素shape[32, 460, 1]
predictions, dec_hidden, attentions = model(dec_input, dec_hidden, enc_output, targ)
predictions.shape, dec_hidden.shape, len(attentions), attentions[0].shape

(TensorShape([32, 51, 32247]),
 TensorShape([32, 256]),
 51,
 TensorShape([32, 460, 1]))

### ps:模拟下model.call()关于注意力的操作

In [20]:
batch_loss = loss_function(targ[:, 1:], predictions)
batch_loss

<tf.Tensor: id=9541, shape=(), dtype=float32, numpy=10.380057>

## **测试下损失函数加深理解**

In [33]:
real = targ[:, 1:]  # shape (32, 51) (batch_size, dec_len - 1)
pred = predictions  # shape (32, 51, 32247) (batch_size, dec_len - 1, vocab_size)

### 创建mask

In [34]:
pad_index=vocab['<PAD>']
unk_index=vocab['<UNK>']
pad_mask = tf.math.equal(real, pad_index)
unk_mask = tf.math.equal(real, unk_index)
# shape (32, 51) (batch_size, dec_len - 1)
mask = tf.math.logical_not(tf.math.logical_or(pad_mask,unk_mask))
mask

<tf.Tensor: id=27830, shape=(32, 51), dtype=bool, numpy=
array([[ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       ...,
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False]])>

### 计算损失

In [35]:
loss_ = loss_object(real, pred)  # shape (32, 51) (batch_size, dec_len - 1)
mask = tf.cast(mask, dtype=loss_.dtype)
loss_ *= mask # 对应位置相乘
loss_

<tf.Tensor: id=27863, shape=(32, 51), dtype=float32, numpy=
array([[10.381944, 10.381646, 10.386802, ...,  0.      ,  0.      ,
         0.      ],
       [10.382557, 10.391157, 10.363414, ...,  0.      ,  0.      ,
         0.      ],
       [10.385499, 10.38046 , 10.399067, ...,  0.      ,  0.      ,
         0.      ],
       ...,
       [10.366598, 10.37426 , 10.393599, ...,  0.      ,  0.      ,
         0.      ],
       [10.368627, 10.373091, 10.378149, ...,  0.      ,  0.      ,
         0.      ],
       [10.366828, 10.375762, 10.384366, ...,  0.      ,  0.      ,
         0.      ]], dtype=float32)>

### 返回值

In [24]:
tf.reduce_sum(loss_)/tf.reduce_sum(mask)

<tf.Tensor: id=9589, shape=(), dtype=float32, numpy=10.380057>

## **加入coverage loss**

In [49]:
optimizer = tf.keras.optimizers.Adam(name='Adam',learning_rate=0.001)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

pad_index=vocab['<PAD>']
unk_index=vocab['<UNK>']

def loss_function(real, pred):
    pad_mask = tf.math.equal(real, pad_index)
    unk_mask = tf.math.equal(real, unk_index)
    # <PAD> 和 <UNK> 的损失都不算
    mask = tf.math.logical_not(tf.math.logical_or(pad_mask,unk_mask))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    #return tf.reduce_mean(loss_)
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

def coverage_loss_function(real, pred, attn_dists):
    # 先计算原本的损失
    pad_mask = tf.math.equal(real, pad_index)
    unk_mask = tf.math.equal(real, unk_index)
    # <PAD> 和 <UNK> 的损失都不算
    mask = tf.math.logical_not(tf.math.logical_or(pad_mask,unk_mask))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
        
    coverage = tf.zeros_like(attn_dists[0]) # shape (batch_size, attn_length). Initial coverage is zero.
    #covlosses = [] # Coverage loss per decoder timestep. Will be list length max_dec_steps containing shape (batch_size).
    covlosses = []
    for i,a in enumerate(attn_dists):
        covloss = tf.reduce_sum(tf.minimum(a, coverage), [1]) # calculate the coverage loss for this step
        covlosses.append(covloss)
        coverage += a # update the coverage vector
    # coverage_loss = _mask_and_avg(covlosses)
    
    coverage_loss = 0
    for i,_ in enumerate(covlosses):
        covlosses[i] = covlosses[i] * tf.expand_dims(mask[:, i],1)
        coverage_loss += tf.reduce_sum(covlosses[i])

    # print("coverage loss", (coverage_loss/ tf.reduce_sum(mask)).numpy())
    
    return (tf.reduce_sum(loss_)+coverage_loss) / tf.reduce_sum(mask)

@tf.function
def _coverage_loss(attn_dists):
    """
    Calculates the coverage loss from the attention distributions.
    
    attn_dists shape (enc_len, batch_size, dec_len)
    padding_mask 掩码操作

    return: coverage_loss shape 
    """
    coverage = tf.zeros_like(attn_dists[0]) # shape (batch_size, attn_length). Initial coverage is zero.
    #covlosses = [] # Coverage loss per decoder timestep. Will be list length max_dec_steps containing shape (batch_size).
    covlosses = []
    for i,a in enumerate(attn_dists):
        covloss = tf.reduce_sum(tf.minimum(a, coverage), [1]) # calculate the coverage loss for this step
        covlosses.append(covloss)
        coverage += a # update the coverage vector
    # coverage_loss = _mask_and_avg(covlosses)
    
    coverage_loss = 0
    for i,_ in enumerate(covlosses):
        covlosses[i] = covlosses[i] * tf.expand_dims(mask[:, i],1)
        coverage_loss += tf.reduce_sum(covlosses[i])
    
    return coverage_loss

In [50]:
loss_function(targ[:,1:], predictions)

<tf.Tensor: id=28198, shape=(), dtype=float32, numpy=10.381315>

In [51]:
coverage_loss_function(targ[:,1:], predictions, attentions)

<tf.Tensor: id=28963, shape=(), dtype=float32, numpy=11.344197>

In [27]:
attn_dists = attentions
coverage = tf.zeros_like(attn_dists[0])
# coverage_

In [41]:
covlosses = []
for i,a in enumerate(attn_dists):
    covloss = tf.reduce_sum(tf.minimum(a, coverage), [1]) # calculate the coverage loss for this step
    covlosses.append(covloss)
    coverage += a

In [42]:
covlosses[0] = covlosses[0] * tf.expand_dims(mask[:, 0],1)

原始loss 10.380057
coverage loss 0.9570429


<tf.Tensor: id=10516, shape=(), dtype=float32, numpy=11.3371>

In [32]:
coverage = tf.zeros_like(attentions[0])
a = attentions[0]
covloss = tf.reduce_sum(tf.minimum(a, coverage), [1]) 
print(a.shape, coverage.shape)
covloss

(32, 460) (32, 460)


<tf.Tensor: id=10952, shape=(32,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>

In [33]:
covlosses = []
for a in attentions:
    covloss = tf.reduce_sum(tf.minimum(a, coverage), [1]) # calculate the coverage loss for this step
    covlosses.append(covloss)
    coverage += a # update the coverage vector

In [35]:
# for i,loss in enumerate(covlosses):
#     covlosses[i] = covlosses[i] + 1
#     #print(loss)

In [None]:
tt = tf.constant([1,2,3,4,5,6,7,8], shape=(4,2,1))
tt

In [None]:
tf.reshape(tt, tt.shape[:2])

In [None]:
tt[0] *= 2

In [37]:
help(tf.accumulate_n)

AttributeError: module 'tensorflow' has no attribute 'accumulate_n'

In [4]:
from utils.params import get_params

In [14]:
from utils.params import get_params
params = get_params()

usage: ipykernel_launcher.py [-h] [--mode MODE] [--max_enc_len MAX_ENC_LEN]
                             [--max_dec_len MAX_DEC_LEN]
                             [--batch_size BATCH_SIZE] [--epochs EPOCHS]
                             [--vocab_path VOCAB_PATH]
                             [--learning_rate LEARNING_RATE]
                             [--adagrad_init_acc ADAGRAD_INIT_ACC]
                             [--max_grad_norm MAX_GRAD_NORM]
                             [--vocab_size VOCAB_SIZE] [--beam_size BEAM_SIZE]
                             [--embed_size EMBED_SIZE] [--enc_units ENC_UNITS]
                             [--dec_units DEC_UNITS] [--attn_units ATTN_UNITS]
                             [--train_seg_x_dir TRAIN_SEG_X_DIR]
                             [--train_seg_y_dir TRAIN_SEG_Y_DIR]
                             [--test_seg_x_dir TEST_SEG_X_DIR]
                             [--checkpoints_save_steps CHECKPOINTS_SAVE_STEPS]
                             [--min_dec_s

SystemExit: 2

In [18]:
%run tt.py

<class 'dict'>


In [6]:
%tb

SystemExit: 2

AttributeError: 'dict' object has no attribute 'type'

In [17]:
params

NameError: name 'params' is not defined

In [19]:
params

{'mode': 'train',
 'max_enc_len': 200,
 'max_dec_len': 41,
 'batch_size': 64,
 'epochs': 10,
 'vocab_path': 'E:\\GitHub\\QA-abstract-and-reasoning\\data\\wv\\vocab_index_pad.txt',
 'learning_rate': 100000.0,
 'adagrad_init_acc': 0.1,
 'max_grad_norm': 0.8,
 'vocab_size': 31820,
 'beam_size': 3,
 'embed_size': 500,
 'enc_units': 512,
 'dec_units': 512,
 'attn_units': 256,
 'train_seg_x_dir': 'E:\\GitHub\\QA-abstract-and-reasoning\\data\\train_x.npy',
 'train_seg_y_dir': 'E:\\GitHub\\QA-abstract-and-reasoning\\data\\train_y.npy',
 'test_seg_x_dir': 'E:\\GitHub\\QA-abstract-and-reasoning\\data\\test_x.npy',
 'checkpoints_save_steps': 5,
 'min_dec_steps': 4,
 'max_train_steps': 1250,
 'train_pickle_dir': '/opt/kaikeba/dataset/',
 'save_batch_train_data': False,
 'load_batch_train_data': False}

In [None]:
from utils.config import *
from utils.saveLoader import load_train_dataset

In [25]:
import numpy as np
train_x = np.savetxt(TRAIN_X)
train_y = np.savetxt(TRAIN_Y)
test_x = np.savetxt(TEST_X)

In [42]:
np.savetxt(TRAIN_X, train_x, fmt="%d", delimiter=",")
np.savetxt(TRAIN_Y, train_y, fmt="%d", delimiter=",")
np.savetxt(TEST_X, test_x, fmt="%d", delimiter=",")

In [43]:
x,y,z = load_train_dataset()

ValueError: could not convert string to float: '32243,510,1053,0,338,316,0,510,20,28,3,42,118,149,1,20,28,3,338,316,0,510,28,3,510,574,1,6,6,338,29,1053,0,622,46,183,47,11,21,12,62,4,30,1,697,6,10,96,562,3,1825,271,0,9,490,1053,1,102,2,4,752,1053,14,4,1,0,26375,7,523,17,4,0,772,413,4,292,39,458,1053,0,1933,47,4969,316,28,3,169,0,132,51,475,685,5459,3,0,56,90,98,9,11,123,1113,34,413,2230,2,32244,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245,32245'

In [37]:
train_y

array([[32243,   411,   483, ..., 32245, 32245, 32245],
       [32243,   411,   483, ..., 32245, 32245, 32245],
       [32243,   122,    15, ..., 32245, 32245, 32245],
       ...,
       [32243,    55,    36, ..., 32245, 32245, 32245],
       [32243,    55,    36, ..., 32245, 32245, 32245],
       [32243,    55,    36, ..., 32245, 32245, 32245]])