In [0]:
data_folder = 'drive/text_summarization/data'
train_history_filename = "train.history.pkl"
train_weights_filename = "train_weights.hdf5"

In [4]:
import os
import pickle
import keras
import warnings
from keras.optimizers import Adam
warnings.filterwarnings('ignore')
keras.__version__

Using TensorFlow backend.


'2.1.6'

In [0]:
vocabulary_embeddings_path = os.path.join(data_folder, 'vocabulary-embedding')

In [0]:
maxlend=25 # 0 - if we dont want to use description at all
maxlenh=25
maxlen = maxlend + maxlenh
rnn_size = 128 # was 512 must be same as 160330-word-gen
rnn_layers = 3  # was 3 - match weights_filename (FN1)
batch_norm=False

In [0]:
activation_rnn_size = 40 if maxlend else 0

In [0]:
# training parameters
seed=42
p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0
optimizer = 'adam'
learning_rate = 1e-4 # 1e-4
batch_size=64
nflips=0 # was 10 , need to try different 

In [0]:
with open("{}.pkl".format(vocabulary_embeddings_path), "rb") as fp:
    embedding, idx2word, word2idx, glove_idx2idx = pickle.load(fp)
vocab_size, embedding_size = embedding.shape

In [7]:
with open(train_history_filename, "rb") as fp:
    history = pickle.load(fp)


FileNotFoundError: ignored

In [0]:
with open("{}.data.pkl".format(vocabulary_embeddings_path), "rb") as fp:
    X, Y = pickle.load(fp)

In [0]:
nb_unknown_words = 100 # was 10

In [0]:
for i in range(nb_unknown_words):
    idx2word[vocab_size - i - 1] = "<{}>".format(i)

In [0]:
# mark oov words with "^" in the end
oov0 = vocab_size - nb_unknown_words
for i in range(oov0, len(idx2word)):
    idx2word[i] = idx2word[i] + '^'

In [20]:
# this is done only because of our restriction on GPU capacity
from sklearn.cross_validation import train_test_split

reduce_sample_size = 10 # 100 worked ok - this are our previious weights
new_example_size = len(X) // reduce_sample_size
ratio_val_samples = 0.1
nb_train_samples = int(new_example_size * (1-ratio_val_samples))
nb_val_samples = int(new_example_size * 0.1)

X_train, X_test, Y_train, Y_test = train_test_split(X[:new_example_size], Y[:new_example_size], test_size=nb_val_samples, random_state=seed)
print(len(X_train), len(Y_train), len(X_test), len(Y_test))

90000 90000 10000 10000




In [0]:
del X
del Y

In [0]:
empty = 0
eos = 1
idx2word[empty] = '_'
idx2word[eos] = '~'

In [0]:
import numpy as np
import random, sys


# seed weight initialization
random.seed(seed)
np.random.seed(seed)

In [24]:
# show how data looks like    
def print_sample(label, sample):
    print(label + ':', end=' '),
    for index in sample:
        print(INDEX_TO_WORD[index], end=' '),
    print()
    
    
print_sample('H', Y_train[334])
print_sample('D', X_train[334])
print_sample('H', Y_test[334])
print_sample('D', X_test[334])

H: Shiplap^ for the Ceiling . 
D: I like wood..^ 
H: Review of The First Confessor^ 
D: Book # 36 . The First Confessor^ , by Terry Goodkind^ . 


In [0]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from keras.layers import Merge
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.preprocessing import sequence
from keras.utils import np_utils

In [0]:
regularizer = l2(weight_decay) if weight_decay else None

In [0]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_size,
                    input_length=maxlen,
                    W_regularizer=regularizer, dropout=p_emb, weights=[embedding], mask_zero=True,
                    name='embedding_1'))
for i in range(rnn_layers):
    lstm = LSTM(rnn_size, return_sequences=True, # batch_norm=batch_norm,
                W_regularizer=regularizer, U_regularizer=regularizer,
                b_regularizer=regularizer, dropout_W=p_W, dropout_U=p_U,
                name='lstm_{}'.format(i+1)
                  )
    model.add(lstm)
    model.add(Dropout(p_dense, name='dropout_{}'.format(i+1)))

In [0]:
from keras.layers.core import Lambda
import keras.backend as K

def simple_context(X, mask, n=activation_rnn_size, maxlend=maxlend, maxlenh=maxlenh):
    desc, head = X[:,:maxlend,:], X[:,maxlend:,:]
    head_activations, head_words = head[:,:,:n], head[:,:,n:]
    desc_activations, desc_words = desc[:,:,:n], desc[:,:,n:]
    
    activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2,2))
    # make sure we dont use description words that are masked out
    if mask != None:
      activation_energies = activation_energies + -1e20*K.expand_dims(1.-K.cast(mask[:, :maxlend], 'float32'), 1)
    
    # for every head word compute weights for every desc word
    activation_energies = K.reshape(activation_energies,(-1,maxlend))
    activation_weights = K.softmax(activation_energies)
    activation_weights = K.reshape(activation_weights,(-1,maxlenh,maxlend))

    # for every head word compute weighted average of desc words
    desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2,1))
    return K.concatenate((desc_avg_word, head_words))


class SimpleContext(Lambda):
    def __init__(self, **kwargs):
        super(SimpleContext, self).__init__(simple_context, **kwargs)
        self.supports_masking = True

        
    def compute_mask(self, input, input_mask=None):
        return input_mask[:, maxlend:]
    
    
    def get_output_shape_for(self, input_shape):
        nb_samples = input_shape[0]
        n = 2 * (rnn_size - activation_rnn_size)
        return (nb_samples, maxlenh, n)

In [0]:
if activation_rnn_size:
    model.add(SimpleContext(name='simplecontext_1'))

model.add(TimeDistributed(Dense(vocab_size,
                                W_regularizer=regularizer, b_regularizer=regularizer,
                                name = 'timedistributed_1')))
model.add(Activation('softmax', name='activation_1'))

In [0]:
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [0]:
K.set_value(model.optimizer.lr, np.float32(learning_rate))

In [0]:
def str_shape(x):
    return 'x'.join(map(str, x.shape))
    
def inspect_model(model):
    for i, l in enumerate(model.layers):
        print (i, 'cls={} name={}'.format(type(l).__name__, l.name), end=' ')
        weights = l.get_weights()
        for weight in weights:
            print(str_shape(weight), end='')
        print()

In [33]:
inspect_model(model)

0 cls=Embedding name=embedding_1 40000x100
1 cls=LSTM name=lstm_1 100x512128x512512
2 cls=Dropout name=dropout_1 
3 cls=LSTM name=lstm_2 128x512128x512512
4 cls=Dropout name=dropout_2 
5 cls=LSTM name=lstm_3 128x512128x512512
6 cls=Dropout name=dropout_3 
7 cls=SimpleContext name=simplecontext_1 
8 cls=TimeDistributed name=time_distributed_1 176x4000040000
9 cls=Activation name=activation_1 


# Load Weights

In [3]:
train_weights_filepath = os.path.join(data_folder, train_weights_filename)
if os.path.exists(train_weights_filepath):
    print("Loading model weights from {}".format(train_weights_filepath))
    model.load_weights(train_weights_filepath)

NameError: ignored

# Test

In [0]:
def lpadd(x, maxlend=maxlend, eos=eos):
    if maxlend == 0:
        return [eos]
    n = len(x)
    if n > maxlend:
        x = x[-maxlend:]
        n = maxlend
    return [empty]*(maxlend-n) + x + [eos]

In [0]:
samples = [lpadd([3]*26)]
data = sequence.pad_sequences(samples, maxlen=maxlen, value=empty, padding='post', truncating='post')

In [40]:
np.all(data[:,maxlend] == eos)

True

In [41]:
print(data.shape, [sample_len for sample_len in map(len, samples)])

(1, 50) [26]


In [42]:
# Yura added this because if we don't have weights there is no sense in predicting at this stage

if os.path.exists(train_weights_filepath):
  print("Predicting from loaded weights")  
  probs = model.predict(data, verbose=0, batch_size=1)
  print(probs.shape)

Predicting from loaded weights
(1, 25, 40000)


In [0]:
def vocab_fold(xs):
    xs = [x if x < oov0 else glove_idx2idx.get(x,x) for x in xs]
    outside = sorted([x for x in xs if x >= oov0])
    outside = dict((x,vocab_size-1-min(i, nb_unknown_words-1)) for i, x in enumerate(outside))
    xs = [outside.get(x,x) for x in xs]
    return xs

In [0]:
def vocab_unfold(desc,xs):
    unfold = {}
    for i, unfold_idx in enumerate(desc):
        fold_idx = xs[i]
        if fold_idx >= oov0:
            unfold[fold_idx] = unfold_idx
    return [unfold.get(x,x) for x in xs]

In [0]:
def flip_headline(x, nflips=None, model=None, debug=False):
    if nflips is None or model is None or nflips <= 0:
        return x
    
    batch_size = len(x)
    probs = model.predict(x, verbose=0, batch_size=batch_size)
    x_out = x.copy()
    for b in range(batch_size):
        flips = sorted(random.sample(range(maxlend+1, maxlen), nflips))
        if debug and b < debug:
            print(b)
        for input_idx in flips:
            if x[b, input_idx] == empty or x[b, input_idx] == eos:
                continue
            label_idx = input_idx - (maxlend+1)
            prob = probs[b, label_idx]
            w = prob.argmax()
            if w == empty:
                w = oov0
            if debug and b < debug:
                print('{} => {}'.format(idx2word[x_out[b,input_idx]], idx2word[w]), end=' '),
            x_out[b,input_idx] = w
        if debug and b < debug:
            print()
    return x_out

In [0]:
def conv_seq_labels(xds, xhs, nflips=None, model=None, debug=False):
    batch_size = len(xhs)
    assert len(xds) == batch_size
    x = [vocab_fold(lpadd(xd)+xh) for xd,xh in zip(xds,xhs)] 
    x = sequence.pad_sequences(x, maxlen=maxlen, value=empty, padding='post', truncating='post')
    x = flip_headline(x, nflips=nflips, model=model, debug=debug)
    
    y = np.zeros((batch_size, maxlenh, vocab_size))
    for i, xh in enumerate(xhs):
        xh = vocab_fold(xh) + [eos] + [empty]*maxlenh  
        xh = xh[:maxlenh]
        y[i,:,:] = np_utils.to_categorical(xh, vocab_size)
        
    return x, y

In [0]:
def gen(Xd, Xh, batch_size=batch_size, nb_batches=None, nflips=None, model=None, debug=False, seed=seed):
    """yield batches. for training use nb_batches=None
    for validation generate deterministic results repeating every nb_batches
    
    while training it is good idea to flip once in a while the values of the headlines from the
    value taken from Xh to value generated by the model.
    """
    c = nb_batches if nb_batches else 0
    while True:
        xds = []
        xhs = []
        if nb_batches and c >= nb_batches:
            c = 0
        new_seed = random.randint(0, 1e6)
        random.seed(c+123456789+seed)
        for b in range(batch_size):
            t = random.randint(0,len(Xd)-1)

            xd = Xd[t]
            s = random.randint(min(maxlend,len(xd)), max(maxlend,len(xd)))
            xds.append(xd[:s])
            
            xh = Xh[t]
            s = random.randint(min(maxlenh,len(xh)), max(maxlenh,len(xh)))
            xhs.append(xh[:s])

        # undo the seeding before we yield inorder not to affect the caller
        c+= 1
        random.seed(new_seed)

        yield conv_seq_labels(xds, xhs, nflips=nflips, model=model, debug=debug)

In [52]:
r = next(gen(X_train, Y_train, batch_size=batch_size))
r[0].shape, r[1].shape, len(r)

((64, 50), (64, 25, 40000), 2)

In [0]:
def test_gen(gen, n=5):
    Xtr,Ytr = next(gen)
    for i in range(n):
        x = Xtr[i, :maxlend]
        y = Xtr[i, maxlend:]
        yy = Ytr[i,:]
        yy = np.where(yy)[1]
        print_sample('L', yy)
        print_sample('H', y)
        if maxlend:
            print_sample('D', x)

In [54]:
test_gen(gen(X_train, Y_train, batch_size=batch_size))

L: Judge : Karen Buckley 's murder was a <0>^ , jaw-dropping attack on a helpless young woman ' ~ _ _ _ _ _ _ 
H: ~ Judge : Karen Buckley 's murder was a <0>^ , jaw-dropping attack on a helpless young woman ' _ _ _ _ _ _ 
D: of chemicals has been jailed for a minimum of 23 years . Alexander Pacteau , 21 , was branded <1>^ <2>^ after admitting the crime 
L: ASE Inc. Management Team 's Visit to WOD 's Chairman Mr. <0>^ Lin ~ _ _ _ _ _ _ _ _ _ _ _ 
H: ~ ASE Inc. Management Team 's Visit to WOD 's Chairman Mr. <0>^ Lin _ _ _ _ _ _ _ _ _ _ _ 
D: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ TAIPEI , Taiwan , <1>^ . 
L: OROP crisis : Veterans meet Manohar Parrikar , satisfied with clarification on <0>^ ~ _ _ _ _ _ _ _ _ _ _ _ 
H: ~ OROP crisis : Veterans meet Manohar Parrikar , satisfied with clarification on <0>^ _ _ _ _ _ _ _ _ _ _ _ 
D: , who met the Minister along with few others , for the second time today , said the veterans would take a call on <1>^ 
L: OSU football : <0>^ gives the linemen a

In [55]:
test_gen(gen(X_train, Y_train, nflips=6, model=model, debug=False, batch_size=batch_size))

L: Judge : Karen Buckley 's murder was a <0>^ , jaw-dropping attack on a helpless young woman ' ~ _ _ _ _ _ _ 
H: ~ ~ : Karen ~ ~ murder was ~ <0>^ , jaw-dropping attack on a helpless young ~ ' _ _ _ _ _ _ 
D: of chemicals has been jailed for a minimum of 23 years . Alexander Pacteau , 21 , was branded <1>^ <2>^ after admitting the crime 
L: ASE Inc. Management Team 's Visit to WOD 's Chairman Mr. <0>^ Lin ~ _ _ _ _ _ _ _ _ _ _ _ 
H: ~ ASE Inc. Management Team 's ~ to WOD ~ Chairman ~ <0>^ Lin _ _ _ _ _ _ _ _ _ _ _ 
D: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ TAIPEI , Taiwan , <1>^ . 
L: OROP crisis : Veterans meet Manohar Parrikar , satisfied with clarification on <0>^ ~ _ _ _ _ _ _ _ _ _ _ _ 
H: ~ OROP ~ : Veterans meet Manohar Parrikar , satisfied with clarification ~ <0>^ _ _ _ _ _ _ _ _ _ _ _ 
D: , who met the Minister along with few others , for the second time today , said the veterans would take a call on <1>^ 
L: OSU football : <0>^ gives the linemen a fighting chance ~ _ _ _ _ _

In [0]:
valgen = gen(X_test, Y_test,nb_batches=3, batch_size=batch_size)

In [57]:
for i in range(4):
    test_gen(valgen, n=1)

L: <0>^ : Calm down , opposition told ~ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
H: ~ <0>^ : Calm down , opposition told _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
D: years last a long , long time . The latest episode highlighting Mugabe 's shortcomings occurred Tuesday , when the 91-year-old delivered a speech to 
L: Down the Garden Path : Discover the Jenson Food Cookbook , and <0>^ garden ~ _ _ _ _ _ _ _ _ _ _ 
H: ~ Down the Garden Path : Discover the Jenson Food Cookbook , and <0>^ garden _ _ _ _ _ _ _ _ _ _ 
D: Canberra 's kitchen gardeners will find plenty to see , do and learn at Down the Garden Path , which is a fundraising event being 
D: to stay away from beaches as small waves from the massive Chile earthquake reach New Zealand Christchurch Civil Defence and Emergency Management ( <1>^ ) 
L: <0>^ : Calm down , opposition told ~ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
H: ~ <0>^ : Calm down , opposition told _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
D: years last a long , long time . The latest episode

# Train

In [0]:
# history = {}

In [1]:
# Anya: had concerns about the about of memory we had on GPU on early steps of testing funcionality. Now seems like we don't need this part.
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " I Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/45/99/837428d26b47ebd6b66d6e1b180e98ec4a557767a93a81a02ea9d6242611/GPUtil-1.3.0.tar.gz
Building wheels for collected packages: gputil
  Running setup.py bdist_wheel for gputil ... [?25l- done
[?25h  Stored in directory: /content/.cache/pip/wheels/17/0f/04/b79c006972335e35472c0b835ed52bfc0815258d409f560108
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.3.0
Collecting humanize
  Downloading https://files.pythonhosted.org/packages/8c/e0/e512e4ac6d091fc990bbe13f9e0378f34cf6eecd1c6c268c9e598dcf5bb9/humanize-0.5.1.tar.gz
Building wheels for collected packages: humanize
  Running setup.py bdist_wheel for humanize ... [?25l- done
[?25h  Stored in directory: /content/.cache/pip/wheels/69/86/6c/f8b8593bc273ec4b0c653d3827f7482bb2001a2781a73b7f44
Successfully built humanize
Installing collected packages: humanize
Successfully installed humanize-0.5.1
Gen RAM Free: 12.7 G

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once in a notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

!ls



In [58]:
from collections import defaultdict
history = defaultdict(list)

traingen = gen(X_train, Y_train, batch_size=batch_size, nflips=nflips, model=model)
valgen = gen(X_test, Y_test, nb_batches=nb_val_samples//batch_size, batch_size=batch_size)

r = next(traingen)
print(r[0].shape, r[1].shape, len(r))

(64, 50) (64, 25, 40000) 2


In [59]:
for iteration in range(10): #500
    print('Iteration: {}'.format(iteration))
    h = model.fit_generator(traingen, 
                            steps_per_epoch=nb_train_samples//batch_size,
                            epochs=1, 
                            validation_data=valgen,
                            validation_steps = 1 #nb_val_samples//batch_size
                           )
    for k, v in h.history.items():
        history[k] = history[k] + v
    # train_history_filepath = os.path.join(data_folder, train_history_filename)
    with open(train_history_filename, 'wb') as filepath:
        pickle.dump(history, filepath, -1)    
    model.save_weights(train_weights_filename, overwrite=True)
    
    # Create & upload created files from collab's workspace.
    for filename in (train_history_filename,train_weights_filename):
      uploaded = drive.CreateFile({'title': filename})
      uploaded.SetContentFile(filename)
      uploaded.Upload()
      print('Uploaded file with ID {}'.format(uploaded.get('id')))
    

Iteration: 0
Epoch 1/1



NameError: ignored