In [1]:
# Mount Google Drive at /content/drive; the training data and the checkpoint
# directory used later in this notebook are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np

import time

# Turn on eager execution through the TF1 compatibility API. Later cells iterate over
# the dataset directly and print tensor values, which relies on eager mode.
tf.compat.v1.enable_eager_execution()

Preprocessing

In [164]:
# Load the tab-separated training file. Each non-empty line is expected to look like
# "<integer label>\t<sentence>".
sent_data = []
label_data = []

total_char = set()
total_label = set()

# Decode explicitly as UTF-8 (the sentences are Japanese text) instead of relying on the
# platform default, and let the context manager close the file even if reading fails.
with open('/content/drive/My Drive/transformer_classification/train_data.txt',
          encoding='utf-8') as data:
    lines = data.readlines()

for line in lines:
    line = line.replace('\n', '')
    if not line:
        # A blank line (for example a trailing one) carries no label/sentence pair.
        continue
    fields = line.split('\t')  # split once instead of once per field access
    label = int(fields[0])
    sent = fields[1]

    sent_data.append(sent)
    label_data.append(label)
    total_label.add(label)
    total_char.update(sent)  # adds every character of the sentence

total_data_num = len(sent_data)

print('total char num: ' + str(len(total_char)))
print('total data num: ' + str(total_data_num))

# label_num = len(total_label)
# The number of classes is fixed by hand; it sizes the model's output layer, so every
# label id in the file must be a valid index into it.
label_num = 508
if label_data and not (0 <= min(label_data) and max(label_data) < label_num):
    raise ValueError(
        'label ids must lie in [0, {}), found range [{}, {}]'.format(
            label_num, min(label_data), max(label_data)))
print('total label num: ' + str(label_num))

# Build a subword vocabulary from the sentences of the training corpus.
vocabs = tfds.features.text.SubwordTextEncoder.build_from_corpus((sent for sent in sent_data), target_vocab_size=1600)

total char num: 1254
total data num: 1001
total label num: 508


In [165]:
# Peek at the first ten subwords of the learned vocabulary and at its total size.
first_subwords = vocabs.subwords[:10]
print(first_subwords)
print(vocabs.vocab_size)

['・', 'の', 'ル', 'ス', '日本の', 'ン', 'ー', 'イ', 'の選', 'ト']
1659


In [166]:
# Show how the first sentence is broken into subword pieces, one "piece: id" per line.
print(sent_data[0])
tokenized_string = vocabs.encode(sent_data[0])
for token_id in tokenized_string:
    piece = vocabs.decode([token_id])
    print('{}: {}'.format(piece, token_id))

鹿児島県の鉄道日本の鉄道駅おおすみおおかわ九州旅客鉄道の鉄道日豊本線おおすみおおかわらえ日本国有鉄道の鉄道1931年開業の鉄道曽於市の交通おおすみおおかわらえ曽於市の建築
�: 1636
�: 1588
�: 1594
�: 1632
�: 1536
�: 1547
島県の: 1177
鉄道日本の鉄道駅: 1106
おお: 227
すみ: 1063
おお: 227
かわ: 650
九州旅客鉄道の鉄道: 1210
日: 76
豊: 1117
本線: 1150
おお: 227
すみ: 1063
おお: 227
かわ: 650
ら: 140
え: 21
日本国有鉄道の鉄道: 280
193: 359
1年開業の鉄道: 1390
�: 1633
�: 1558
�: 1592
�: 1633
�: 1553
�: 1591
市の交通: 305
おお: 227
すみ: 1063
おお: 227
かわ: 650
ら: 140
え: 21
�: 1633
�: 1558
�: 1592
�: 1633
�: 1553
�: 1591
市の建築: 685


In [0]:
# Make one hot vectors
# label_one_hot_vector =  np.zeros((total_data_num, label_num))
# for i, data in enumerate(label_data):
#     label_one_hot_vector[i, data] = 1

# print(label_one_hot_vector[0])

# lines_dataset = (
#     tf.data.Dataset.from_tensor_slices(
#         (
#             tf.cast(np.array(sent_data), tf.string),
#             tf.cast(np.array(label_one_hot_vector), tf.float32)           
#         )
#     )
# )


In [0]:
# Pair every raw sentence (string) with its label (float32) as one dataset element.
sentence_tensor = tf.cast(np.array(sent_data), tf.string)
label_tensor = tf.cast(np.array(label_data), tf.float32)
lines_dataset = tf.data.Dataset.from_tensor_slices((sentence_tensor, label_tensor))


In [0]:
# Size of the shuffle buffer passed to Dataset.shuffle when building train_dataset.
BUFFER_SIZE = 20000
# Number of examples per padded batch.
BATCH_SIZE = 64
# Default element-count cap used by filter_max_length.
MAX_LENGTH = 256

In [0]:
def encode(sent, label):
    """Tokenize one sentence and wrap it in start/end marker ids.

    The id `vocabs.vocab_size` is prepended and `vocabs.vocab_size + 1` is appended
    to the subword ids of the sentence.

    Args:
        sent: eager tensor holding the sentence; its `.numpy()` value is encoded.
        label: the label belonging to the sentence.

    Returns:
        A pair `(token_ids, [label])`: the list of ids and the label wrapped in a
        one-element list.
    """
    start_id = vocabs.vocab_size
    end_id = vocabs.vocab_size + 1

    token_ids = [start_id]
    token_ids.extend(vocabs.encode(sent.numpy()))
    token_ids.append(end_id)
    return token_ids, [label]

In [0]:
def filter_max_length(x, y, max_length=MAX_LENGTH):
    """Dataset predicate: True when neither x nor y has more than max_length elements.

    Args:
        x: first tensor of a dataset element.
        y: second tensor of a dataset element.
        max_length: largest element count allowed for each tensor.

    Returns:
        A scalar boolean tensor.
    """
    x_fits = tf.size(x) <= max_length
    y_fits = tf.size(y) <= max_length
    return tf.logical_and(x_fits, y_fits)

In [0]:
def tf_encode(sent,label):
    """Run the Python-level `encode` on one dataset element via `tf.py_function`.

    Args:
        sent: sentence tensor of the element.
        label: label tensor of the element.

    Returns:
        The two outputs of `encode`, declared as (tf.int64, tf.float32) tensors.
    """
    output_types = [tf.int64, tf.float32]
    return tf.py_function(encode, [sent, label], output_types)

In [0]:
# Input pipeline: tokenize -> cache -> shuffle -> pad each batch to its longest
# element -> prefetch. Length filtering via filter_max_length is currently disabled:
# train_dataset = train_dataset.filter(filter_max_length)
train_dataset = (
    lines_dataset
    .map(tf_encode)
    .cache()
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [400]:
# Display the dataset object to check the shapes and dtypes of its elements.
train_dataset

<DatasetV1Adapter shapes: ((?, ?), (?, ?)), types: (tf.int64, tf.float32)>

In [401]:
# Pull a single batch from the pipeline to inspect the padded token ids and their labels.
sent_batch, label_batch = next(iter(train_dataset))
sent_batch, label_batch

(<tf.Tensor: id=212049, shape=(64, 104), dtype=int64, numpy=
 array([[1659,  151,  773, ...,    0,    0,    0],
        [1659,  271,   80, ...,    0,    0,    0],
        [1659,  118,  228, ...,    0,    0,    0],
        ...,
        [1659,  869,    6, ...,    0,    0,    0],
        [1659,  470,    2, ...,    0,    0,    0],
        [1659, 1126, 1047, ...,    0,    0,    0]])>,
 <tf.Tensor: id=212050, shape=(64, 1), dtype=float32, numpy=
 array([[337.],
        [361.],
        [476.],
        [418.],
        [418.],
        [148.],
        [ 60.],
        [337.],
        [ 35.],
        [233.],
        [133.],
        [200.],
        [ 53.],
        [418.],
        [ 99.],
        [153.],
        [ 80.],
        [144.],
        [  0.],
        [200.],
        [300.],
        [162.],
        [222.],
        [275.],
        [ 81.],
        [267.],
        [170.],
        [263.],
        [265.],
        [418.],
        [425.],
        [153.],
        [370.],
        [222.],
        [234

Build model

In [0]:
def get_angles(pos, i, d_model):
    """Compute the angles fed to sin/cos for the positional encoding.

    Args:
        pos: position index (or array of indices).
        i: feature-dimension index (or array of indices); indices 2k and 2k+1
            share the same rate because of the integer division by 2.
        d_model: model width used to scale the exponent.

    Returns:
        `pos * 1 / 10000 ** (2 * (i // 2) / d_model)`, broadcast over `pos` and `i`.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    inverse_wavelength = 1 / np.power(10000, exponent)
    return pos * inverse_wavelength

def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: number of feature dimensions per position.

    Returns:
        A float32 tensor with a leading axis of size 1 followed by the
        (position, d_model) table.
    """
    position_column = np.arange(position)[:, np.newaxis]
    dimension_row = np.arange(d_model)[np.newaxis, :]
    angle_rads = get_angles(position_column, dimension_row, d_model)

    # Sines of the even-indexed columns come first, followed by cosines of the
    # odd-indexed columns: the two halves are concatenated, not interleaved.
    halves = [np.sin(angle_rads[:, 0::2]), np.cos(angle_rads[:, 1::2])]
    table = np.concatenate(halves, axis=-1)

    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [0]:
def create_padding_mask(seq):
    """Mark the entries of `seq` that equal 0 (the padding value).

    Args:
        seq: tensor of token ids; indexed as a 2-D tensor here.

    Returns:
        A float32 tensor holding 1.0 where `seq` is 0 and 0.0 elsewhere, with two
        singleton axes inserted between the first and last axis.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [0]:
def create_look_ahead_mask(size):
    """Return a (size, size) mask with 1 strictly above the diagonal and 0 elsewhere."""
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle

In [0]:
def scaled_dot_product_attention(q, k, v, mask):
    """Scaled dot-product attention.

    Args:
        q: query tensor.
        k: key tensor; the size of its last axis is used for scaling.
        v: value tensor.
        mask: optional tensor broadcastable to the attention logits; positions where
            it is 1 receive a large negative offset before the softmax. May be None.

    Returns:
        A pair `(output, attention_weights)` where `attention_weights` is the softmax
        over the last axis of the scaled logits and `output` is its product with `v`.
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        # Push masked positions towards -inf so the softmax gives them ~0 weight.
        logits += (mask * -1e9)

    attention_weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(attention_weights, v), attention_weights

In [0]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention: project q/k/v, attend per head, merge and project again.

    `d_model` must be divisible by `num_heads`; each head works on
    `d_model // num_heads` features.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        # Feature width handled by a single head.
        self.depth = d_model // self.num_heads

        # Input projections for queries, keys and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied to the merged heads.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape x to (batch_size, -1, num_heads, depth) and move the head axis to position 1."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Apply attention and return `(output, attention_weights)`."""
        batch_size = tf.shape(q)[0]

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        per_head_output, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        # Undo split_heads: move the head axis back and fold the heads into d_model.
        merged = tf.reshape(
            tf.transpose(per_head_output, perm=[0, 2, 1, 3]),
            (batch_size, -1, self.d_model))

        return self.dense(merged), attention_weights

In [0]:
def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
    hidden = tf.keras.layers.Dense(dff, activation='relu')
    projection = tf.keras.layers.Dense(d_model)
    return tf.keras.Sequential([hidden, projection])

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-block is followed by dropout, a residual connection and layer
    normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Run the block on x; `training` toggles dropout and `mask` is passed to attention."""
        # Self-attention: x serves as value, key and query.
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        after_attention = self.layernorm1(x + attended)

        transformed = self.dropout2(self.ffn(after_attention), training=training)
        return self.layernorm2(after_attention + transformed)

In [0]:
class Encoder(tf.keras.layers.Layer):
    """Stack of encoder layers on top of token embeddings plus positional encodings.

    Args:
        num_layers: number of EncoderLayer blocks.
        d_model: embedding / model width.
        num_heads: attention heads per block.
        dff: hidden width of each block's feed-forward network.
        input_vocab_size: number of rows in the embedding table.
        rate: dropout rate.
        maximum_position_encoding: number of positions in the positional-encoding
            table. Defaults to `input_vocab_size`, which is what this class always
            used before the parameter existed; pass the longest expected sequence
            length to size the table by sequence length instead.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1,
                 maximum_position_encoding=None):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        if maximum_position_encoding is None:
            # Backward-compatible default: table length equals the vocabulary size.
            maximum_position_encoding = input_vocab_size

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Encode a batch of token ids.

        Args:
            x: token ids; axis 1 is treated as the sequence axis.
            training: whether dropout is active.
            mask: attention mask forwarded to every encoder layer.

        Returns:
            The output of the last encoder layer.
        """
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        # Scale embeddings by sqrt(d_model) before adding the positional signal.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        return x



In [0]:
class Transformer(tf.keras.Model):
    """Encoder-only classifier.

    The encoder output at the first sequence position is passed through a tanh
    Dense layer, dropout, and a softmax Dense layer with `label_num` units, so the
    model returns class probabilities (not logits).

    `target_vocab_size` is accepted but not used by this class.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, label_num, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, rate)

        # Classification head.
        self.dense1 = tf.keras.layers.Dense(d_model, activation='tanh')
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.final_layer = tf.keras.layers.Dense(label_num, activation='softmax')

    def call(self, inp, training, enc_padding_mask):
        """Return softmax class probabilities for a batch of token ids."""
        encoded = self.encoder(inp, training, enc_padding_mask)

        # Only the vector at sequence position 0 feeds the classification head.
        first_position = encoded[:, 0]
        hidden = self.dropout1(self.dense1(first_position), training=training)

        return self.final_layer(hidden)

Training

In [0]:
# Model size: number of encoder layers, model width, feed-forward width, attention heads.
num_layers = 6
d_model = 256
dff = 512
num_heads = 8

# Two ids beyond the subword vocabulary are reserved for the start and end markers
# that `encode` adds around every sentence.
input_vocab_size = vocabs.vocab_size + 2
# Passed to Transformer, which accepts it but does not use it.
target_vocab_size = vocabs.vocab_size + 2
dropout_rate = 0.1

In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up then inverse-square-root decay learning-rate schedule.

    lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    Args:
        d_model: model width; the rate is scaled by its inverse square root.
        warmup_steps: the value at which the two terms of the minimum cross over.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for the given optimizer step."""
        # The step counter may arrive as an integer tensor; rsqrt needs a float.
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [0]:
# Adam driven by the warm-up schedule defined by CustomSchedule.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [0]:
# The Transformer's final Dense layer already applies a softmax, so its output is a
# probability distribution. Treating it as logits would apply softmax a second time
# inside the loss, hence from_logits=False.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

In [0]:
def loss_function(real, pred):
    """Mean classification loss of a batch.

    Every example counts towards the loss. `real` holds class ids and 0 is an
    ordinary class id, not padding, so there is no masking on `real == 0`.

    Args:
        real: true class ids of the batch.
        pred: model predictions for the batch.

    Returns:
        A scalar loss tensor.
    """
    # reduce_mean is a no-op when loss_object already returns a scalar and averages
    # the per-example values when it does not.
    return tf.reduce_mean(loss_object(real, pred))

In [0]:
# Running aggregates of the training loss and accuracy; the training loop resets
# both at the start of every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [0]:
# Instantiate the classifier from the hyperparameters defined above; label_num sets
# the number of units of the output layer.
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, label_num, dropout_rate)


In [0]:
def create_masks(inp):
    """Return the encoder padding mask for a batch of token ids."""
    return create_padding_mask(inp)

In [0]:
# Checkpoints track both the model and the optimizer; at most five are kept on disk.
checkpoint_path = "/content/drive/My Drive/transformer_classification/model_output"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the most recent checkpoint when one exists in checkpoint_path.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    print('Latest checkpoint is restored')



In [0]:
# Number of full passes over train_dataset performed by the training loop.
EPOCHS = 50

In [0]:
# A fixed input_signature keeps tf.function from retracing the graph for every new
# padded sequence length and for the smaller final batch. The dtypes match what
# tf_encode declares; labels arrive as one value per example.
@tf.function(input_signature=[
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
])
def train_step(inp, tar):
    """Run one optimization step on a batch and update the running metrics.

    Args:
        inp: int64 batch of padded token ids.
        tar: float32 batch of class ids, one per example.
    """
    enc_padding_mask = create_masks(inp)

    with tf.GradientTape() as tape:
        predictions = transformer(inp, True, enc_padding_mask)
        loss = loss_function(tar, predictions)

    trainable_variables = transformer.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    train_loss(loss)
    train_accuracy(tar, predictions)

In [0]:
# Train for EPOCHS passes over train_dataset, logging running metrics every 20
# batches and saving a checkpoint every 5 epochs.
for epoch in range(EPOCHS):
    start = time.time()

    # Start each epoch with fresh running metrics.
    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> padded batch of token ids, tar -> class label of each sentence
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 20 == 0:
            print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
              epoch + 1, batch, train_loss.result(), train_accuracy.result()))
    
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))

    print ('Epoch {} Train_Loss {:.4f} Train_Accuracy {:.4f}'.format(epoch + 1, 
                                                    train_loss.result(), 
                                                    train_accuracy.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))