This notebook is part of the project [Generating-categories-from-arXiv-paper-titles](https://github.com/sayakpaul/Generating-categories-from-arXiv-paper-titles). It shows how to use Cloud TPUs to train `tf.keras` models. If you want to do the same, take a look at these [instructions](http://bit.ly/keras-tpu-tf21) prepared by [Martin Görner](https://twitter.com/martin_gorner). 

Also, an insane amount of thanks to the [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc) program for providing me with access to v3 Cloud TPUs.

## Setup and imports

In [1]:
# Imports
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import class_weight
from ast import literal_eval
from typing import Union
from utils import utils
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# TensorFlow build this notebook was executed with.
tf.__version__

'2.1.0-dev20191029'

## Data gathering and preprocessing

In [2]:
# Data gathering
def load_data(filename:str) -> np.ndarray:
    """Read a NumPy array stored in a ``.npy`` file.

    Args:
        filename: Path of the ``.npy`` file to read.

    Returns:
        The array stored in the file.

    Note:
        ``allow_pickle=True`` lets NumPy unpickle object arrays. Unpickling
        can execute arbitrary code, so only load files from a trusted source.
    """
    return np.load(filename, allow_pickle=True)

In [3]:
# Load the titles (X) and their category labels (y) for both splits.
X_train = load_data('data/X_train.npy')
y_train = load_data('data/y_train.npy')
X_test = load_data('data/X_test.npy')
y_test = load_data('data/y_test.npy')

# Number of titles in each split
X_train.shape, X_test.shape

((26152,), (6538,))

In [4]:
# Wrap `utils.clean_title` so it can be applied element-wise to a whole NumPy
# array (presumably it cleans a single title string -- see utils for details).
# `np.vectorize` is a convenience wrapper around a Python-level loop, not a
# performance optimisation.
clean_title = np.vectorize(utils.clean_title)

In [5]:
# Clean the titles of both splits with the vectorised helper.
# NOTE: the raw arrays are overwritten, so re-running this cell feeds
# already-cleaned text back into the cleaner; reload the data for a fresh start.
X_train, X_test = clean_title(X_train), clean_title(X_test)

In [6]:
# Preview the first ten cleaned training titles
X_train[:10]

array(['deblurgan blind motion deblurring using conditional adversarial networks',
       'improve satsolving machine learning',
       'training adversarial discriminators crosschannel abnormal event detection crowds',
       'collective stability networks winnertakeall circuits',
       'sample complexity episodic fixedhorizon reinforcement learning',
       'visualizing textual models intext wordaspixel highlighting',
       'prophit causal inverse classification multiple continuously valued treatment policies',
       'sequential dual deep learning shape texture features sketch recognition',
       'notes using determinantal point processes clustering applications text clustering',
       'exactly robust kernel principal component analysis'],
      dtype='<U185')

In [7]:
# Build the word index from the training titles only, so nothing from the test
# split influences the vocabulary.
# `num_words=3000` caps the vocabulary used when texts are converted to
# sequences (Keras keeps the `num_words - 1` most frequent words). Keep this in
# sync with the input dimension of the `Embedding` layers in the models.
tokenizer = Tokenizer(num_words=3000, lower=True)
tokenizer.fit_on_texts(X_train)

In [8]:
def get_features(text_sequence: np.ndarray) -> np.ndarray:
    """Encode texts as fixed-length integer sequences.

    Relies on the notebook-level ``tokenizer``, which must already be fitted.

    Args:
        text_sequence: Array of text strings to encode.

    Returns:
        2-D integer array with one row per input text and 300 columns; rows
        are padded/truncated by ``pad_sequences`` using its default settings.
    """
    token_ids = tokenizer.texts_to_sequences(text_sequence)
    padded = pad_sequences(token_ids, maxlen=300)
    return padded

# Encode both splits with the tokenizer fitted on the training titles.
train_features = get_features(X_train)
test_features = get_features(X_test)

In [9]:
# Sanity check: one row per title, 300 token ids per row.
train_features.shape, test_features.shape

((26152, 300), (6538, 300))

In [10]:
# Raw labels are stringified Python lists, e.g. "['cs.CV']".
y_train[:10]

array(["['cs.CV']", "['cs.AI', 'cs.LO']", "['cs.CV']", "['cs.NE']",
       "['stat.ML', 'cs.AI', 'cs.LG']", "['stat.ML', 'cs.CL', 'cs.LG']",
       "['cs.LG', 'stat.ML']", "['cs.CV']", "['cs.LG']",
       "['cs.LG', 'stat.ML']"], dtype=object)

In [11]:
# Confirm that both the features and the raw labels are NumPy arrays.
type(train_features), type(y_train)

(numpy.ndarray, numpy.ndarray)

In [12]:
# Label binarization
# Every label is a stringified list, so parse it with `literal_eval` first and
# then fit a multi-hot encoder on the parsed training labels.
list_preprocessed = list(map(literal_eval, y_train))
mlb = MultiLabelBinarizer()
y_train_binarized = mlb.fit_transform(list_preprocessed)
mlb.classes_

array(['cs.AI', 'cs.CC', 'cs.CE', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY',
       'cs.DB', 'cs.DS', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT',
       'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.NE', 'cs.PL', 'cs.RO',
       'cs.SD', 'cs.SE', 'cs.SI', 'math.IT', 'math.OC', 'math.ST',
       'stat.AP', 'stat.CO', 'stat.ME', 'stat.ML', 'stat.TH'],
      dtype=object)

In [13]:
# First ten multi-hot label rows (one column per entry of `mlb.classes_`).
y_train_binarized[:10]

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,

In [14]:
# Width of a single label vector, i.e. the number of distinct categories.
y_train_binarized[0].shape

(32,)

In [15]:
# Encode the test labels with the binarizer fitted on the training labels, so
# both splits share the same column order.
y_test_binarized = mlb.transform([literal_eval(i) for i in y_test])
y_test_binarized[:10]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0,

In [16]:
# Per-sample weights that counteract label imbalance: with 'balanced', each
# training example is weighted inversely to how often its label value occurs.
# `y_train` holds the raw label strings, so each distinct string is treated as
# its own class here.
#
# The function is imported directly rather than called through the
# `class_weight` module: this cell binds the name `class_weight` to the
# resulting array (later cells read it under that name), so going through the
# module would raise AttributeError the second time the cell is run.
from sklearn.utils.class_weight import compute_sample_weight

class_weight = compute_sample_weight('balanced', y_train)
class_weight

array([0.03675098, 1.08066116, 0.03675098, ..., 0.03675098, 4.84296296,
       0.08631023])

## Configure cloud TPUs

In [17]:
# Pick a distribution strategy: TPU when one can be resolved, the default otherwise.
try:
    # TPU detection; a ValueError raised in this block is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None

if not tpu:
    # No TPU resolved: use whatever strategy is currently active (the default one).
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  ['192.168.51.2:8470']
INFO:tensorflow:Initializing the TPU system: tpu-vm


INFO:tensorflow:Initializing the TPU system: tpu-vm


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


REPLICAS:  8


## Preparation for model training

In [28]:
# Helper function to return a CNN-based model
def get_a_cnn_model() -> tf.keras.models.Sequential:
    """Assemble an (uncompiled) 1-D convolutional multi-label classifier.

    Architecture: embedding -> dropout -> Conv1D -> global max pooling ->
    sigmoid output layer with 32 units (one per label).

    Returns:
        The uncompiled ``Sequential`` model.
    """
    return Sequential([
        # 3000-entry vocabulary, 30-dim vectors, inputs padded to 300 tokens.
        Embedding(input_dim=3000, output_dim=30, input_length=300),
        Dropout(rate=0.1),
        Conv1D(filters=300, kernel_size=3, strides=1,
               padding='valid', activation='relu'),
        GlobalMaxPool1D(),
        # Independent sigmoid per label, since one title can carry several labels.
        Dense(units=32, activation='sigmoid'),
    ])

In [31]:
def compile_model_w_strat(model:tf.keras.models.Sequential, function_name) -> tf.keras.models.Sequential:
    """Build and compile a model inside the notebook-level ``strategy`` scope.

    Args:
        model: Ignored. The value is never read; the parameter is kept only so
            that existing call sites continue to work.
        function_name: Zero-argument callable that returns an uncompiled Keras
            model.

    Returns:
        The newly built model, compiled with the Adam optimizer, binary
        cross-entropy loss and the ``categorical_accuracy`` metric.
    """
    # The model is constructed and compiled under the strategy scope so that
    # it is created for the selected distribution strategy.
    with strategy.scope():
        built = function_name()
        # NOTE(review): `categorical_accuracy` compares arg-max positions, which
        # may not reflect multi-label quality -- confirm the intended metric.
        built.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy'])
    return built

In [32]:
# Build and compile the CNN under the distribution strategy.
# The first argument is ignored by `compile_model_w_strat` (it is overwritten
# before use), so pass `None` rather than a `model` name that is not defined on
# a fresh kernel and would raise NameError.
compiled_model = compile_model_w_strat(None, get_a_cnn_model)

In [33]:
# Layer-by-layer overview of the compiled model with parameter counts.
compiled_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 300, 30)           90000     
_________________________________________________________________
dropout_5 (Dropout)          (None, 300, 30)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 298, 300)          27300     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                9632      
Total params: 126,932
Trainable params: 126,932
Non-trainable params: 0
_________________________________________________________________


## And we are good to go

In [25]:
# Global batch size: 8 examples for each replica of the strategy.
global_batch_size = 8 * strategy.num_replicas_in_sync

# The weights computed earlier hold one value per training example, so they are
# passed as `sample_weight`. Keras' `class_weight` argument expects a
# {class_index: weight} dictionary instead.
compiled_model.fit(train_features, y_train_binarized,
                        sample_weight=class_weight,
                        steps_per_epoch=len(train_features)//global_batch_size,
                        validation_data=(test_features, y_test_binarized),
                        validation_steps=len(test_features)//global_batch_size,
                        epochs=15,
                        batch_size=global_batch_size)

Train on 26152 samples, validate on 6538 samples
Epoch 1/15

<tensorflow.python.keras.callbacks.History at 0x7f5f841c3208>

In [34]:
def get_a_sequential_model():
    """Assemble an (uncompiled) bidirectional-LSTM multi-label classifier.

    Architecture: embedding -> bidirectional LSTM -> sigmoid output layer
    with 32 units (one per label).

    Returns:
        The uncompiled ``Sequential`` model.
    """
    return Sequential([
        # 3000-entry vocabulary, 20-dim vectors, inputs padded to 300 tokens.
        Embedding(input_dim=3000, output_dim=20, input_length=300),
        Bidirectional(LSTM(units=10, activation="sigmoid")),
        # Independent sigmoid per label, since one title can carry several labels.
        Dense(units=32, activation='sigmoid'),
    ])

In [35]:
# Build and compile the bidirectional-LSTM model under the distribution strategy.
# The first argument is ignored by `compile_model_w_strat` (it is overwritten
# before use), so pass `None` rather than a `model` name that is not defined on
# a fresh kernel and would raise NameError.
compiled_model = compile_model_w_strat(None, get_a_sequential_model)

In [36]:
# Global batch size: 8 examples for each replica of the strategy.
global_batch_size = 8 * strategy.num_replicas_in_sync

# The weights computed earlier hold one value per training example, so they are
# passed as `sample_weight`. Keras' `class_weight` argument expects a
# {class_index: weight} dictionary instead.
compiled_model.fit(train_features, y_train_binarized,
                        sample_weight=class_weight,
                        steps_per_epoch=len(train_features)//global_batch_size,
                        validation_data=(test_features, y_test_binarized),
                        validation_steps=len(test_features)//global_batch_size,
                        epochs=15,
                        batch_size=global_batch_size)

Train on 26152 samples, validate on 6538 samples
Epoch 1/15

<tensorflow.python.keras.callbacks.History at 0x7f5efc4c0470>

**Justification for the lower performance** (provided by Martin himself):

![](https://i.ibb.co/RQcyN7H/Screen-Shot-2019-11-02-at-9-50-49-AM.png)

GitHub issue [thread](https://github.com/GoogleCloudPlatform/training-data-analyst/issues/678). 