## Importing

In [1]:
# !pip install transformers -q

[K     |████████████████████████████████| 2.6 MB 5.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 59.5 MB/s 
[K     |████████████████████████████████| 636 kB 63.0 MB/s 
[K     |████████████████████████████████| 895 kB 74.0 MB/s 
[?25h

In [15]:
import os
import csv
import time
import warnings

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import transformers
import tensorflow_hub as hub

from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

## Config

In [4]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
strategy = tf.distribute.TPUStrategy(resolver)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.68.233.66:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.68.233.66:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU')]
INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [5]:
BATCH_SIZE = 256
MAX_LEN = 250

In [None]:
tf.keras.mixed_precision.set_global_policy('mixed_float16')

## Loading data

In [8]:
df = pd.read_csv('/content/drive/MyDrive/AmazonML/input/dataset/train.csv', 
                    usecols=['TITLE', 'BULLET_POINTS', 'BROWSE_NODE_ID'],
                    escapechar='\\', quoting=csv.QUOTE_NONE)
df.fillna('nan', inplace=True)

## Prepraing dataset

In [9]:
temp = df.BROWSE_NODE_ID.value_counts()
value_counts = {}
for i, j in zip(temp.index, temp):
  value_counts[i] = j

In [12]:
df['Threshold'] = df['BROWSE_NODE_ID'].map(lambda x: 1 if value_counts[x] >= 300 else 0)
np.sum(df['Threshold']) / len(df) # dataset percentage coverage

0.8698092058487976

In [13]:
df['label'] = df['BROWSE_NODE_ID'] * df['Threshold']
df['label'].nunique() # How many classes we have in the threshold

1299

In [14]:
new_df = df[df['Threshold'] == 1]
new_df = new_df.sample(frac=1, random_state=1234).reset_index(drop=True)

In [16]:
# label encoding
le = LabelEncoder()
le.fit(new_df['label'])
transformed = le.transform(new_df['label'])

inverse = {}
for i, j in zip(transformed, new_df['label'].values):
  inverse[i] = j

new_df['label'] = transformed
num_classes = new_df['label'].nunique()

In [18]:
train_df, valid_df = train_test_split(new_df, stratify=new_df['label'], random_state=1234)
train_df['text'] = train_df['TITLE'] + " " + train_df['BULLET_POINTS']
valid_df['text'] = valid_df['TITLE'] + " " + valid_df['BULLET_POINTS']

train_text = train_df['text'].values
valid_text = valid_df['text'].values

train_label = train_df['label'].values
valid_label = valid_df['label'].values

In [19]:
tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




## Creating model

In [20]:
def create_model():
    bert = transformers.TFBertModel.from_pretrained('bert-base-uncased', 
                                                    output_attentions=False, 
                                                    output_hidden_states=False)
    inputs = tf.keras.layers.Input((None,), dtype=tf.int32)
    mask = tf.keras.layers.Input((None,), dtype=tf.int32)
    bert_outputs = bert(inputs, attention_mask=mask)[1]
    preds = tf.keras.layers.Dense(num_classes, activation='softmax')(bert_outputs)
    return tf.keras.Model([inputs, mask], preds)

## Training Function

In [21]:
print('creating model')
with strategy.scope():
  model = create_model()
  optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
  loss_fun = tf.keras.losses.SparseCategoricalCrossentropy()
  model.compile(optimizer=optimizer, loss=loss_fun, metrics=['acc'])

creating model


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported


Cause: while/else statement not yet supported


Cause: while/else statement not yet supported




Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [22]:
def train_chunk(start, end):
  
  valid_idx = np.random.randint(0, len(valid_text)-1, size=25000)
  
  valid_text_chunk = list(valid_text[valid_idx])
  train_text_chunk = list(train_text[start:end])

  train_label_chunk = train_label[start:end]
  valid_label_chunk = valid_label[valid_idx]

  start = time.time()
  print('Tokenizing data')
  train_data = tokenizer(train_text_chunk, padding=True, truncation=True, max_length=MAX_LEN)
  valid_data = tokenizer(valid_text_chunk, padding=True, truncation=True, max_length=MAX_LEN)
  print('Tokenization complete, time taken = ', time.time() - start)

  start = time.time()
  print('creating dataset')
  train_dataset = tf.data.Dataset.from_tensor_slices(((train_data['input_ids'], 
                                                       train_data['attention_mask']), 
                                                       train_label_chunk)).batch(BATCH_SIZE)

  valid_dataset = tf.data.Dataset.from_tensor_slices(((valid_data['input_ids'], 
                                                       valid_data['attention_mask']), 
                                                       valid_label_chunk)).batch(BATCH_SIZE)
  print('dataset created, time taken = ', time.time() - start)
  
  print('TRAINING')
  model.fit(train_dataset,
            epochs=2,
            validation_data=valid_dataset)

  print('Saving weights')
  model.save_weights('/content/drive/MyDrive/AmazonML/tpu_model.h5')                                            

## Training the model

In [23]:
len(train_df)

1893807

In [24]:
start = 0
for i in range(100000, 1000000, 100000):
  train_chunk(start, i)
  start = i

Tokenizing data
Tokenization complete, time taken =  17.521661281585693
creating dataset
dataset created, time taken =  163.76400065422058
TRAINING
Epoch 1/2


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 250) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 250) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None,) dtype=int64>]








INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 250) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 250) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None,) dtype=int64>]












INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 250) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 250) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None,) dtype=int64>]










Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  20.27382755279541
creating dataset
dataset created, time taken =  165.28234601020813
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  21.129508018493652
creating dataset
dataset created, time taken =  165.91782641410828
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  21.51436686515808
creating dataset
dataset created, time taken =  165.53545331954956
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  20.564024925231934
creating dataset
dataset created, time taken =  165.85315036773682
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  20.564054012298584
creating dataset


KeyboardInterrupt: ignored

In [None]:
start = 0
for i in range(600000, 1000000, 100000):
  train_chunk(start, i)
  start = i

## Predicting on test set

In [25]:
model.load_weights('/content/drive/MyDrive/AmazonML/tpu_model.h5')
test = pd.read_csv('/content/drive/MyDrive/AmazonML/input/dataset/test.csv', escapechar='\\', quoting=csv.QUOTE_NONE)
test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [26]:
test.fillna('nan', inplace=True)
len(test)

110775

In [27]:
test_text = test['TITLE'] + " " + test['BULLET_POINTS']

In [28]:
test_data = tokenizer(list(test_text), padding=True, truncation=True, max_length=MAX_LEN)

In [30]:
test_dataset = tf.data.Dataset.from_tensor_slices(((test_data['input_ids'], 
                                                    test_data['attention_mask']), test['PRODUCT_ID'].values)).batch(BATCH_SIZE)

In [31]:
product_ids = []
sample_preds = []
for i in tqdm(test_dataset):
  product_ids += list(i[1].numpy())
  sample_preds += list(np.argmax(model.predict(i[0]), axis=1))

HBox(children=(FloatProgress(value=0.0, max=433.0), HTML(value='')))







INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 250) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 250) dtype=int32>]













In [32]:
# Temp sub
sub = pd.DataFrame()
sub['PRODUCT_ID'] = product_ids
sub['BROWSE_NODE_ID'] = sample_preds
sub['BROWSE_NODE_ID'] = sub['BROWSE_NODE_ID'].map(inverse)

In [33]:
sub.to_csv('/content/drive/MyDrive/AmazonML/BertBaselineSubmission.csv', index=False)

In [None]:
sub.head()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1140
1,2,15772
2,3,4821
3,4,125
4,5,8915


In [34]:
sub.head()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1140
1,2,3329
2,3,8390
3,4,125
4,5,7644
