## Importing

In [1]:
!pip install transformers -q



In [22]:
import os
import csv
import time
import warnings

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import transformers
import tensorflow_hub as hub

from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder

warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

## Config

In [4]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU {tpu.master()}')
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

Running on TPU grpc://10.0.0.2:8470
REPLICAS: 8


In [5]:
BATCH_SIZE = 512
MAX_LEN = 250

## Loading data

In [9]:
train = pd.read_csv('../input/amazon/new_train.csv', 
                    usecols=['TITLE', 'BULLET_POINTS', 'label3', 'split3'])
train.fillna('nan', inplace=True)

num_classes = train['label3'].nunique()

## Prepraing dataset

In [10]:
le = LabelEncoder()
train['label3'] = le.fit_transform(train['label3'])

train_df, valid_df = train[train['split3'] == 'train'], train[train['split3'] == 'valid']
train_df['text'] = train_df['TITLE'] + " " + train_df['BULLET_POINTS']
valid_df['text'] = valid_df['TITLE'] + " " + valid_df['BULLET_POINTS']

train_text = train_df['text'].values
valid_text = valid_df['text'].values

train_label = train_df['label3'].values
valid_label = valid_df['label3'].values

In [8]:
tokenizer = transformers.RobertaTokenizerFast.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

## Creating model

In [11]:
def create_model():
    bert = transformers.TFRobertaModel.from_pretrained('roberta-base', 
                                                    output_attentions=False, 
                                                    output_hidden_states=False)
    inputs = tf.keras.layers.Input((None,), dtype=tf.int32)
    mask = tf.keras.layers.Input((None,), dtype=tf.int32)
    bert_outputs = bert(inputs, attention_mask=mask)[1]
    preds = tf.keras.layers.Dense(num_classes, activation='softmax')(bert_outputs)
    return tf.keras.Model([inputs, mask], preds)

## Training Function

In [12]:
print('creating model')
with strategy.scope():
    model = create_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
    loss_fun = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss_fun, metrics=['acc'])

creating model


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [13]:
def train_chunk(start, end):
  
    valid_idx = np.random.randint(0, len(valid_text)-1, size=25000)

    valid_text_chunk = list(valid_text[valid_idx])
    train_text_chunk = list(train_text[start:end])

    train_label_chunk = train_label[start:end]
    valid_label_chunk = valid_label[valid_idx]

    start = time.time()
    print('Tokenizing data')
    train_data = tokenizer(train_text_chunk, padding=True, truncation=True, max_length=MAX_LEN)
    valid_data = tokenizer(valid_text_chunk, padding=True, truncation=True, max_length=MAX_LEN)
    print('Tokenization complete, time taken = ', time.time() - start)

    start = time.time()
    print('creating dataset')
    train_dataset = tf.data.Dataset.from_tensor_slices(((train_data['input_ids'], 
                                                       train_data['attention_mask']), 
                                                       train_label_chunk)).batch(BATCH_SIZE)

    valid_dataset = tf.data.Dataset.from_tensor_slices(((valid_data['input_ids'], 
                                                       valid_data['attention_mask']), 
                                                       valid_label_chunk)).batch(BATCH_SIZE)
    print('dataset created, time taken = ', time.time() - start)

    print('TRAINING')
    model.fit(train_dataset,
            epochs=2,
            validation_data=valid_dataset)

    print('Saving weights')
    model.save_weights('./tpu_model_roberta.h5')                                            

## Training the model

In [14]:
len(train_df)

2123582

In [15]:
start = 0
for i in range(100000, 2000000, 100000):
    train_chunk(start, i)
    start = i

Tokenizing data
Tokenization complete, time taken =  27.52086901664734
creating dataset
dataset created, time taken =  211.2838213443756
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  26.5440616607666
creating dataset
dataset created, time taken =  211.30549120903015
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  26.593314170837402
creating dataset
dataset created, time taken =  210.4616186618805
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  26.633644342422485
creating dataset
dataset created, time taken =  210.57513117790222
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  25.61212992668152
creating dataset
dataset created, time taken =  211.98081827163696
TRAINING
Epoch 1/2
Epoch 2/2
Saving weights
Tokenizing data
Tokenization complete, time taken =  25.70943808555603
creating dataset
dat

## Predicting on test set

In [16]:
model.load_weights('./tpu_model_roberta.h5')
test = pd.read_csv('../input/amazontest/test.csv', escapechar='\\', quoting=csv.QUOTE_NONE)
test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [17]:
test.fillna('nan', inplace=True)
len(test)

110775

In [18]:
test_text = test['TITLE'] + " " + test['BULLET_POINTS']

In [19]:
test_data = tokenizer(list(test_text.values), padding=True, truncation=True, max_length=MAX_LEN)

In [20]:
test_dataset = tf.data.Dataset.from_tensor_slices(((test_data['input_ids'], 
                                                    test_data['attention_mask']), 
                                                   test['PRODUCT_ID'].values)).batch(BATCH_SIZE)

In [23]:
product_ids = []
sample_preds = []
for i in tqdm(test_dataset):
    product_ids += list(i[1].numpy())
    sample_preds += list(np.argmax(model.predict(i[0]), axis=1))

  0%|          | 0/217 [00:00<?, ?it/s]

In [25]:
train = pd.read_csv('../input/amazon/new_train.csv', 
                    usecols=['label3'])
train.fillna('nan', inplace=True)

In [26]:
le = LabelEncoder()
transformed = le.fit_transform(train['label3'])

In [27]:
inverse = {}

for i, j in zip(transformed, train['label3']):
    inverse[i] = j

In [28]:
# Temp sub
sub = pd.DataFrame()
sub['PRODUCT_ID'] = product_ids
sub['BROWSE_NODE_ID'] = sample_preds
sub['BROWSE_NODE_ID'] = sub['BROWSE_NODE_ID'].map(inverse)

In [29]:
sub.to_csv('./RobertaSubmission.csv', index=False)

In [30]:
sub.head()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1140
1,2,3329
2,3,0
3,4,125
4,5,8915
