## Importing

In [None]:
# !pip install transformers -q

In [None]:
import os
import csv
import time
import warnings

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import transformers
import tensorflow_hub as hub

from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Silence every Python warning for the rest of the notebook. Both calls
# install a catch-all 'ignore' filter, so the second is redundant but harmless.
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

## Config

In [None]:
# Look for a TPU; a ValueError from the resolver is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU {tpu.master()}')
except ValueError:
    tpu = None

if not tpu:
    # No TPU found: use whatever distribution strategy is currently active.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

REPLICAS = strategy.num_replicas_in_sync
AUTO = tf.data.experimental.AUTOTUNE
print(f'REPLICAS: {REPLICAS}')

In [None]:
# Number of examples per batch for every tf.data pipeline built below.
BATCH_SIZE = 256
# Maximum sequence length handed to the tokenizer; longer inputs are truncated.
MAX_LEN = 250

## Loading data

In [None]:
# Read only the columns needed for training and replace missing values
# with the literal string 'nan' so text concatenation never hits NaN.
train = pd.read_csv(
    '../input/amazon/new_train.csv',
    usecols=['TITLE', 'BULLET_POINTS', 'label3', 'split3'],
).fillna('nan')

# Class count before any filtering (recomputed after rare classes are dropped).
num_classes = train['label3'].nunique()

In [None]:
# Rows per label3 class, kept as a plain dict for per-row lookups.
temp = train.label3.value_counts()
value_counts = dict(zip(temp.index, temp))

In [None]:
# Flag rows whose class has at least 500 examples; rows of rarer classes
# get Threshold == 0 (and therefore label == 0) and are dropped below.
train['Threshold'] = train['label3'].map(lambda x: int(value_counts[x] >= 500))
train['label'] = train['label3'] * train['Threshold']

# Keep the frequent classes only, then shuffle deterministically.
new_df = (
    train[train['Threshold'] == 1]
    .sample(frac=1, random_state=1234)
    .reset_index(drop=True)
)

# Encode the surviving labels as consecutive integer ids.
le = LabelEncoder()
transformed = le.fit_transform(new_df['label'])

# encoded id -> original label value
inverse = dict(zip(transformed, new_df['label'].values))

new_df['label'] = transformed
num_classes = new_df['label'].nunique()

## Preparing dataset

In [None]:
# Stratified train/validation split with a fixed seed.
train_df, valid_df = train_test_split(new_df, random_state=1234, stratify=new_df['label'])

# Model input text is the title followed by the bullet points.
for split_df in (train_df, valid_df):
    split_df['text'] = split_df['TITLE'] + " " + split_df['BULLET_POINTS']

train_text, valid_text = train_df['text'].values, valid_df['text'].values
train_label, valid_label = train_df['label'].values, valid_df['label'].values

In [None]:
# Tokenizer for the same 'bert-large-uncased' checkpoint that create_model() loads.
tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-large-uncased')

## Creating model

In [None]:
def create_model():
    """Build a BERT-based classifier over ``num_classes`` labels.

    The model takes two variable-length int32 inputs (token ids and
    attention mask), runs them through the pretrained ``bert-large-uncased``
    encoder and feeds its ``pooler_output`` into a softmax ``Dense`` layer.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are ``[ids, mask]``
        and whose output is the per-class softmax.
    """
    encoder = transformers.TFBertModel.from_pretrained(
        'bert-large-uncased',
        output_attentions=False,
        output_hidden_states=False,
    )
    token_ids = tf.keras.layers.Input((None,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input((None,), dtype=tf.int32)

    encoded = encoder(token_ids, attention_mask=attention_mask)
    pooled = encoded['pooler_output']
    classifier = tf.keras.layers.Dense(num_classes, activation='softmax')
    return tf.keras.Model([token_ids, attention_mask], classifier(pooled))

## Training Function

In [None]:
print('creating model')
# Build and compile inside the strategy scope so the model is created
# under the distribution strategy selected in the config cell.
with strategy.scope():
    model = create_model()
    # NOTE(review): 5e-4 is far above the 2e-5..5e-5 range commonly used
    # when fine-tuning BERT — confirm this learning rate is intentional.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005, epsilon=1e-08)
    # Sparse loss: targets are the integer class ids from the label encoder,
    # predictions are the softmax output of create_model().
    loss_fun = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss_fun, metrics=['acc'])

In [None]:
def train_chunk(start, end):
    """Fine-tune the global ``model`` on ``train_text[start:end]`` for two epochs.

    A random sample of 25000 validation rows (drawn with replacement) is
    tokenized together with the training slice and used as validation data.
    After training, the weights are written to ``./tpu_model_alberta.h5``.

    Args:
        start: Index of the first training row of the chunk (inclusive).
        end: Index one past the last training row; values beyond the end of
            ``train_text`` are clipped by slicing.

    Returns:
        None. An empty slice is skipped without training or saving.
    """
    train_text_chunk = list(train_text[start:end])
    train_label_chunk = train_label[start:end]
    if not train_text_chunk:
        # Slicing past the end of the data yields nothing to tokenize or fit.
        print('Empty training chunk, skipping')
        return

    # randint's upper bound is exclusive, so pass len(valid_text) itself;
    # otherwise the last validation row could never be sampled.
    valid_idx = np.random.randint(0, len(valid_text), size=25000)
    valid_text_chunk = list(valid_text[valid_idx])
    valid_label_chunk = valid_label[valid_idx]

    # Separate timer variable so the `start` argument is not shadowed.
    tic = time.time()
    print('Tokenizing data')
    train_data = tokenizer(train_text_chunk, padding=True, truncation=True, max_length=MAX_LEN)
    valid_data = tokenizer(valid_text_chunk, padding=True, truncation=True, max_length=MAX_LEN)
    print('Tokenization complete, time taken = ', time.time() - tic)

    tic = time.time()
    print('creating dataset')
    # Each element is ((input_ids, attention_mask), label), matching the
    # two-input model returned by create_model().
    train_dataset = tf.data.Dataset.from_tensor_slices(((train_data['input_ids'],
                                                         train_data['attention_mask']),
                                                        train_label_chunk)).batch(BATCH_SIZE)

    valid_dataset = tf.data.Dataset.from_tensor_slices(((valid_data['input_ids'],
                                                         valid_data['attention_mask']),
                                                        valid_label_chunk)).batch(BATCH_SIZE)
    print('dataset created, time taken = ', time.time() - tic)

    print('TRAINING')
    model.fit(train_dataset,
              epochs=2,
              validation_data=valid_dataset)

    # Checkpoint after every chunk so progress survives an interrupted session.
    print('Saving weights')
    model.save_weights('./tpu_model_alberta.h5')

## Training the model

In [None]:
# Show the number of training rows (notebook cell output).
len(train_df)

In [None]:
# Train on consecutive 100000-row chunks. The loop bound is derived from the
# actual data size so train_chunk() is never called on an empty slice; at
# most the first 1,900,000 rows are used.
CHUNK_SIZE = 100000
MAX_TRAIN_ROWS = 1900000
for start in range(0, min(len(train_text), MAX_TRAIN_ROWS), CHUNK_SIZE):
    train_chunk(start, min(start + CHUNK_SIZE, MAX_TRAIN_ROWS))

## Predicting on test set

In [None]:
# Restore the weights most recently written by train_chunk().
model.load_weights('./tpu_model_alberta.h5')
# Parse the test file with quoting disabled and backslash as the escape character.
test = pd.read_csv('../input/amazontest/test.csv', escapechar='\\', quoting=csv.QUOTE_NONE)
test.head()

In [None]:
# Same missing-value treatment as the training data: NaN -> 'nan'.
test = test.fillna('nan')
len(test)

In [None]:
# Same "TITLE + space + BULLET_POINTS" concatenation used for the training text.
test_text = test['TITLE'] + " " + test['BULLET_POINTS']

In [None]:
# Tokenize the whole test set at once, padded to its longest sequence and
# truncated at MAX_LEN tokens.
test_data = tokenizer(
    test_text.tolist(),
    padding=True,
    truncation=True,
    max_length=MAX_LEN,
)

In [None]:
# Pair each (input_ids, attention_mask) example with its PRODUCT_ID so the
# id travels through batching together with the model inputs.
test_features = (test_data['input_ids'], test_data['attention_mask'])
test_ids = test['PRODUCT_ID'].values
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_ids)).batch(BATCH_SIZE)

In [None]:
# Predict batch by batch, collecting the product id and the arg-max class
# index of every test row in dataset order.
product_ids, sample_preds = [], []
for features, ids in tqdm(test_dataset):
    product_ids.extend(ids.numpy())
    sample_preds.extend(np.argmax(model.predict(features), axis=1))

In [None]:
# Reload just the label column of the training file (missing values -> 'nan').
train = pd.read_csv('../input/amazon/new_train.csv', usecols=['label3']).fillna('nan')

In [None]:
# NOTE(review): this encoder is fit on every label3 value, while the encoder
# used to build the training labels was fit only on labels with at least 500
# rows. The two encodings agree only if no class was filtered out.
le = LabelEncoder()
transformed = le.fit_transform(train['label3'])

In [None]:
# Map predicted class indices back to label3 values.
# The model was trained on labels encoded AFTER dropping classes with fewer
# than 500 rows, so the mapping must be rebuilt from that same filtered label
# set. LabelEncoder assigns ids in sorted order of the unique labels, so
# enumerating the sorted surviving labels reproduces the training encoding.
# The 500-row cut-off must stay equal to the one used when building the
# training labels.
label_counts = train['label3'].value_counts()
kept_labels = sorted(label_counts[label_counts >= 500].index)
inverse = dict(enumerate(kept_labels))

In [None]:
# Temp sub
sub = pd.DataFrame()
sub['PRODUCT_ID'] = product_ids
sub['BROWSE_NODE_ID'] = sample_preds
sub['BROWSE_NODE_ID'] = sub['BROWSE_NODE_ID'].map(inverse)

In [None]:
# Write the submission without the DataFrame index column.
sub.to_csv('./RobertaSubmission.csv', index=False)

In [None]:
# Preview the first rows of the submission (notebook cell output).
sub.head()