# Import libraries

In [None]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import BatchNormalization
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification


import matplotlib.pyplot as plt
import seaborn as sns

from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff


# Configure and Run Job on TPU

In [None]:
from cloud_tpu_client import Client
# Ask the Cloud TPU client to configure the TPU runtime using the version
# string of the locally installed TensorFlow. restart_type='ifNeeded'
# presumably restarts the TPU only when required -- confirm against the
# cloud_tpu_client documentation.
c = Client()
c.configure_tpu_version(tf.__version__, restart_type='ifNeeded')
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver could not be created; use the non-TPU strategy below.
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialize it before building the
    # distribution strategy on top of it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Number of replicas the chosen strategy keeps in sync.
print("REPLICAS: ", strategy.num_replicas_in_sync)


# Load data

In [None]:
# Read the training CSV (rows terminated by '\n') and keep only the rows
# whose 'Label' column is populated.
train = pd.read_csv(
    'train_v8.csv',
    lineterminator='\n',
)
train = train.loc[train['Label'].notna()]

In [None]:
""" Compile data into lists for train/valid split training """
# Columns are selected by position (5 -> labels, 4 -> texts) with .iloc.
# This replaces the row-by-row iterrows() loop, which was slow and relied on
# the deprecated positional fallback of Series[int] for string-named columns.
labels = train.iloc[:, 5].tolist()
texts = train.iloc[:, 4].tolist()

# Sorted list of the distinct labels; a label's position in this list is the
# integer class index used for training.
categories = sorted(set(labels))

""" One hot encoding """
def indicize_labels(labels, label_names=None):
    """Map each label to its integer class index.

    This is integer (index) encoding, not one-hot encoding: every label is
    replaced by its position in ``label_names``.

    Args:
        labels: Iterable of hashable label values.
        label_names: Optional sequence of distinct class names. When omitted,
            the module-level ``categories`` list is used.

    Returns:
        A list of integer indices. Labels that do not occur in
        ``label_names`` are skipped, so the result can be shorter than
        ``labels``.
    """
    if label_names is None:
        label_names = categories
    # Build the name -> index table once instead of rescanning the class
    # list for every label.
    index_of = {name: position for position, name in enumerate(label_names)}
    return [index_of[label] for label in labels if label in index_of]

# Prepare model for training

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers

from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer
from transformers import RobertaTokenizer

# Load the pretrained tokenizer for the 'roberta-large' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('roberta-large')
# Write the tokenizer files to the current working directory.
tokenizer.save_pretrained('.')

In [None]:
# Convert every label to its integer index within `categories`.
indices = indicize_labels(labels)

In [None]:
""" Tokenize text, set max-length to 256 words """
# Tokenize all texts in a single call: pad to a common length, truncate
# anything longer than 256 tokens, and return TensorFlow tensors.
inputs = tokenizer(
    texts,
    max_length=256,
    truncation=True,
    padding=True,
    return_tensors='tf',
)

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Global batch size: 18 examples for each replica kept in sync.
batch_size = strategy.num_replicas_in_sync * 18

## Split into train and validation sets (95/5 split)
Can be adapted for cross-validation.

In [None]:
# Pair each tokenized example with its integer class index.
dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), indices))

# The first 5% of the examples are held out for validation; the rest train.
val_data_size = int(0.05 * len(texts))
train_data_size = len(texts) - val_data_size

val_ds = dataset.take(val_data_size).batch(batch_size, drop_remainder=True)

# Keras ignores fit(shuffle=...) when the input is a tf.data.Dataset, so the
# training examples are shuffled here. The buffer covers the whole training
# split (at least 1, since shuffle() rejects an empty buffer) and the default
# reshuffle_each_iteration gives a new order every epoch.
train_ds = (
    dataset.skip(val_data_size)
    .shuffle(buffer_size=max(train_data_size, 1))
    .batch(batch_size, drop_remainder=True)
    .prefetch(buffer_size=AUTOTUNE)
)

In [None]:
with strategy.scope():
    # Size the classification head to the number of classes found in the
    # data. Without num_labels the checkpoint config default (2 labels) is
    # used, which is wrong whenever `categories` holds more than two classes.
    model = TFAutoModelForSequenceClassification.from_pretrained(
        'roberta-large',
        num_labels=len(categories),
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6, clipnorm=1.), # Tune as needed.
        # The model outputs raw logits and the targets are integer class ids.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.metrics.SparseCategoricalAccuracy()]
    )

# NOTE: per the Keras documentation, `shuffle` is ignored when the input is a
# tf.data.Dataset; any shuffling has to happen in the dataset pipeline.
train_history = model.fit(train_ds, validation_data=val_ds, epochs=3, verbose=1, shuffle=True)

In [None]:
# Save the trained model's weights to "model.h5".
model.save_weights("model.h5")