In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from transformers import TFAutoModelWithLMHead, TFXLMRobertaForMaskedLM, TFXLMRobertaForTokenClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
import os

In [None]:
# Load the tokenizer and the masked-LM model from ONE checkpoint name so their vocabularies
# can never drift apart.
# The previous code loaded the English-only 'roberta-base' weights into the XLM-RoBERTa
# architecture class. Use the multilingual XLM-R checkpoint that class is built for,
# since this notebook encodes a non-English source column as well as an English one.
MODEL_NAME = 'jplu/tf-xlm-roberta-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = TFXLMRobertaForMaskedLM.from_pretrained(MODEL_NAME)

In [None]:
# The masked-LM head returns raw (un-normalised) logits. The string alias
# 'sparse_categorical_crossentropy' builds the loss with from_logits=False, which would
# interpret those logits as probabilities, so construct the loss object explicitly.
model.compile(
    optimizer = 'adam',
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits = True),
)

In [None]:
# Demo-sized settings shared by the cells below.
seq_len = 10     # tokens per encoded sentence (used as the tokenizer's max_length)
batch_size = 10  # examples per batch in the tf.data pipelines
epochs = 2       # number of epochs passed to model.fit

In [None]:
# Display the kernel's current working directory.
os.getcwd()

In [None]:
# Switch into the dataset folder so the CSV files can be opened by bare file name.
DATA_DIR = '/kaggle/input/shopee-code-league-20/_DS_Title_Translation'
os.chdir(DATA_DIR)

In [None]:
# List the entries of the current working directory.
os.listdir()

In [None]:
# Read the two dev files and place them side by side (column-wise, aligned on row index).
# The "split" column of the first file is dropped before joining.
source_frame = pd.read_csv("dev_tcn.csv").drop(columns = ["split"])
english_frame = pd.read_csv("dev_en.csv")
shopee_data = pd.concat([source_frame, english_frame], axis = 1)

In [None]:
# Display the combined frame to eyeball the paired columns.
shopee_data

In [None]:
# Encode the first 100 source titles to fixed-length token-id rows (padded to seq_len).
source_titles = shopee_data.text[:100]
source_encoding = tokenizer.batch_encode_plus(
    source_titles,
    return_attention_masks=False,
    pad_to_max_length= True,
    max_length= seq_len,
)
X = np.array(source_encoding['input_ids'])

In [None]:
# Encode the first 100 target translations to fixed-length token-id rows (padded to seq_len).
target_titles = shopee_data.translation_output[:100]
target_encoding = tokenizer.batch_encode_plus(
    target_titles,
    return_attention_masks=False,
    pad_to_max_length= True,
    max_length= seq_len,
)
y = np.array(target_encoding['input_ids'])

In [None]:
# Hold out part of the encoded pairs for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, random_state = 42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Display the encoded source token ids.
X

In [None]:
y
# We see that regardless of the encoded language, the start token is 0 and the end token is 2.
# If we want to predict one word of the sentence at a time instead of the entire sentence at once,
# we just need to watch for the model outputting 2, which marks the end of the sentence.

In [None]:
# A single sentence must be given a leading batch axis, i.e. shape (1, seq_len); otherwise
# the model sees shape (seq_len,) and assumes we are predicting seq_len different sentences.
first_sentence = X[0].reshape((1,seq_len))
test_res = model.predict(first_sentence)[0]

In [None]:
# Expected layout: (n_sentences, seq_len, vocab_size). Only one sentence was passed in,
# so the leading axis here is 1 (it is not the `batch_size` training constant).
test_res.shape

In [None]:
# Pick the highest-scoring vocabulary id at every position; these are the predicted
# token ids, which the tokenizer can decode back to text.
np.argmax(test_res, axis = 2)

In [None]:
# Training pipeline: per-example (input, target) pairs, repeated indefinitely,
# shuffled through a 100-element buffer, then grouped into batches.
train_pairs = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_pairs.repeat().shuffle(100).batch(batch_size)

# Validation pipeline: same pairs-to-batches flow, but a single pass (no repeat).
test_pairs = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = test_pairs.shuffle(100).batch(batch_size)

In [None]:
# train_dataset is already batched and repeats forever, so one epoch is
# ceil(n_samples / batch_size) steps. Passing the raw sample count as steps_per_epoch
# made every "epoch" run batch_size full passes over the training data.
steps_per_epoch = int(np.ceil(X_train.shape[0] / batch_size))

model.fit(train_dataset, steps_per_epoch = steps_per_epoch, epochs = epochs, validation_data = test_dataset)
# Training currently emits 'UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume
#     a large amount of memory.'
#
# Not quite sure why this happens; it seems to come from inside Hugging Face Transformers' model implementation
#     rather than from this notebook's code.

In [None]:
#It would be easy to add more sentences, increase the seq_len, etc. The code for how to make predictions has also been demonstrated.
    #Using the model itself as a direct translator isn't hard per se, and training for sufficiently long should yield a decent translator.
    #That being said, more optimization is necessary to build a better translator.

In [None]:
# Predict on the first held-out sentence, adding a leading batch axis of 1 first.
sample_sentence = X_test[0]
sample_batch = sample_sentence.reshape((1,seq_len))
pred = model.predict(sample_batch)[0]

In [None]:
# Highest-scoring token id per position, flattened to a 1-D sequence, then decoded to text.
pred_token_ids = np.argmax(pred, axis = 2).reshape((seq_len,))
decoded_pred = tokenizer.decode(pred_token_ids)

In [None]:
# The output doesn't make sense right now because seq_len is only 10 and only 100 samples
# were used, purely to demonstrate the code. This cell shows how the decoded predictions
# are obtained.
decoded_pred