In [None]:
from transformers import TransfoXLTokenizer, TFTransfoXLForSequenceClassification
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn import model_selection
from datasets import Dataset
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer that ships with the 'transfo-xl-wt103' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('transfo-xl-wt103')
# Reuse the end-of-sequence token as the padding token so later padding has a
# token to pad with.
# NOTE(review): presumably this tokenizer defines no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the 'transfo-xl-wt103' checkpoint with a sequence-classification head
# configured for two labels.
model = TFAutoModelForSequenceClassification.from_pretrained('transfo-xl-wt103',num_labels=2)
# Keep the model config consistent with the tokenizer: the padding id is set to
# the end-of-sequence id.
# NOTE(review): presumably the classification head uses pad_token_id to find the
# last non-padding token of each sequence — confirm against the model docs.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Read the sensitivity dataset and keep only the text and label columns.
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")[["Document", "Sensitivity"]]

# 80/20 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    data['Document'],
    data['Sensitivity'],
    test_size=0.2,
    random_state=5,
)

# Only the training split is converted to NumPy arrays here.
train_x, train_y = np.array(train_x), np.array(train_y)

# Token ids for every training document, tokenised one document at a time.
tokenised_train_x = [
    tokenizer(document, padding=True)['input_ids'] for document in train_x
]

In [None]:
# Sanity check: show the token ids produced for the first training document.
print(tokenised_train_x[0])

In [None]:
# Pair every training document with its label as {"label", "text"} records,
# load them into a DataFrame, and wrap that as a Hugging Face `Dataset`.
dataset = Dataset.from_pandas(
    pd.DataFrame(
        [{"label": label, "text": text} for text, label in zip(train_x, train_y)]
    )
)

def preprocess_function(examples):
    """Tokenise the "text" field of a batch of examples.

    Padding is disabled here; sequences are left at their own length.

    Args:
        examples: A batch from the dataset, indexable by the "text" key.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, padding=False)


# Apply the tokeniser over the dataset in batches.
tokenised_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Display the tokenised dataset as the cell output to inspect the result of
# the `map` call above.
tokenised_dataset

In [None]:
# Collator that pads each batch with the tokenizer and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Build the tf.data pipeline used for training.
# The raw "text" column is deliberately NOT listed in `columns`: it holds
# untokenised strings, which are not a model input and cannot be padded like the
# token-id tensors. Only the token ids (and the label) are forwarded.
# NOTE(review): "label" is listed in both `columns` and `label_cols`, as in the
# original cell — confirm whether it is also wanted as a model feature.
tf_train_dataset = tokenised_dataset.to_tf_dataset(
    columns=['input_ids', 'label'],
    label_cols=['label'],
    shuffle=True,
    batch_size=1,
    collate_fn=data_collator,
)

In [None]:
# (Also imported in the first cell; kept so this cell still runs on its own.)
from transformers import create_optimizer

num_epochs = 4
# Batch size used when building `tf_train_dataset`; keep the two in sync.
train_batch_size = 1
# Derive the number of steps from the actual training set instead of a
# hard-coded row count, so the schedule stays correct if the data or the
# train/test split changes.
batches_per_epoch = len(tokenised_dataset) // train_batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

# Optimizer plus learning-rate schedule spanning the whole training run,
# starting at 2e-5 with no warm-up steps.
optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
)

In [None]:
# The model is configured with two labels and returns raw logits, so use sparse
# categorical cross-entropy on logits (assumes the labels are integer class ids).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# NOTE(review): this rebinds `optimizer`, replacing the scheduled optimizer
# returned by `create_optimizer` in the previous cell with a constant-rate Adam.
# Drop this line if the learning-rate schedule is meant to be used.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
# Pass the loss object built above. The previous "binary_crossentropy" string
# expects probabilities rather than logits and does not fit a two-logit output
# with sparse labels, and it left `loss` unused.
model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [None]:
# Train on the already-batched tf.data pipeline. `batch_size` is not passed:
# the dataset yields batches itself, and Keras documents that `batch_size` must
# not be specified for dataset inputs. The epoch count reuses `num_epochs` so it
# matches the value used to size the learning-rate schedule.
history = model.fit(tf_train_dataset, epochs=num_epochs, verbose=1)