In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

In [None]:
from transformers import AutoTokenizer,DataCollatorWithPadding,TFAutoModelForSequenceClassification
import datasets
import tensorflow as tf
import numpy as np

In [None]:
# Hub identifier used to load the tokenizer.
checkpoint="google/electra-small-discriminator"
# Directory the classifier is loaded from below and written back to by the final save cell.
# NOTE(review): the path sits under /content/drive — presumably Google Drive has to be mounted
# before this cell runs; no mount call is visible in this notebook.
path="/content/drive/MyDrive/Models/electra"


tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The model is restored from `path`, not from `checkpoint`, so a previously saved model must
# already exist in that directory.
# NOTE(review): assumes the saved model was built from the same checkpoint as the tokenizer — confirm.
model=TFAutoModelForSequenceClassification.from_pretrained(path)

In [None]:
# Load the WANLI dataset by its Hugging Face Hub identifier.
dataset= datasets.load_dataset("alisawuffles/WANLI")

In [None]:
# Display the loaded dataset object (last expression of the cell).
dataset

In [None]:
def tokenize_function(example):
  """Encode premise/hypothesis pairs with the notebook-level `tokenizer`.

  Args:
    example: mapping with "premise" and "hypothesis" entries.

  Returns:
    Whatever the tokenizer returns for the pair, with truncation enabled.
  """
  premises = example["premise"]
  hypotheses = example["hypothesis"]
  return tokenizer(premises, hypotheses, truncation=True)

# Run tokenize_function over `dataset` in batched mode and keep the result for the split selection below.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Training examples come from the "train" split.
trainset=tokenized_datasets["train"]
# The "test" split is what later gets passed to fit() as validation data.
validset=tokenized_datasets["test"]

In [None]:
# Display the first tokenized training example.
trainset[0]

In [None]:
# Integer class id for each string label stored in the "gold" column.
label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}

# Build the integer "label" column for each split.
# The "gold" column is read in a single column access; indexing the dataset row by row
# (`trainset[i]['gold']`) decodes every field of every row and is much slower.
# A KeyError is raised if a "gold" value is missing from label_to_int.
labels = [label_to_int[gold] for gold in trainset["gold"]]
# Drop a "label" column left over from an earlier run of this cell, so re-running it
# does not make add_column fail on a duplicate column name.
if "label" in trainset.column_names:
  trainset = trainset.remove_columns("label")
trainset = trainset.add_column('label', labels)

labels = [label_to_int[gold] for gold in validset["gold"]]
if "label" in validset.column_names:
  validset = validset.remove_columns("label")
validset = validset.add_column('label', labels)

In [None]:
# Display the training split after the "label" column has been added.
trainset

In [None]:
# Batch size used when building the tf.data pipelines.
batchsize=32
# Number of epochs passed to fit(); also multiplies the step count of the learning-rate schedule.
numepochs=3

In [None]:
# Collator that pads the examples of a batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Feed token_type_ids together with input_ids/attention_mask: the inputs are sentence pairs and
# get_predict() passes the tokenizer's complete output (segment ids included) to the model, so
# training on batches without them would not match what the model receives at inference time.
# Columns the tokenizer did not produce are skipped rather than requested.
model_input_columns = [
    name
    for name in ("attention_mask", "input_ids", "token_type_ids")
    if name in trainset.column_names
]

# Shuffled, batched training pipeline; labels come from the integer "label" column.
tf_train_dataset = trainset.to_tf_dataset(
    columns=model_input_columns,
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batchsize
)

# Validation pipeline: same inputs, kept in dataset order.
tf_validation_dataset = validset.to_tf_dataset(
    columns=model_input_columns,
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=batchsize
)

In [None]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
# NOTE(review): BinaryCrossentropy is imported but not used anywhere in the visible cells.
from tensorflow.keras.losses import BinaryCrossentropy

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_steps = len(tf_train_dataset) * numepochs
# Learning rate goes from 2e-5 down to 0.5e-5 over num_train_steps optimizer steps.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=2e-5, end_learning_rate=0.5e-5, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

# No `loss` argument is given to compile().
# NOTE(review): this presumably relies on the Transformers Keras model supplying its own
# task loss — confirm for the installed transformers version.
model.compile(
    optimizer=opt,
    metrics=["accuracy"]
)

In [None]:
# First training stage: fit on the WANLI pipelines for `numepochs` epochs, validating after each epoch.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=numepochs)

In [None]:
# Load MultiNLI; this rebinds `dataset`, replacing the WANLI object loaded earlier.
dataset= datasets.load_dataset("multi_nli")

In [None]:
def tokenize_function(example):
  """Tokenize the "premise"/"hypothesis" fields of `example` as sentence pairs.

  Uses the notebook-level `tokenizer` with truncation switched on and returns its
  output unchanged. This repeats the definition from the earlier tokenization cell.
  """
  first_sentences, second_sentences = example["premise"], example["hypothesis"]
  encoded = tokenizer(first_sentences, second_sentences, truncation=True)
  return encoded

# Tokenize the MultiNLI data in batched mode.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Keep only rows whose label is not -1.
# NOTE(review): presumably -1 marks examples without a gold label — confirm against the dataset card.
tokenized_datasets=tokenized_datasets.filter(lambda x : x['label']!=-1)

In [None]:
# Rebind the split variables to the MultiNLI data for the second training stage.
trainset=tokenized_datasets["train"]
# Validation uses the "validation_matched" split.
validset=tokenized_datasets["validation_matched"]

In [None]:
# Batch size and epoch count for the MultiNLI stage (same values as the earlier stage).
batchsize=32
numepochs=3

In [None]:
# Collator that pads the examples of a batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Feed token_type_ids together with input_ids/attention_mask: the inputs are sentence pairs and
# get_predict() passes the tokenizer's complete output (segment ids included) to the model, so
# training on batches without them would not match what the model receives at inference time.
# Columns the tokenizer did not produce are skipped rather than requested.
model_input_columns = [
    name
    for name in ("attention_mask", "input_ids", "token_type_ids")
    if name in trainset.column_names
]

# Shuffled, batched training pipeline; labels come from the "label" column.
tf_train_dataset = trainset.to_tf_dataset(
    columns=model_input_columns,
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batchsize
)

# Validation pipeline: same inputs, kept in dataset order.
tf_validation_dataset = validset.to_tf_dataset(
    columns=model_input_columns,
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=batchsize
)

In [None]:
# These three imports repeat the ones in the earlier optimizer cell.
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
# NOTE(review): BinaryCrossentropy is imported but not used anywhere in the visible cells.
from tensorflow.keras.losses import BinaryCrossentropy

# Total optimizer steps for this stage: batches per epoch times number of epochs.
num_train_steps = len(tf_train_dataset) * numepochs
# Learning rate goes from 2e-5 down to 0 over num_train_steps steps
# (the earlier stage stopped at 0.5e-5 instead).
lr_scheduler = PolynomialDecay(
    initial_learning_rate=2e-5, end_learning_rate=0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

# Re-compile the already trained model with the fresh optimizer; no `loss` argument is given.
# NOTE(review): this presumably relies on the Transformers Keras model supplying its own
# task loss — confirm for the installed transformers version.
model.compile(
    optimizer=opt,
    metrics=["accuracy"]
)

In [None]:
# Second training stage: continue fitting the same model on the MultiNLI pipelines.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=numepochs)

In [None]:
def get_predict(sent1,sent2):
  """Return the classifier logits for the sentence pair (`sent1`, `sent2`).

  The pair is encoded with the notebook-level `tokenizer` and the encoding is passed
  as keyword arguments to the notebook-level `model`.

  Truncation is enabled, as in tokenize_function used for the training data, so an
  over-long passage is cut by the tokenizer instead of being sent to the model at full length.

  Args:
    sent1: first text of the pair (the passage / premise).
    sent2: second text of the pair (the hypothesis).

  Returns:
    The `.logits` attribute of the model output.
  """
  inputs = tokenizer(sent1, sent2, truncation=True, return_tensors="tf")
  return model(**inputs).logits


In [None]:
# Passage used by the inference demo below. The newline characters of the triple-quoted
# literal are removed, so the two sentence lines are joined without a separator.
piercontest="""
bob tom and lucy are having fun. they did not have any argument. what happened was not upon them.
there is a storm and the boat is wrecked. bob, tom, lucy cast away on an island.
  """.replace("\n","")

In [None]:
# Statement checked against the passage: the passage is the first text of the pair, this is the second.
question=" lucy is dead"
response=(get_predict(piercontest,question))
# The passage itself is not printed here; the literal word "pier" is.
print(f'Passage: pier\nQuestion: {question}')
# Softmax over the logits, i.e. per-class probabilities.
print(tf.math.softmax(response))
# Name of the highest-scoring class, looked up in the model config's id2label mapping.
print(model.config.id2label[np.argmax(response)])

In [None]:
# Display the tokenizer's encoding of a short sample string.
tokenizer("bob tom")

In [None]:
# Write the trained model back to `path`, the same directory it was loaded from at the top.
# Only the model is saved by this call; the tokenizer is not.
model.save_pretrained(path)