In [None]:
%%bash
# Fetch the TADI train/valid/test splits into the current working directory.
# -nc skips files that already exist, so re-running this cell does not leave
# TADI_*.tsv.1 duplicates behind; -nv keeps the log to one line per file.
wget -nc -nv https://storage.googleapis.com/tunbert-opensource-datasets/TADI_dataset/TADI_train.tsv
wget -nc -nv https://storage.googleapis.com/tunbert-opensource-datasets/TADI_dataset/TADI_valid.tsv
wget -nc -nv https://storage.googleapis.com/tunbert-opensource-datasets/TADI_dataset/TADI_test.tsv
# Pinned to the release this notebook was last executed with, so a fresh
# runtime resolves the same transformers API.
pip install -q transformers==4.20.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.8.1 pyyaml-6.0 tokenizers-0.12.1 transformers-4.20.1


--2022-06-27 07:52:57--  https://storage.googleapis.com/tunbert-opensource-datasets/TADI_dataset/TADI_train.tsv
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.183.128, 173.194.194.128, 173.194.192.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.183.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4401107 (4.2M) [text/tab-separated-values]
Saving to: ‘TADI_train.tsv’

     0K .......... .......... .......... .......... ..........  1% 53.8M 0s
    50K .......... .......... .......... .......... ..........  2% 77.3M 0s
   100K .......... .......... .......... .......... ..........  3% 79.4M 0s
   150K .......... .......... .......... .......... ..........  4% 29.1M 0s
   200K .......... .......... .......... .......... ..........  5% 26.5M 0s
   250K .......... .......... .......... .......... ..........  6% 31.6M 0s
   300K .......... .......... .......... .......... ..........  8% 25.0M 0s
   350K ..........

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

In [None]:
# Load the three TADI splits (tab-separated). Paths are relative to the
# working directory, which is where the download cell's wget saves the files,
# rather than being tied to Colab's /content directory.
train = pd.read_csv("TADI_train.tsv", sep="\t")
valid = pd.read_csv("TADI_valid.tsv", sep="\t")
test = pd.read_csv("TADI_test.tsv", sep="\t")

In [None]:
class cfg:
    """Hyper-parameters shared by the tokenisation, model and training cells."""

    # Hugging Face hub id of the pretrained checkpoint to fine-tune.
    pretrained_model = "UBC-NLP/MARBERTv2"

    # Number of tokens every sentence is padded / truncated to.
    input_shape = 128

    # Optimiser step size, examples per batch, and number of training epochs.
    learning_rate = 1e-4
    batch_size = 128
    epochs = 12

In [None]:
def get_strategy():
    """Pick the distribution strategy for training.

    Tries to connect to and initialise a TPU; if any step of that fails, falls
    back to the default strategy returned by ``tf.distribute.get_strategy()``.
    As a side effect, enables XLA JIT compilation and prints the number of
    replicas in sync.

    Returns:
        The ``tf.distribute`` strategy object to build and train the model under.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        # Was misspelled `experi3mental_connect_to_cluster`; the resulting
        # AttributeError was swallowed by a bare `except`, so the TPU branch
        # could never succeed.
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Non-deprecated spelling of tf.distribute.experimental.TPUStrategy.
        strategy = tf.distribute.TPUStrategy(tpu)
        print(f'TPU: {tpu.master()}')
    except Exception as err:
        # Best-effort fallback is intentional (no TPU attached is the common
        # case), but say why so a broken TPU setup is not silently ignored.
        print(f'TPU unavailable ({type(err).__name__}); using the default strategy')
        strategy = tf.distribute.get_strategy()

    tf.config.optimizer.set_jit(True)
    print(f'Replicas: {strategy.num_replicas_in_sync}')

    return strategy

strategy = get_strategy()

Replicas: 1


In [None]:
def tokenize(sentences):
    """Tokenise sentences with the tokenizer of ``cfg.pretrained_model``.

    Every sentence is truncated / padded to exactly ``cfg.input_shape`` tokens,
    special tokens included.

    Args:
        sentences: Iterable of strings to encode.

    Returns:
        dict with two tensors, one row per sentence:
        ``"input_ids"`` (token ids) and ``"attention_masks"`` (the tokenizer's
        attention mask).
    """
    tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_model)
    # One batched call instead of a Python loop over encode_plus; the encoding
    # options are unchanged. token_type_ids were requested before but never
    # used, so they are no longer produced.
    encoded = tokenizer(
        [str(sentence) for sentence in sentences],
        add_special_tokens=True,
        max_length=cfg.input_shape,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
    )
    return {
        "input_ids": tf.convert_to_tensor(encoded['input_ids']),
        "attention_masks": tf.convert_to_tensor(encoded['attention_mask']),
    }

In [None]:
def create_model():
    """Build the binary classifier on top of the pretrained encoder.

    The encoder's first output is sliced at sequence position 0 (the [CLS]
    token for BERT-style tokenizers) and fed to a single sigmoid unit.

    Returns:
        tf.keras.Model with inputs ``input_ids`` and ``attention_masks`` (each
        of length ``cfg.input_shape``) and one output layer named ``"output"``.
    """
    with strategy.scope():
        # NOTE: an AutoConfig(dropout=0.3, attention_dropout=0.3, seed=3,
        # output_hidden_states=True) used to be built here but was never passed
        # to from_pretrained, so it had no effect on the model. It is removed
        # rather than wired in, which keeps the trained model unchanged.
        # TODO(review): if extra dropout is wanted, the loaded class is
        # TFBertModel, whose config fields are `hidden_dropout_prob` and
        # `attention_probs_dropout_prob`; pass them via `config=`.
        transformer = TFAutoModel.from_pretrained(cfg.pretrained_model)

        input_ids = tf.keras.layers.Input(shape=(cfg.input_shape,), dtype='int32', name="input_ids")
        input_masks = tf.keras.layers.Input(shape=(cfg.input_shape,), dtype='int32', name="attention_masks")

        # [0] selects the encoder's first output; [:, 0, :] keeps position 0 of
        # every sequence as the sentence representation.
        embedding_layer = transformer(input_ids, attention_mask=input_masks)[0][:, 0, :]
        output = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(embedding_layer)

        model = tf.keras.Model(inputs=[input_ids, input_masks], outputs=output)

    return model

In [None]:
def make_dataset(data, shuffle=False, seed=42):
    """Turn a dataframe split into a batched ``tf.data.Dataset``.

    Args:
        data: DataFrame with a ``sentence`` column (text) and a ``label`` column.
        shuffle: If True, shuffle the examples (reshuffled every epoch) before
            batching. Leave False for evaluation splits so that predictions
            stay aligned with the dataframe's row order.
        seed: Seed for the shuffle, for reproducible runs.

    Returns:
        Dataset yielding ``((input_ids, attention_masks), label)`` batches of
        size ``cfg.batch_size``.
    """
    inputs = tokenize(data.sentence.values.astype(str))
    labels = data.label.values
    dataset = tf.data.Dataset.from_tensor_slices(
        ((inputs["input_ids"], inputs["attention_masks"]), labels)
    )
    if shuffle:
        # Keras ignores fit(shuffle=...) for tf.data inputs, so shuffling has
        # to happen here. The buffer spans the whole split for a full shuffle.
        dataset = dataset.shuffle(buffer_size=max(len(data), 1), seed=seed)
    return dataset.batch(cfg.batch_size).prefetch(tf.data.experimental.AUTOTUNE)

# Only the training split is shuffled; valid/test keep file order.
train_ds = make_dataset(train, shuffle=True)
valid_ds = make_dataset(valid)
test_ds = make_dataset(test)

In [None]:
with strategy.scope():
    model = create_model()

    # Write weights to model.h5 at the end of an epoch only when the
    # validation loss improves on the best value seen so far.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath='model.h5',
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_weights_only=True,
        save_freq='epoch',
        verbose=1,
    )

    # Binary cross-entropy on the sigmoid layer named "output".
    model.compile(
        loss={"output": tf.keras.losses.BinaryCrossentropy()},
        optimizer=tf.keras.optimizers.Adam(learning_rate=cfg.learning_rate),
    )

    model.fit(
        x=train_ds,
        validation_data=valid_ds,
        epochs=cfg.epochs,
        callbacks=[checkpoint],
        verbose=1,
    )


Downloading:   0%|          | 0.00/757 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/621M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at UBC-NLP/MARBERTv2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/12


In [None]:
def report_gen(predictions, labels):
    """Summarise classification quality on one split.

    Args:
        predictions: Predicted class labels (already discretised, not
            probabilities).
        labels: Ground-truth class labels, same length as ``predictions``.

    Returns:
        dict with keys ``"F1_macro"``, ``"Accuracy"``, ``"Precision_macro"``
        and ``"Recall_macro"``.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order. Passing them
    # reversed swaps precision and recall.
    return {
        "F1_macro": f1_score(labels, predictions, average="macro"),
        "Accuracy": accuracy_score(labels, predictions),
        # Was accuracy_score(..., average="macro"), which raises TypeError
        # because accuracy_score has no `average` parameter.
        "Precision_macro": precision_score(labels, predictions, average="macro"),
        "Recall_macro": recall_score(labels, predictions, average="macro"),
    }

# The sigmoid head returns one probability per example, shape (n, 1).
# Classification metrics need hard labels, so threshold at 0.5 and flatten.
# Assumes test["label"] is encoded as 0/1 — TODO confirm.
predictions = model.predict(test_ds)
predicted_labels = (predictions >= 0.5).astype("int32").ravel()
print(report_gen(predicted_labels, test["label"]))