In [None]:
# Install the Hugging Face `transformers` package into the runtime.
# No version is pinned here; the recorded output of this cell shows transformers 4.20.1 being installed.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 6.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [None]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModel, AutoConfig
import tensorflow as tf
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
from sklearn.model_selection import StratifiedShuffleSplit


In [None]:
# Read the dataset from the Colab filesystem.
data = pd.read_csv("/content/aug_dataset.csv")

# Integer id assigned to each value of the "class" column; a value that is not a key
# of this dict is mapped to NaN by Series.map.
class_to_label = {"normal": 0, "abusive": 1, "hate": 2}
data["label"] = data["class"].map(class_to_label)

# Show how many rows each class has.
data["class"].value_counts()

normal     2300
abusive    2028
hate       1724
Name: class, dtype: int64

In [None]:
def strat_train_test_split(data,target,rate=0.1):
    """Split a DataFrame into two subsets, stratified on one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Its index may be arbitrary (it does not have to be 0..n-1).
    target : str
        Name of the column whose values drive the stratification.
    rate : float, default 0.1
        Forwarded to ``StratifiedShuffleSplit`` as ``test_size``.

    Returns
    -------
    tuple
        ``(train_set, test_set)``: the rows of ``data`` selected for each side of the
        split. The original index of the rows is preserved.
    """
    # random_state is fixed so the same split is produced on every run.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=rate, random_state=41)
    # n_splits=1, so the splitter produces exactly one (train, test) pair.
    train_positions, test_positions = next(splitter.split(data, data[target]))
    # The splitter returns *positional* indices, so rows must be picked with .iloc;
    # label-based .loc is only equivalent when the index happens to be 0..n-1.
    return data.iloc[train_positions], data.iloc[test_positions]

# First split: hold out 20% of the rows as the test set.
train_set, test_set = strat_train_test_split(data, target="label", rate=0.2)

# Give the remaining rows a fresh 0..n-1 index before splitting them again
# (reset_index() keeps the previous index as an extra "index" column).
train_set = train_set.reset_index()

# Second split: hold out 10% of the remaining rows as the validation set.
train_set, valid_set = strat_train_test_split(train_set, target="label", rate=0.1)

In [None]:
class cfg:
    """Namespace holding the settings shared by the cells of this notebook."""

    # Checkpoint name handed to the `from_pretrained` loaders.
    pretrained_model = "UBC-NLP/ARBERT"
    # Token length every sentence is padded/truncated to; also the model input width.
    input_shape = 128
    # Learning rate given to the Adam optimizer.
    learning_rate = 1e-4
    # Number of examples per batch of the tf.data datasets.
    batch_size = 32
    # Number of epochs passed to `model.fit`.
    epochs = 9

In [None]:
def get_strategy():
    """Return the tf.distribute strategy to build and train the model under.

    Tries to detect a TPU; when one is found the TPU system is initialised and a
    TPU strategy is returned. When TPU detection fails, the default strategy from
    ``tf.distribute.get_strategy()`` is returned instead.

    XLA JIT compilation is switched on in both cases, and the number of replicas
    of the chosen strategy is printed.

    Returns
    -------
    The selected ``tf.distribute`` strategy object.
    """
    try:
        # Only the detection step is guarded: a failure here means "no TPU available".
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    except (ValueError, RuntimeError):
        strategy = tf.distribute.get_strategy()
    else:
        # Errors raised while connecting to / initialising a detected TPU are real
        # problems and are deliberately not swallowed.
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
        print(f'TPU: {tpu.master()}')

    tf.config.optimizer.set_jit(True)
    print(f'Replicas: {strategy.num_replicas_in_sync}')

    return strategy

# Distribution strategy used by the later cells (model creation and training run inside strategy.scope()).
strategy=get_strategy()

Replicas: 1


In [None]:
def tokenize(sentences, tokenizer=None):
    """Encode sentences into fixed-length id and attention-mask tensors.

    Parameters
    ----------
    sentences : iterable of str
        Texts to encode.
    tokenizer : callable, optional
        Tokenizer to use. When omitted, the tokenizer of ``cfg.pretrained_model`` is
        loaded with ``AutoTokenizer.from_pretrained``. Passing one in lets callers
        load it once and reuse it across several calls.

    Returns
    -------
    dict
        ``{"input_ids": tensor, "attention_masks": tensor}``; every sentence is
        padded or truncated to ``cfg.input_shape`` tokens.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_model)

    texts = list(sentences)
    if not texts:
        # Nothing to encode: return empty tensors without invoking the tokenizer,
        # which is what the former per-sentence loop produced for empty input.
        return {"input_ids":tf.convert_to_tensor([]),"attention_masks":tf.convert_to_tensor([])}

    # One batched call instead of one tokenizer call per sentence.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=cfg.input_shape,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
    )
    return {
        "input_ids":tf.convert_to_tensor(encoded["input_ids"]),
        "attention_masks":tf.convert_to_tensor(encoded["attention_mask"]),
    }

def make_dataset(data):
    """Turn a frame with ``text`` and ``label`` columns into a batched tf.data.Dataset.

    Each element of the dataset is ``((input_ids, attention_masks), label)``, and
    elements are grouped into batches of ``cfg.batch_size``.
    """
    # Texts are cast to str before tokenisation.
    encoded = tokenize(data.text.values.astype(str))
    features = (encoded["input_ids"], encoded["attention_masks"])
    labels = data.label
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    return dataset.batch(cfg.batch_size)

# Tokenise and batch the three splits, in train / validation / test order.
train_ds, valid_ds, test_ds = (
    make_dataset(split) for split in (train_set, valid_set, test_set)
)


Downloading:   0%|          | 0.00/374 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def create_model(num_classes=3):
    """Build the classifier: pretrained transformer + softmax layer on the first token.

    Parameters
    ----------
    num_classes : int, default 3
        Number of units of the softmax output layer.

    Returns
    -------
    tf.keras.Model
        Uncompiled model with two int32 inputs named ``input_ids`` and
        ``attention_masks`` (each ``cfg.input_shape`` wide) and one output named
        ``output``.
    """
    with strategy.scope():
        # NOTE(review): an AutoConfig carrying dropout settings used to be built here but
        # was never passed to from_pretrained, so it had no effect on the model. It has
        # been removed; if custom dropout is wanted, build a config and pass it via
        # `config=` (verify the dropout field names of the checkpoint's config class).
        transformer = TFAutoModel.from_pretrained(cfg.pretrained_model)

        input_ids = tf.keras.layers.Input(shape=(cfg.input_shape,), dtype='int32', name="input_ids")
        input_masks = tf.keras.layers.Input(shape=(cfg.input_shape,), dtype='int32', name="attention_masks")

        # First element of the transformer output, then the vector at sequence position 0.
        sequence_output = transformer(input_ids, attention_mask=input_masks)[0]
        first_token_embedding = sequence_output[:, 0, :]

        output = tf.keras.layers.Dense(num_classes, activation="softmax", name="output")(first_token_embedding)

        model = tf.keras.Model(inputs=[input_ids, input_masks], outputs=output)

    return model

In [None]:
# File the ModelCheckpoint callback writes the best weights to.
best_weights_path = 'model.h5'

with strategy.scope():
    model = create_model()

    # Keep only the weights of the epoch with the lowest validation loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        best_weights_path,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_weights_only=True,
        save_freq='epoch',
        verbose=1,
    )
    model.compile(
        loss={"output": tf.keras.losses.SparseCategoricalCrossentropy()},
        optimizer=tf.keras.optimizers.Adam(cfg.learning_rate),
    )

    model.fit(train_ds, epochs=cfg.epochs, verbose=1, validation_data=valid_ds, callbacks=[checkpoint])

    # After fit() the model holds the weights of the *last* epoch. Reload the
    # checkpointed best weights so that later cells evaluate the best model
    # rather than the final (possibly overfitted) one.
    model.load_weights(best_weights_path)


Downloading:   0%|          | 0.00/621M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at UBC-NLP/ARBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/9
Epoch 1: val_loss improved from inf to 0.75678, saving model to model.h5
Epoch 2/9
Epoch 2: val_loss did not improve from 0.75678
Epoch 3/9
Epoch 3: val_loss did not improve from 0.75678
Epoch 4/9
Epoch 4: val_loss did not improve from 0.75678
Epoch 5/9
Epoch 5: val_loss did not improve from 0.75678
Epoch 6/9
Epoch 6: val_loss did not improve from 0.75678
Epoch 7/9
Epoch 7: val_loss did not improve from 0.75678
Epoch 8/9
Epoch 8: val_loss did not improve from 0.75678
Epoch 9/9
Epoch 9: val_loss did not improve from 0.75678


In [None]:
def report_gen(predictions,labels):
    """Compute macro F1, accuracy, macro precision and macro recall.

    Parameters
    ----------
    predictions : array-like
        Predicted class ids.
    labels : array-like
        Ground-truth class ids.

    Returns
    -------
    dict
        Keys ``"F1_macro"``, ``"Accuracy"``, ``"Precision_macro"`` and
        ``"Recall_macro"`` mapped to the corresponding scores.
    """
    # scikit-learn metrics take the ground truth first and the predictions second.
    # Precision and recall are not symmetric in those two arguments, so the roles
    # are passed by keyword to make the order impossible to mix up.
    report={
    "F1_macro":f1_score(y_true=labels,y_pred=predictions,average="macro"),
    "Accuracy":accuracy_score(y_true=labels,y_pred=predictions),
    "Precision_macro":precision_score(y_true=labels,y_pred=predictions,average="macro"),
    "Recall_macro":recall_score(y_true=labels,y_pred=predictions,average="macro")
    }
    return report

# Predict on the test set and keep, for every row, the index of the highest-scoring class.
predictions = tf.argmax(model.predict(test_ds), axis=1)

# Score the predicted class ids against the test labels.
print(report_gen(predictions, test_set["label"]))

{'F1_macro': 0.6025435451400734, 'Accuracy': 0.6077621800165153, 'Precision_macro': 0.6076568858427929, 'Recall_macro': 0.6198744157595693}
