In [None]:
!pip install transformers
!git clone https://github.com/fbougares/TSAC.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'TSAC' already exists and is not an empty directory.


In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, TFAutoModel


In [None]:
# Load the labelled training files: one example per line, label 1 for the
# "pos" file and 0 for the "neg" file.
pos_filename="/content/TSAC/train_pos.txt"
# Decode explicitly as UTF-8 so the text does not depend on the runtime's locale.
with open(pos_filename, encoding="utf-8") as file:
    pos_lines = [[line.rstrip(),1] for line in file]

neg_filename="/content/TSAC/train_neg.txt"
with open(neg_filename, encoding="utf-8") as file:
    neg_lines = [[line.rstrip(),0] for line in file]

pos_df=pd.DataFrame(pos_lines,columns=["text","label"])
neg_df=pd.DataFrame(neg_lines,columns=["text","label"])

# Seed the shuffle: with an unseeded shuffle the row order differs on every run,
# so the random_state given to train_test_split below would not produce a
# reproducible train/validation split.
train=pd.concat([pos_df,neg_df]).sample(frac=1, random_state=42)

# Hold out 20% of the shuffled examples for validation.
X_train, X_test, y_train, y_test = train_test_split(train["text"].values.astype(str), train["label"].values.astype(int), test_size=0.2, random_state=42)

# Rebuild plain two-column frames; `train` is rebound to the training split only.
train=pd.DataFrame({"text":X_train,"label":y_train})
valid=pd.DataFrame({"text":X_test,"label":y_test})

In [None]:
# Load the labelled test files: one example per line, label 1 for the
# "pos" file and 0 for the "neg" file.
pos_filename="/content/TSAC/test_pos.txt"
# Decode explicitly as UTF-8 so the text does not depend on the runtime's locale.
with open(pos_filename, encoding="utf-8") as file:
    pos_lines = [[line.rstrip(),1] for line in file]

neg_filename="/content/TSAC/test_neg.txt"
with open(neg_filename, encoding="utf-8") as file:
    neg_lines = [[line.rstrip(),0] for line in file]

pos_df=pd.DataFrame(pos_lines,columns=["text","label"])
neg_df=pd.DataFrame(neg_lines,columns=["text","label"])
# Seeded shuffle so the test-set row order is the same on every run.
test=pd.concat([pos_df,neg_df]).sample(frac=1, random_state=42)

In [None]:
class cfg:
    """Model choice and hyper-parameters shared by the cells of this notebook."""

    # Checkpoint identifier handed to the `from_pretrained` loaders.
    pretrained_model = "UBC-NLP/MARBERTv2"

    # Length every sentence is padded/truncated to; also the width of the Keras inputs.
    input_shape = 128

    # Step size passed to the Adam optimizer.
    learning_rate = 1e-4

    # Number of examples per batch of the tf.data datasets.
    batch_size = 32

    # Number of epochs requested from `model.fit`.
    epochs = 9

In [None]:
def get_strategy():
    """Return the `tf.distribute` strategy the model is built and trained under.

    Tries to resolve, connect to and initialise a TPU. If any of those steps
    fails, the failure is reported and TensorFlow's current default strategy is
    used instead. XLA JIT compilation is switched on in both cases and the
    number of replicas is printed.

    Returns:
        The strategy object to use for `strategy.scope()`.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.TPUStrategy(tpu)
        print(f'TPU: {tpu.master()}')
    except Exception as err:
        # Best-effort fallback, but say why: a silently swallowed error here is
        # what previously hid a misspelled API name and disabled the TPU path.
        print(f'TPU not used ({type(err).__name__}: {err}); falling back to the default strategy')
        strategy = tf.distribute.get_strategy()

    tf.config.optimizer.set_jit(True)
    print(f'Replicas: {strategy.num_replicas_in_sync}')

    return strategy

strategy=get_strategy()

Replicas: 1


In [None]:
def tokenize(sentences):
    """Encode sentences into fixed-length model inputs.

    Every sentence is tokenized with the tokenizer of `cfg.pretrained_model`,
    with special tokens added, then truncated or padded to exactly
    `cfg.input_shape` tokens.

    Args:
        sentences: iterable of texts to encode.

    Returns:
        dict with keys "input_ids" and "attention_masks", each an int32 tensor
        of shape (number of sentences, cfg.input_shape).
    """
    texts = list(sentences)
    if not texts:
        # Keep the 2-D int32 contract for empty input instead of handing back
        # a 1-D float tensor built from an empty Python list.
        empty = tf.zeros((0, cfg.input_shape), dtype=tf.int32)
        return {"input_ids": empty, "attention_masks": empty}

    tokenizer=AutoTokenizer.from_pretrained(cfg.pretrained_model)
    # One batched call replaces the per-sentence `encode_plus` loop; token type
    # ids are not requested because nothing downstream consumes them.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=cfg.input_shape,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
    )
    return {
        "input_ids": tf.convert_to_tensor(encoded["input_ids"], dtype=tf.int32),
        "attention_masks": tf.convert_to_tensor(encoded["attention_mask"], dtype=tf.int32),
    }

In [None]:
def create_model():
    """Build the (uncompiled) binary classifier on top of the pretrained encoder.

    The model takes two int32 inputs of width `cfg.input_shape`, named
    "input_ids" and "attention_masks", runs them through the encoder loaded
    from `cfg.pretrained_model`, keeps the vector at sequence position 0 and
    feeds it to a single sigmoid unit named "output".

    Returns:
        A `tf.keras.Model` with inputs [input_ids, attention_masks] and one
        sigmoid output per example.
    """
    with strategy.scope():
        # The encoder is loaded with the checkpoint's own configuration. A config
        # object used to be built here but was never passed to `from_pretrained`,
        # so it had no effect and has been dropped.
        # NOTE(review): if custom dropout is wanted, pass `config=` here using the
        # parameter names of this architecture's config class - confirm names.
        transformer = TFAutoModel.from_pretrained(cfg.pretrained_model)

        input_ids = tf.keras.layers.Input(shape=(cfg.input_shape,), dtype='int32', name="input_ids")
        input_masks = tf.keras.layers.Input(shape=(cfg.input_shape,), dtype='int32', name="attention_masks")

        # [0] selects the first encoder output (presumably the per-token hidden
        # states - confirm); [:, 0, :] keeps only the first position of each sequence.
        embedding_layer = transformer(input_ids, attention_mask=input_masks)[0][:,0,:]
        output = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(embedding_layer)

        model = tf.keras.Model(inputs=[input_ids, input_masks], outputs=output)

    return model

In [None]:
def make_dataset(data):
    """Convert a frame with `text` and `label` columns into a batched dataset.

    Args:
        data: DataFrame whose `text` column is tokenized via `tokenize` and
            whose `label` column supplies the targets.

    Returns:
        A `tf.data.Dataset` of ((input_ids, attention_masks), label) elements,
        grouped into batches of `cfg.batch_size`.
    """
    encoded = tokenize(data.text.values)
    features = (encoded["input_ids"], encoded["attention_masks"])
    dataset = tf.data.Dataset.from_tensor_slices((features, data.label))
    return dataset.batch(cfg.batch_size)

# Build the three datasets in the same order as before: train, validation, test.
train_ds, valid_ds, test_ds = (make_dataset(frame) for frame in (train, valid, test))

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
with strategy.scope():

  model=create_model()
  # Write the weights to model.h5 only on epochs where the validation loss improves.
  checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5',monitor='val_loss',mode='min',save_best_only=True,save_weights_only=True,save_freq='epoch',verbose=1)
  model.compile(loss={"output":tf.keras.losses.BinaryCrossentropy()},
                      optimizer=tf.keras.optimizers.Adam(cfg.learning_rate))

  model.fit(train_ds,epochs=cfg.epochs,verbose=1,validation_data = valid_ds,callbacks = [checkpoint])

  # fit() leaves the last epoch's weights in memory. Restore the best checkpoint
  # so that later predictions use the weights that were actually selected.
  model.load_weights('model.h5')


Downloading:   0%|          | 0.00/621M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at UBC-NLP/MARBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/12


In [None]:
def report_gen(predictions,labels):
    """Compute accuracy and macro-averaged F1, precision and recall.

    Args:
        predictions: predicted class labels, one per example.
        labels: ground-truth class labels, in the same order as `predictions`.

    Returns:
        dict with the keys "F1_macro", "Accuracy", "Precision_macro" and
        "Recall_macro".
    """
    # scikit-learn metrics take (y_true, y_pred). Precision and recall are not
    # symmetric in their arguments, so the ground truth must come first.
    report={
    "F1_macro":f1_score(labels,predictions,average="macro"),
    "Accuracy":accuracy_score(labels,predictions),
    # precision_score, not accuracy_score: the latter has no `average` argument.
    "Precision_macro":precision_score(labels,predictions,average="macro"),
    "Recall_macro":recall_score(labels,predictions,average="macro")
    }
    return report

# One sigmoid output per test example (the model ends in a single-unit Dense layer).
probabilities=model.predict(test_ds)
# Flatten before thresholding so each prediction is a plain int, instead of
# converting one-element arrays to scalars row by row.
# NOTE(review): the positive cut-off is 0.7 rather than 0.5 - confirm this is intentional.
predictions=(probabilities.reshape(-1)>0.7).astype(int).tolist()
print(report_gen(predictions,test["label"]))