In [1]:
! pip install -q accelerate datasets evaluate sacremoses

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/270.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m266.2/270.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/507.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [2]:
!pip install git+https://github.com/Dadmatech/DadmaTools.git

Collecting git+https://github.com/Dadmatech/DadmaTools.git
  Cloning https://github.com/Dadmatech/DadmaTools.git to /tmp/pip-req-build-67s2o3sx
  Running command git clone --filter=blob:none --quiet https://github.com/Dadmatech/DadmaTools.git /tmp/pip-req-build-67s2o3sx
  Resolved https://github.com/Dadmatech/DadmaTools.git to commit d84e5f4db24173b3f2bb7a7c768bd21640e13d32
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bpemb>=0.3.3 (from dadmatools==2.0.0)
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting Deprecated==1.2.6 (from dadmatools==2.0.0)
  Downloading Deprecated-1.2.6-py2.py3-none-any.whl (8.1 kB)
Collecting pyconll>=3.1.0 (from dadmatools==2.0.0)
  Downloading pyconll-3.2.0-py3-none-any.whl (27 kB)
Collecting pytorch-transformers>=1.1.0 (from dadmatools==2.0.0)
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0

In [3]:
import torch
import torch.nn as nn
import numpy as np
from datasets import load_dataset
import evaluate
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, EvalPrediction
from transformers.optimization import AdamW
from time import time
from dadmatools.normalizer import Normalizer
from huggingface_hub import notebook_login
from transformers import set_seed


SEED = 42
# Apply the same seed through each seeding entry point, in a fixed order:
# the transformers helper first, then torch and numpy explicitly.
for seed_fn in (set_seed, torch.manual_seed, np.random.seed):
    seed_fn(SEED)

In [13]:
# Interactive Hugging Face Hub login (renders a widget in the notebook).
# The training cells further down are configured to push checkpoints to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): the dataset cell in this notebook reads its TSV files from
# /content/, not from the mounted drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

In [5]:
# Options for the DadmaTools text normalizer that tokenize_function applies to
# every text before tokenization.
# NOTE(review): the effect of each flag (and of passing None for a replacement)
# is defined by DadmaTools — confirm against its documentation.
normalizer_options = dict(
    # cleaning switches
    full_cleaning=False,
    unify_chars=True,
    refine_punc_spacing=True,
    remove_extra_space=True,
    remove_puncs=False,
    remove_html=True,
    remove_stop_word=False,
    # replacement targets
    replace_email_with="<EMAIL>",
    replace_number_with=None,
    replace_url_with="",
    replace_mobile_number_with=None,
    replace_emoji_with="",
    replace_home_number_with=None,
)
normalizer = Normalizer(**normalizer_options)

In [32]:
# Emotion class names; a name's position in this list is its integer class id.
classes = ['HAPPY', 'SAD', 'ANGRY', 'FEAR', 'SURPRISE', 'HATE', 'OTHER']
# Lookup tables in both directions (name -> id and id -> name).
class2id = {name: idx for idx, name in enumerate(classes)}
id2class = dict(enumerate(classes))

def sigmoid(X):
    """Return the element-wise logistic function 1 / (1 + exp(-X))."""
    denominator = 1 + np.exp(-X)
    return 1 / denominator

def heaviside(X):
    """Threshold X at 0.5: 1 above the threshold, 0 at or below it."""
    shifted = X - 0.5
    # The second argument is the value returned where `shifted` is exactly 0.
    return np.heaviside(shifted, 0)

def onehot(ids, size=len(classes)):
  """Turn a 1-D array of integer class ids into a (len(ids), size) one-hot matrix.

  Row i of the result is all zeros except for a 1 in column ids[i].
  """
  n_rows = ids.shape[0]
  encoded = np.zeros((n_rows, size))
  row_index = np.arange(n_rows)
  encoded[row_index, ids] = 1
  return encoded

def compute_metrics(eval_preds: EvalPrediction):
  """Compute accuracy plus macro and per-class precision/recall/F1 for the Trainer.

  Args:
    eval_preds: object exposing `predictions` (the model logits) and
      `label_ids` (the reference labels). Labels are expected in the same
      per-class indicator layout that `onehot` produces, since the datasets'
      label column holds one float per class.

  Returns:
    dict with keys 'accuracy', 'precision_macro', 'recall_macro', 'f1_macro',
    followed by 'f1_C{i}', 'recall_C{i}' and 'precision_C{i}' for each class i.
  """
  logits, labels = eval_preds.predictions, eval_preds.label_ids
  # Pick the top-scoring class straight from the logits. Sigmoid is monotonic,
  # so it cannot change the winner, but it saturates to exactly 1.0 for large
  # logits (creating ties that argmax resolves to the lowest index) and
  # overflows in exp() for very negative ones.
  predictions = onehot(np.argmax(logits, axis=-1))

  def macro(scorer):
    # Unweighted mean of the per-class scores.
    return scorer(labels, predictions, average='macro', zero_division=0.0)

  def per_class(prefix, scorer):
    # One entry per class, keyed '<prefix>_C<class index>'.
    scores = scorer(labels, predictions, average=None, zero_division=0.0)
    return {f'{prefix}_C{i}': score for i, score in enumerate(scores)}

  # Key order is kept stable because it determines the column order of the
  # metrics table shown during training.
  return {
      'accuracy': accuracy_score(labels, predictions),
      'precision_macro': macro(precision_score),
      'recall_macro': macro(recall_score),
      'f1_macro': macro(f1_score),
      **per_class('f1', f1_score),
      **per_class('recall', recall_score),
      **per_class('precision', precision_score),
  }

In [33]:
# Location of the pre-processed tab-separated train/test files.
ds_url = '/content/'
ds_files = dict(
    train=ds_url + 'pptrain.tsv',
    test=ds_url + 'pptest.tsv',
)

# Load both splits, then switch the column names to lower case.
column_renames = {'ID': 'id', 'Text': 'text', 'Label': 'label'}
ds = load_dataset('csv', data_files=ds_files, delimiter='\t').rename_columns(column_renames)

ds

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 6125
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1151
    })
})

In [34]:
def convert_labels(example):
  """Parse the bracketed label string into a list of floats, in place.

  The label arrives as text such as '[1. 0. 0.]'. The first and last
  characters (the brackets) are dropped and the remainder is split on any run
  of whitespace, so repeated or leading/trailing spaces inside the brackets do
  not produce empty tokens that float() would reject.

  Args:
    example: mapping with a string under the 'label' key.

  Returns:
    The same mapping, with 'label' replaced by a list of floats.

  Raises:
    ValueError: if a token between the brackets is not a valid float.
  """
  tokens = example['label'][1:-1].split()
  example['label'] = [float(token) for token in tokens]
  return example

def replace_none_with_str(example):
  """Replace a missing text (None) with an empty string, in place.

  Args:
    example: mapping with a 'text' key.

  Returns:
    The same mapping; 'text' is '' if it was None, otherwise unchanged.
  """
  # Identity check: `== None` would defer to the value's own __eq__.
  if example['text'] is None:
    example['text'] = ''
  return example

def remove_hashtag(example):
  """Strip every '#' character from the example's text, in place.

  Only the '#' symbol is removed; the word that followed it is kept.

  Args:
    example: mapping with a string under the 'text' key.

  Returns:
    The same mapping, with all '#' characters removed from 'text'.
  """
  # str.replace already substitutes every occurrence, so no loop is needed.
  example['text'] = example['text'].replace('#', '')
  return example

# Apply the per-example clean-up steps in order: parse labels, fill missing
# texts, then strip '#' characters (which needs the text to be a string).
# NOTE: this rebinds `ds`; re-running the cell would feed already-parsed
# labels back into convert_labels.
for preprocess in (convert_labels, replace_none_with_str, remove_hashtag):
  ds = ds.map(preprocess)

for split in ('train', 'test'):
  print(ds[split].features)

In [35]:
# Pretrained model used for both the tokenizer and the classifier.
checkpoint = 'HooshvareLab/bert-base-parsbert-uncased'
# Number of passes over the training split.
num_epochs = 5

In [36]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
  """Normalize a batch of texts and tokenize them.

  Args:
    example: batch mapping whose 'text' entry is a list of strings.

  Returns:
    The tokenizer output for the normalized texts, truncated to at most
    512 tokens. The batch's 'text' entry is also overwritten with the
    normalized strings.
  """
  normalized_texts = list(map(normalizer.normalize, example['text']))
  example['text'] = normalized_texts
  return tokenizer(normalized_texts, truncation=True, max_length=512, add_special_tokens=True)


tokenized_datasets = ds.map(tokenize_function, batched=True)
# Padding is deferred to batch time by the collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map:   0%|          | 0/6125 [00:00<?, ? examples/s]

Map:   0%|          | 0/1151 [00:00<?, ? examples/s]

In [None]:
# Inspect the column schema of the tokenized training split.
tokenized_datasets['train'].features

{'id': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'label': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [45]:
# Start from the checkpoint's own configuration and attach the emotion label
# mapping (id2class / class2id); the class count is derived from `classes`
# rather than hard-coded.
config = AutoConfig.from_pretrained(checkpoint)
config.update({
    'id2label': id2class,
    'label2id': class2id,
    'num_labels': len(classes)
})
# Build the model from the prepared config instead of loading it with a
# separate config and swapping `model.config` afterwards, so the model is
# constructed with the same config object settings it later reports.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# Display the configuration attached to the model (including the label mapping).
model.config

BertConfig {
  "_name_or_path": "HooshvareLab/bert-base-parsbert-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "HAPPY",
    "1": "SAD",
    "2": "ANGRY",
    "3": "FEAR",
    "4": "SURPRISE",
    "5": "HATE",
    "6": "OTHER"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "ANGRY": 2,
    "FEAR": 3,
    "HAPPY": 0,
    "HATE": 5,
    "OTHER": 6,
    "SAD": 1,
    "SURPRISE": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}

In [56]:
# Training configuration for the fine-tuning run.
#
# SECURITY: no access token is passed here. A Hugging Face token must never be
# committed in a notebook; any token that was previously embedded in this cell
# should be revoked. With `hub_token` left unset, the Trainer uses the
# credentials cached by the `notebook_login()` cell earlier in this notebook.
training_args = TrainingArguments(
    run_name=f'First Run-{time()}',
    output_dir='finetuned-parsbert-uncased-ArmanEmo',
    overwrite_output_dir=True,
    auto_find_batch_size=True,
    num_train_epochs=num_epochs,
    # Evaluate and checkpoint once per epoch; the *_steps values below are
    # only consulted when the corresponding strategy is 'steps'.
    evaluation_strategy='epoch',
    eval_steps=512,
    save_strategy='epoch',
    save_steps=512,
    save_total_limit=5,
    # Reload the checkpoint with the best macro-F1 once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    save_safetensors=True,
    group_by_length=True,
    push_to_hub=True,
    hub_model_id='mohammadhabp/finetuned-parsbert-uncased-ArmanEmo',
    hub_strategy='all_checkpoints',
    warmup_steps=500,
    weight_decay=4e-3,
    seed=SEED,
    data_seed=SEED
)


In [57]:
# Wire the model, data, collator and metric function into the Trainer.
# NOTE(review): the 'test' split is used as the evaluation set, and the
# training arguments select the best checkpoint by its evaluation macro-F1,
# so the reported test scores also drive model selection. Consider carving a
# validation split out of 'train' for that purpose.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [58]:
# Run fine-tuning; the value returned by trainer.train() is kept in train_output.
train_output = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,F1 C0,F1 C1,F1 C2,F1 C3,F1 C4,F1 C5,F1 C6,Recall C0,Recall C1,Recall C2,Recall C3,Recall C4,Recall C5,Recall C6,Precision C0,Precision C1,Precision C2,Precision C3,Precision C4,Precision C5,Precision C6
1,0.3564,0.308483,0.526499,0.592312,0.553024,0.519167,0.278788,0.675573,0.604905,0.696296,0.508475,0.392523,0.477612,0.167273,0.675573,0.720779,0.824561,0.413793,0.323077,0.746114,0.836364,0.675573,0.521127,0.602564,0.659341,0.5,0.35122
2,0.1889,0.253978,0.631625,0.643413,0.620141,0.624656,0.690432,0.655008,0.600707,0.704,0.582677,0.588235,0.551532,0.669091,0.78626,0.551948,0.77193,0.510345,0.538462,0.512953,0.713178,0.561308,0.658915,0.647059,0.678899,0.648148,0.596386
3,0.1052,0.293454,0.6351,0.638887,0.622609,0.624787,0.681648,0.691145,0.620482,0.618182,0.601504,0.587302,0.573248,0.661818,0.610687,0.668831,0.596491,0.551724,0.569231,0.699482,0.702703,0.79602,0.578652,0.641509,0.661157,0.606557,0.485612
4,0.0506,0.401063,0.634231,0.690402,0.625502,0.632684,0.625,0.753153,0.609319,0.701754,0.53913,0.637931,0.5625,0.490909,0.79771,0.551948,0.701754,0.427586,0.569231,0.839378,0.859873,0.713311,0.68,0.701754,0.729412,0.72549,0.422977
5,0.0258,0.425367,0.624674,0.676093,0.62486,0.627774,0.603248,0.736842,0.617544,0.711864,0.563025,0.610169,0.551724,0.472727,0.748092,0.571429,0.736842,0.462069,0.553846,0.829016,0.833333,0.725926,0.671756,0.688525,0.72043,0.679245,0.413437


In [67]:
# Write the current model to the output directory, then upload it to the Hub
# with an explicit commit message. The push result is the cell's output.
trainer.save_model()
trainer.push_to_hub(commit_message='fine tuned model')

CommitInfo(commit_url='https://huggingface.co/mohammadhabp/finetuned-parsbert-uncased-ArmanEmo/commit/770231c1803ea5332e6e5b72d65d8f7aacbed7f6', commit_message='fine tuned model', commit_description='', oid='770231c1803ea5332e6e5b72d65d8f7aacbed7f6', pr_url=None, pr_revision=None, pr_num=None)