In [78]:
import warnings 
warnings.filterwarnings("ignore")

In [90]:
from datasets import DatasetDict

dataset = DatasetDict.load_from_disk("../Data/Anxiety_Detection_Data/anxiety_not_augmented__not_anxiety_hg_dataset")

classes = ['agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety']
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}
# !huggingface-cli login --token=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
# dataset = load_dataset('halilibr/dilbazlar-anxiety-disorders-recognition-not-augmented-dataset') 

In [91]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source'],
        num_rows: 12105
    })
    test: Dataset({
        features: ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source'],
        num_rows: 3616
    })
})

In [92]:
from transformers import AutoTokenizer

model_path = 'dbmdz/bert-base-turkish-128k-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_path)

In [93]:
def preprocess_function(example):
    
    text = f"{example['text']}"

    
    all_labels = example['all_labels']
    labels = [0. for i in range(len(classes))]

    for label in all_labels:
        label_id = class2id[label]
        labels[label_id] = 1.

    example = tokenizer(text, truncation=True)
    example['labels'] = labels
    return example

tokenized_dataset = dataset.map(preprocess_function)


Map:   0%|          | 0/12105 [00:00<?, ? examples/s]

Map:   0%|          | 0/3616 [00:00<?, ? examples/s]

In [94]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [95]:
import evaluate
import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))


In [96]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
   model_path, num_labels=len(classes),
           id2label=id2class, label2id=class2id,
                       problem_type = "multi_label_classification")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
import os
os.environ["WANDB_PROJECT"]="Dilbazlar"
wandb_api_key = "04a083b14d60688b24482e00727ebcc57448ef88"

In [101]:
training_args = TrainingArguments(
   output_dir="my_awesome_model",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=2,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
   report_to=["wandb"],# Wandb = https://docs.wandb.ai/guides/integrations/huggingface
   run_name="anxiety-specific-first-model"
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2048,0.174185,0.938662,0.843693,0.860305,0.82771
2,0.1277,0.200599,0.942588,0.855713,0.860257,0.851217


TrainOutput(global_step=8070, training_loss=0.1814610938600448, metrics={'train_runtime': 998.7555, 'train_samples_per_second': 24.24, 'train_steps_per_second': 8.08, 'total_flos': 1103711744685132.0, 'train_loss': 0.1814610938600448, 'epoch': 2.0})

In [102]:
classes

['agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety']

In [103]:
class2id

{'agoraphobia': 0,
 'panic': 1,
 'phobia': 2,
 'selectivemutism': 3,
 'socialanxiety': 4}

In [112]:
dataset['test'][200]

{'text': 'Çıldırdım, direği kavradım ve yüzme rotasında başarısız oldum... Bacağımda orta büyüklükte bir örümcek bulmak için battaniyeyi açtım... Aynı şeyin tekrar olmasına çok paranoyaktım... 3 mil yarıçapındaki herkesin çığlığı duyduğundan oldukça eminim... Bu, araknofobide ve entomofobinin yaratılmasında yeniden uyanmaya neden oldu... Hemen korkuyla fırlayacağım ve adrenalin bir artış elde edeceğim ve korku o kadar da kötü bir görünüm alamadım.',
 'agoraphobia': 0,
 'panic': 0,
 'phobia': 1,
 'selectivemutism': 0,
 'socialanxiety': 0,
 'all_labels': ['phobia'],
 'source': 'Reddit'}

In [113]:
import torch 

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
text = ""

# Prepare the text input
inputs = trainer.tokenizer(text, return_tensors="pt").to(device)

# Make the prediction
with torch.no_grad():
    outputs = trainer.model(**inputs)
    print(outputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Process the predictions
predicted_label = predictions.item()
print(f"Predicted label: {classes[predicted_label]}")

SequenceClassifierOutput(loss=None, logits=tensor([[-4.3928, -4.3542,  3.7974, -4.7848, -4.7771]], device='cuda:0'), hidden_states=None, attentions=None)
Predicted label: phobia


## Push To Hub

In [114]:
!huggingface-cli login --token=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

trainer.model.push_to_hub("halilibr/dilbazlar-anxiety-disorders-recognition-tr-model-acc-94")
trainer.tokenizer.push_to_hub("halilibr/dilbazlar-anxiety-disorders-recognition-tr-model-acc-94")
print("It was published :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


model.safetensors:   0%|          | 0.00/737M [00:00<?, ?B/s]

It was published :)
