In [12]:
import warnings 
warnings.filterwarnings("ignore")

In [28]:
from datasets import DatasetDict

dataset = DatasetDict.load_from_disk("../Data/Anxiety_Detection_Data/anxiety_not_augmented_hg_dataset")

classes = ['agoraphobia', 'anxiety', 'panic', 'phobia', 'selectivemutism', 'socialanxiety']
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}
# !huggingface-cli login --token=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
# dataset = load_dataset('halilibr/dilbazlar-anxiety-disorders-recognition-not-augmented-dataset') 

In [29]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'agoraphobia', 'anxiety', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source'],
        num_rows: 14740
    })
    test: Dataset({
        features: ['text', 'agoraphobia', 'anxiety', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source'],
        num_rows: 4403
    })
})

In [33]:
from transformers import AutoTokenizer

model_path = 'dbmdz/bert-base-turkish-128k-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_path)

In [34]:
def preprocess_function(example):
    
    text = f"{example['text']}"

    
    all_labels = example['all_labels']
    labels = [0. for i in range(len(classes))]

    for label in all_labels:
        label_id = class2id[label]
        labels[label_id] = 1.

    example = tokenizer(text, truncation=True)
    example['labels'] = labels
    return example

tokenized_dataset = dataset.map(preprocess_function)


Map:   0%|          | 0/14740 [00:00<?, ? examples/s]

Map:   0%|          | 0/4403 [00:00<?, ? examples/s]

In [36]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [41]:
import evaluate
import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))


In [42]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
   model_path, num_labels=len(classes),
           id2label=id2class, label2id=class2id,
                       problem_type = "multi_label_classification")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
import os
os.environ["WANDB_PROJECT"]="Dilbazlar"
wandb_api_key = "04a083b14d60688b24482e00727ebcc57448ef88"

In [45]:
training_args = TrainingArguments(
   output_dir="my_awesome_model",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=2,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
   report_to=["wandb"],# Wandb = https://docs.wandb.ai/guides/integrations/huggingface
   run_name="anxiety-specific-first-model"
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.215,0.219915,0.914869,0.73858,0.756429,0.721553
2,0.1506,0.219175,0.921152,0.761096,0.768767,0.753577


TrainOutput(global_step=9828, training_loss=0.20430770364776102, metrics={'train_runtime': 1237.5019, 'train_samples_per_second': 23.822, 'train_steps_per_second': 7.942, 'total_flos': 1369969798470912.0, 'train_loss': 0.20430770364776102, 'epoch': 2.0})

In [51]:
classes

['agoraphobia',
 'anxiety',
 'panic',
 'phobia',
 'selectivemutism',
 'socialanxiety']

In [76]:
class2id

{'agoraphobia': 0,
 'anxiety': 1,
 'panic': 2,
 'phobia': 3,
 'selectivemutism': 4,
 'socialanxiety': 5}

In [75]:
import torch 

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
text = "selamlar"

# Prepare the text input
inputs = trainer.tokenizer(text, return_tensors="pt").to(device)

# Make the prediction
with torch.no_grad():
    outputs = trainer.model(**inputs)
    print(outputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Process the predictions
predicted_label = predictions.item()
print(f"Predicted label: {classes[predicted_label]}")

SequenceClassifierOutput(loss=None, logits=tensor([[-3.6759, -5.6864, -5.9880, -5.8816,  3.3758, -3.6727]],
       device='cuda:0'), hidden_states=None, attentions=None)
Predicted label: selectivemutism


## Push To Hub

In [77]:
!huggingface-cli login --token=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

trainer.model.push_to_hub("halilibr/dilbazlar-anxiety-disorders-recognition-tr-model-acc-92")
trainer.tokenizer.push_to_hub("halilibr/dilbazlar-anxiety-disorders-recognition-tr-model-acc-92")
print("It was published :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


model.safetensors:   0%|          | 0.00/737M [00:00<?, ?B/s]

It was published :)
