In [81]:
import warnings 
warnings.filterwarnings("ignore")

In [7]:
from datasets import DatasetDict, load_dataset

dataset = DatasetDict.load_from_disk("../Data/Anxiety_Detection_Data/anxiety_not_augmented__not_anxiety_hg_dataset")

classes = ['agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety']
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}

# !huggingface-cli login --token=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
# dataset = load_dataset('halilibr/dilbazlar-anxiety-disorders-recognition-not-augmented-not-anxiety-multilabel-tr-dataset', use) 

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source'],
        num_rows: 12105
    })
    test: Dataset({
        features: ['text', 'agoraphobia', 'panic', 'phobia', 'selectivemutism', 'socialanxiety', 'all_labels', 'source'],
        num_rows: 3616
    })
})

In [20]:
from transformers import AutoTokenizer

model_path = 'FacebookAI/xlm-roberta-large'

tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [21]:
def preprocess_function(example):
    
    text = f"{example['text']}"

    
    all_labels = example['all_labels']
    labels = [0. for i in range(len(classes))]

    for label in all_labels:
        label_id = class2id[label]
        labels[label_id] = 1.

    example = tokenizer(text, truncation=True)
    example['labels'] = labels
    return example

tokenized_dataset = dataset.map(preprocess_function)


Map:   0%|          | 0/12105 [00:00<?, ? examples/s]

Map:   0%|          | 0/3616 [00:00<?, ? examples/s]

In [22]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
import evaluate
import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))


In [24]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
   model_path, num_labels=len(classes),
           id2label=id2class, label2id=class2id,
                       problem_type = "multi_label_classification")

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
import os
os.environ["WANDB_PROJECT"]="Dilbazlar"
wandb_api_key = "04a083b14d60688b24482e00727ebcc57448ef88"

In [26]:
training_args = TrainingArguments(
   output_dir="anxiety_disorders_model",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=2,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
   report_to=["wandb"],# Wandb = https://docs.wandb.ai/guides/integrations/huggingface
   run_name="anxiety-specific-augmented-model"
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss



KeyboardInterrupt

[34m[1mwandb[0m: Encountered an error while tearing down the service manager: [WinError 10054] An existing connection was forcibly closed by the remote host


In [None]:
classes

In [None]:
class2id

In [None]:
dataset['test'][200]

In [None]:
import torch 

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
def predict(text):
    # Prepare the text input
    inputs = trainer.tokenizer(text, return_tensors="pt").to(device)
    
    # Make the prediction
    with torch.no_grad():
        outputs = trainer.model(**inputs)
        print(outputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
    
    # Process the predictions
    predicted_label = predictions.item()
    print(f"Predicted label: {classes[predicted_label]}")

In [29]:
panic_sentences = [
    "Kalbim çok hızlı atıyor, bir şey mi oluyor?",
    "Nefes alamıyorum, boğuluyormuş gibi hissediyorum.",
    "Başım dönüyor, bayılacak gibiyim.",
    "Sürekli kötü bir şey olacakmış gibi hissediyorum.",
    "Göğsümde bir ağrı var, kalp krizi mi geçiriyorum?",
    "Ellerim titriyor, kontrol edemiyorum.",
    "Sanki her şey üzerime geliyormuş gibi hissediyorum.",
    "Çok fazla terliyorum, vücudum ateş gibi yanıyor.",
    "Etrafımdaki her şey bulanıklaşıyor, gerçek değilmiş gibi.",
    "Bunu atlatamayacağım, buradan çıkmalıyım."
]

for text in panic_sentences:
    print(text)
    print(predict(text))
    print("*"*50)

Kalbim çok hızlı atıyor, bir şey mi oluyor?
SequenceClassifierOutput(loss=None, logits=tensor([[-2.0979,  0.9903, -2.9875, -4.4397, -6.9021]], device='cuda:0'), hidden_states=None, attentions=None)
Predicted label: panic
None
**************************************************
Nefes alamıyorum, boğuluyormuş gibi hissediyorum.
SequenceClassifierOutput(loss=None, logits=tensor([[-3.3000,  1.8684, -2.4475, -3.4733, -6.9446]], device='cuda:0'), hidden_states=None, attentions=None)
Predicted label: panic
None
**************************************************
Başım dönüyor, bayılacak gibiyim.
SequenceClassifierOutput(loss=None, logits=tensor([[-1.5482, -0.2935, -1.0072, -1.7374, -6.2352]], device='cuda:0'), hidden_states=None, attentions=None)
Predicted label: panic
None
**************************************************
Sürekli kötü bir şey olacakmış gibi hissediyorum.
SequenceClassifierOutput(loss=None, logits=tensor([[-1.6660, -0.5807, -0.5266, -1.8419, -6.1230]], device='cuda:0'), hidden

In [36]:
predictions[0].shape

(11408, 5)

array([2, 4, 1, ..., 4, 2, 2], dtype=int64)

In [47]:
np.argmax(predictions[1], axis=1)[:20]

array([2, 4, 1, 3, 3, 1, 4, 3, 4, 0, 2, 4, 0, 0, 2, 4, 1, 1, 1, 4],
      dtype=int64)

In [46]:
np.argmax(predictions.label_ids, axis=1)[:20]

array([2, 4, 1, 3, 3, 1, 4, 3, 4, 0, 2, 4, 0, 0, 2, 4, 1, 1, 1, 4],
      dtype=int64)

2

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# Step 1: Evaluate the model and get predictions
predictions = trainer.predict(tokenized_dataset["test"])

# Step 2: Extract the true labels and predictions
true_labels = [np.argmax(tokenized_dataset['test'][i]['labels']) for i in range(len(tokenized_dataset['test']))]

# Step 3: Create the confusion matrix
conf_matrix = confusion_matrix(true_labels, np.argmax(predictions[0],  axis=1))

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
disp.plot()


In [None]:
classes

## Push To Hub

In [79]:
!huggingface-cli login --token=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

trainer.model.push_to_hub("halilibr/dilbazlar-anxiety-disorders-recognition-tr-model-cleaned-agoraphobia-acc-92")
trainer.tokenizer.push_to_hub("halilibr/dilbazlar-anxiety-disorders-recognition-tr-model-cleaned-agoraphobia-acc-92")
print("It was published :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


model.safetensors:   0%|          | 0.00/737M [00:00<?, ?B/s]

It was published :)
