## Create Dataset

In [12]:
import pandas as pd 

In [21]:
depression_data = pd.read_excel('../Data/Depression_Disorders_Data/depression_total_cleaned.xlsx')
anxiety_data = pd.read_excel("../Data/Anxiety_Detection_Data/total_df_balanced.xlsx")
anxiety_data['labels'] = 'Anxiety'

In [22]:
total_df = pd.DataFrame()

total_df['text'] = depression_data['text'].sample(15000)
total_df['labels'] = 'Depression'

total_df = pd.concat([total_df, anxiety_data.iloc[:, :2]], axis=0)
total_df

Unnamed: 0,text,labels
20670,"Kendimi buradan uzakta, başka bir yerde hayal ...",Depression
11883,Her gün daha da kötüleşiyorum. Bu durumdan kur...,Depression
23773,Hala kendimi 10 yaşında gibi hissediyorum ve o...,Depression
11480,"Belirtilerim aniden o kadar kötü oldu ki, çalı...",Depression
4135,Şu anda hayatımın her yönü için endişeliyim ve...,Depression
...,...,...
13377,sosyal anksiyete diye bir seyi kimse isteyerek...,Anxiety
13378,iyileştiniz mi cok merak ediyorum ilaci tamame...,Anxiety
13379,arkadaşlar bende pani̇k atak anksi̇yete kaygi ...,Anxiety
13380,Ben küçükken fazla ilgilenildim ve sevildim. H...,Anxiety


In [24]:
one_hot_encoded = pd.get_dummies(total_df['labels'])

concat_df = pd.concat([total_df['text'], one_hot_encoded], axis=1)
concat_df = concat_df.dropna().reset_index(drop=True)

concat_df.iloc[:, 1:] = concat_df.iloc[:, 1:].astype(int)

concat_df['all_labels'] = concat_df.apply(lambda row: [label for label in one_hot_encoded.columns if row[label] == 1], axis=1)
concat_df = concat_df.loc[:, ['text', 'Depression', 'Anxiety', 'all_labels']]

In [25]:
from datasets import Dataset, DatasetDict
Dataset.cleanup_cache_files
from sklearn.model_selection import train_test_split

train, test = train_test_split(concat_df, test_size = 0.25, random_state=42)

train_data = Dataset.from_pandas(train, preserve_index=False)
test_data = Dataset.from_pandas(test, preserve_index=False)

hg_data = DatasetDict({
    "train": train_data,
    "test": test_data
})

In [26]:
hg_data

DatasetDict({
    train: Dataset({
        features: ['text', 'Depression', 'Anxiety', 'all_labels'],
        num_rows: 21285
    })
    test: Dataset({
        features: ['text', 'Depression', 'Anxiety', 'all_labels'],
        num_rows: 7096
    })
})

In [27]:
!huggingface-cli login --token=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

hg_data.push_to_hub("halilibr/dilbazlar-anxiety-depression-recognition-multilabel-tr-dataset")
print("Data was pushed :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Data was pushed :)


In [28]:
hg_data.save_to_disk('../Data/Depression_Disorders_Data/depression_anxiety_multilabel_hg_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/21285 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7096 [00:00<?, ? examples/s]

## Training

In [29]:
import warnings 
warnings.filterwarnings("ignore")

In [30]:
from datasets import load_from_disk
import pandas as pd

# Load the dataset from disk
dataset = load_from_disk('../Data/Depression_Disorders_Data/depression_anxiety_multilabel_hg_dataset')

In [31]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'Depression', 'Anxiety', 'all_labels'],
        num_rows: 21285
    })
    test: Dataset({
        features: ['text', 'Depression', 'Anxiety', 'all_labels'],
        num_rows: 7096
    })
})

In [32]:
classes = ['Depression', 'Anxiety']
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}

In [33]:
class2id

{'Depression': 0, 'Anxiety': 1}

In [34]:
from transformers import AutoTokenizer

model_path = 'dbmdz/bert-base-turkish-128k-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_path)

In [35]:
def preprocess_function(example):
    
    text = f"{example['text']}"

    
    all_labels = example['all_labels']
    labels = [0. for i in range(len(classes))]

    for label in all_labels:
        label_id = class2id[label]
        labels[label_id] = 1.

    example = tokenizer(text, truncation=True)
    example['labels'] = labels
    return example

tokenized_dataset = dataset.map(preprocess_function)


Map:   0%|          | 0/21285 [00:00<?, ? examples/s]

Map:   0%|          | 0/7096 [00:00<?, ? examples/s]

In [62]:
tokenized_dataset['test'][0]

{'text': 'Sadece her zaman dile ilgim olduğu için öğrenmeyi denedim..ama hiçbir zaman gerçek yaşam durumlarında pratik yapma veya kullanma fırsatım olmadı, çünkü çoğu insanla etkileşime girmem gerekiyor.',
 'Depression': 0,
 'Anxiety': 1,
 'all_labels': ['Anxiety'],
 'input_ids': [2,
  2577,
  2110,
  2211,
  4095,
  41574,
  15500,
  8059,
  28519,
  3338,
  19312,
  18,
  18,
  2156,
  42283,
  2211,
  16599,
  28932,
  27691,
  7089,
  4386,
  2358,
  6657,
  38174,
  6853,
  16,
  29146,
  63566,
  24178,
  16061,
  5055,
  1011,
  33351,
  3814,
  18,
  3],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [0.0, 1.0]}

In [36]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [37]:
import evaluate
import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))


In [38]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
   model_path, num_labels=len(classes),
           id2label=id2class, label2id=class2id,
                       problem_type = "multi_label_classification")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
import os
os.environ["WANDB_PROJECT"]="Dilbazlar"
wandb_api_key = "04a083b14d60688b24482e00727ebcc57448ef88"

In [40]:
training_args = TrainingArguments(
   output_dir="depression_disorders_model",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=3,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
   report_to=["wandb"],# Wandb = https://docs.wandb.ai/guides/integrations/huggingface
   run_name="depression-specific-augmented-model"
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhalil7hatun[0m ([33muniteks[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.183,0.266273,0.950395,0.950395,0.950395,0.950395
2,0.105,0.270781,0.953918,0.953911,0.954046,0.953777


TrainOutput(global_step=14190, training_loss=0.20305133319556334, metrics={'train_runtime': 1808.9339, 'train_samples_per_second': 23.533, 'train_steps_per_second': 7.844, 'total_flos': 1384265290018680.0, 'train_loss': 0.20305133319556334, 'epoch': 2.0})

## Inference

In [49]:
import torch
device = "cuda" if torch.cuda.is_available() else 'cpu'

In [50]:
anxiety_labels = ['Depression', 'Anxiety']


def predict(model, tokenizer, labels, input_text):
    # Tokenize the input (ensure the tokenizer is appropriate for your model)
    inputs = tokenizer(input_text, max_length=150, padding="max_length", truncation=True, return_tensors="pt")
    
    # Move the inputs to the appropriate device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Disable gradient computation for inference
    with torch.no_grad():
        # Forward pass to get outputs
        outputs = model(**inputs)
        
        # Get the prediction
        # Note: `AutoModel` might not include logits. Ensure you use the appropriate model class for your task.
        if hasattr(outputs, 'logits'):
            preds = torch.argmax(outputs.logits, dim=-1)
        else:
            # Handle the case where the model does not have logits (e.g., outputs are raw hidden states)
            preds = torch.argmax(outputs[0], dim=-1)
    
    # Convert prediction to numpy array and print (if needed)
    prediction = preds.cpu().numpy()[0]
    print(outputs)
    return labels[prediction] 

In [59]:
input_text = "Uykusuzum"

predict(trainer.model, trainer.tokenizer, anxiety_labels, input_text)

SequenceClassifierOutput(loss=None, logits=tensor([[ 5.1898, -5.2543]], device='cuda:0'), hidden_states=None, attentions=None)


'Depression'