In [1]:
!pip install datasets transformers evaluate sentencepiece accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using ca

In [2]:
from datasets import load_dataset

dataset = load_dataset('ensarcitak/dilbazlar-multilabel-disorder-detection-depression-anxiety-dataset')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/487 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/371k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13726 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3432 [00:00<?, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Label_1', 'Label_2', 'all_labels'],
        num_rows: 13726
    })
    test: Dataset({
        features: ['Text', 'Label_1', 'Label_2', 'all_labels'],
        num_rows: 3432
    })
})

In [4]:
classes = ["Depresyon", "Anksiyete"]
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}

In [5]:
class2id

{'Depresyon': 0, 'Anksiyete': 1}

In [6]:
from transformers import AutoTokenizer

model_path = 'dbmdz/bert-base-turkish-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

In [7]:
def preprocess_function(example):
   text = f"{example['Text']}"
   all_labels = example['all_labels']
   labels = [0. for i in range(len(classes))]
   for label in all_labels:
       label_id = class2id[label]
       labels[label_id] = 1.

   example = tokenizer(text, truncation=True)
   example['labels'] = labels
   return example

tokenized_dataset = dataset.map(preprocess_function)

Map:   0%|          | 0/13726 [00:00<?, ? examples/s]

Map:   0%|          | 0/3432 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [9]:
import evaluate
import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained( model_path, num_labels=len(classes),
           id2label=id2class, label2id=class2id,
                       problem_type = "multi_label_classification")


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(

   output_dir="my_awesome_model",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=5,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)



In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3129,0.315192,0.917395,0.937562,0.934783,0.940358
2,0.2379,0.315431,0.918852,0.938784,0.934164,0.94345
3,0.1845,0.457494,0.912587,0.934311,0.926199,0.942567
4,0.0882,0.539904,0.917686,0.937768,0.935193,0.940358
5,0.0192,0.577763,0.917978,0.93803,0.93484,0.941241


TrainOutput(global_step=22880, training_loss=0.1719614066235669, metrics={'train_runtime': 2115.8455, 'train_samples_per_second': 32.436, 'train_steps_per_second': 10.814, 'total_flos': 2220290390649480.0, 'train_loss': 0.1719614066235669, 'epoch': 5.0})

In [13]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.31519201397895813, 'eval_accuracy': 0.9173951048951049, 'eval_f1': 0.9375619425173439, 'eval_precision': 0.9347826086956522, 'eval_recall': 0.9403578528827038, 'eval_runtime': 30.3504, 'eval_samples_per_second': 113.079, 'eval_steps_per_second': 37.693, 'epoch': 5.0}


In [14]:
!huggingface-cli login --token=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
model.push_to_hub('ensarcitak/dilbazlar-multilabel-depression-anxiety-detection-model-acc-91')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ensarcitak/dilbazlar-multilabel-depression-anxiety-detection-model-acc-91/commit/f47e91af828488a0cef65342f83d0c0e9b358b86', commit_message='Upload BertForSequenceClassification', commit_description='', oid='f47e91af828488a0cef65342f83d0c0e9b358b86', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
tokenizer.push_to_hub("ensarcitak/dilbazlar-multilabel-depression-anxiety-detection-model-acc-91")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ensarcitak/dilbazlar-multilabel-depression-anxiety-detection-model-acc-91/commit/21b5606ed75a0e4e765ffb5b27a54b13bacf2048', commit_message='Upload tokenizer', commit_description='', oid='21b5606ed75a0e4e765ffb5b27a54b13bacf2048', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "ensarcitak/dilbazlar-multilabel-depression-anxiety-detection-model-acc-91"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/766k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [18]:
def preprocess_sentence(sentence):
    inputs = tokenizer(sentence, truncation=True, padding=True, return_tensors="pt")
    return inputs

In [19]:
import torch
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def predict(sentence):
    model.eval()
    inputs = preprocess_sentence(sentence)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = sigmoid(logits).squeeze().tolist()
    return predictions

In [29]:
sentence = "iyi hissetmiyorum"
predictions = predict(sentence)
print(predictions)


[0.9900333285331726, 0.009433054365217686]
