In [43]:
import warnings 
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings are hidden too.
warnings.filterwarnings("ignore")

# Veriyi Diskten Yükleme

Bu hücre, daha önce diske kaydedilmiş veri kümesini `load_from_disk` ile yükler ve `dataset` değişkenine atar. Yüklenen nesne, `train` ve `test` bölümlerini içeren bir `DatasetDict`'tir.


In [1]:
from datasets import load_from_disk
import pandas as pd

# Load the previously saved dataset from disk; the path is relative to the notebook's
# working directory, so the notebook must be run from its own folder.
dataset = load_from_disk('../Data/Depression_Disorders_Data/depression_multilabel_augmented_cleaned_hg_dataset')

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'Distimi', 'PMDD', 'all_labels'],
        num_rows: 20697
    })
    test: Dataset({
        features: ['text', 'Distimi', 'PMDD', 'all_labels'],
        num_rows: 6900
    })
})

In [3]:
# Label vocabulary: a label's position in this list is its integer id.
classes = ['Distimi', 'PMDD']
# Forward lookup (label name -> id) and reverse lookup (id -> label name).
class2id = dict(zip(classes, range(len(classes))))
id2class = dict(enumerate(classes))

In [4]:
class2id

{'Distimi': 0, 'PMDD': 1}

In [7]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; the same path is reused later when the
# classification model is loaded, so tokenizer and model stay in sync.
model_path = 'dbmdz/bert-base-turkish-128k-uncased'

# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [8]:
def preprocess_function(example):
    """Tokenize one record and attach its multi-hot label vector.

    Args:
        example: A single dataset row; its 'text' and 'all_labels' fields are read.

    Returns:
        The tokenizer output for the row's text with an extra 'labels' entry: a list
        of floats, one slot per entry of `classes`, set to 1.0 for every label named
        in 'all_labels' and 0.0 otherwise.

    Raises:
        KeyError: If 'all_labels' contains a name that is not in `class2id`.
    """
    # Start from an all-zero vector and switch on the slot of each assigned label.
    multi_hot = [0.] * len(classes)
    for label_name in example['all_labels']:
        multi_hot[class2id[label_name]] = 1.

    # No padding here; only truncation is requested at tokenization time.
    encoded = tokenizer(f"{example['text']}", truncation=True)
    encoded['labels'] = multi_hot
    return encoded


# Apply the preprocessing to every row of every split.
tokenized_dataset = dataset.map(preprocess_function)


Map:   0%|          | 0/20697 [00:00<?, ? examples/s]

Map:   0%|          | 0/6900 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorWithPadding

# Collator built on the notebook's tokenizer; it is handed to the Trainer, which uses it
# to assemble batches from the (unpadded) tokenized examples.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
import evaluate
import numpy as np

# Bundle the four metrics so that a single .compute() call returns all of them.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive formula evaluates exp(-x), which overflows (and raises a
    RuntimeWarning) for large negative inputs. Here the exponential is only ever
    taken of a non-positive number, so it cannot overflow.

    Args:
        x: A scalar or array-like of logits.

    Returns:
        The element-wise sigmoid, with the same shape as the input (a NumPy scalar
        for scalar input).
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1] for every finite x.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically the same value.
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays unchanged.
    return out[()]

def compute_metrics(eval_pred):
    """Score multi-label predictions with the combined classification metrics.

    Args:
        eval_pred: A pair (raw model outputs, reference labels) that unpacks into
            two arrays.

    Returns:
        The result of `clf_metrics.compute` on the flattened predictions and
        references, i.e. every (example, label) slot is scored as one binary decision.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    # A label counts as predicted when its probability exceeds 0.5.
    hard_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=hard_predictions, references=flat_references)


In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint for sequence classification with one output per class,
# configured for multi-label classification and carrying the label-name mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    problem_type="multi_label_classification",
    num_labels=len(classes),
    label2id=class2id,
    id2label=id2class,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import os
# Weights & Biases project that the training run is logged under.
os.environ["WANDB_PROJECT"] = "Dilbazlar"

# SECURITY: API keys must never be written into the notebook. The key is taken from the
# WANDB_API_KEY environment variable (None when it is not set); the key that was
# previously committed here should be revoked.
wandb_api_key = os.environ.get("WANDB_API_KEY")

In [13]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then reload the best
# checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="depression_disorders_model",
    learning_rate=2e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Without an explicit metric the best checkpoint is chosen by lowest eval loss. In the
    # logged run the loss rises from epoch 1 to epoch 2 while accuracy improves, so the
    # lower-accuracy epoch-1 checkpoint would be reloaded. Select on accuracy instead, which
    # is the figure this notebook reports.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to=["wandb"],  # Wandb = https://docs.wandb.ai/guides/integrations/huggingface
    run_name="depression-specific-augmented-model",
)

# Wire the model, the tokenized splits, the collator and the metric callback into a Trainer.
# NOTE(review): the "test" split is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhalil7hatun[0m ([33muniteks[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6354,0.605054,0.817174,0.81732,0.816669,0.817971
2,0.4762,0.638756,0.839855,0.839809,0.840052,0.839565


TrainOutput(global_step=13798, training_loss=0.5448460399905811, metrics={'train_runtime': 1573.1445, 'train_samples_per_second': 26.313, 'train_steps_per_second': 8.771, 'total_flos': 823033992272400.0, 'train_loss': 0.5448460399905811, 'epoch': 2.0})

## Push To Hub

In [15]:
!huggingface-cli login --token=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

trainer.model.push_to_hub("halilibr/dilbazlar-depression-disorders-recognition-tr-model-acc-84")
trainer.tokenizer.push_to_hub("halilibr/dilbazlar-depression-disorders-recognition-tr-model-acc-84")
print("It was published :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


model.safetensors:   0%|          | 0.00/737M [00:00<?, ?B/s]

It was published :)
