In [43]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a global "ignore" filter, so deprecation and numeric
# warnings raised by later cells are hidden too — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [44]:
import pandas as pd
from datasets import DatasetDict, Dataset

# Read each parquet split and normalise the text column name ('Text' -> 'text')
# in one chained expression per split.
train = pd.read_parquet(
    '../Data/Depression_Disorders_Data/train-00000-of-00001.parquet'
).rename(columns={'Text': 'text'})
test = pd.read_parquet(
    '../Data/Depression_Disorders_Data/test-00000-of-00001.parquet'
).rename(columns={'Text': 'text'})

# Convert the frames to Hugging Face datasets; the pandas index is dropped
# rather than stored as an extra column.
train_data = Dataset.from_pandas(train, preserve_index=False)
test_data = Dataset.from_pandas(test, preserve_index=False)

# Bundle both splits so later cells can address them by split name.
dataset = DatasetDict({"train": train_data, "test": test_data})

In [45]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'Label_1', 'Label_2', 'all_labels'],
        num_rows: 28515
    })
    test: Dataset({
        features: ['text', 'Label_1', 'Label_2', 'all_labels'],
        num_rows: 7129
    })
})

In [46]:
# Label vocabulary; a label's position in this list is its integer id.
classes = ['Distimi', 'PMDD']
# Forward lookup: label name -> integer id.
class2id = {name: index for index, name in enumerate(classes)}
# Reverse lookup: integer id -> label name.
id2class = dict(enumerate(classes))

In [47]:
class2id

{'Distimi': 0, 'PMDD': 1}

In [48]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; the same value is reused below
# when the classification model is instantiated.
model_path = 'dbmdz/bert-base-turkish-uncased'

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [49]:
def preprocess_function(example):
    """Tokenize one example and attach its multi-hot label vector.

    Args:
        example: a mapping with a 'text' entry (stringified before
            tokenization) and an 'all_labels' entry (an iterable of label
            names that must all be keys of ``class2id``).

    Returns:
        The tokenizer's output for the text (truncation enabled) with an
        added 'labels' entry: a list of floats with one slot per entry of
        ``classes``, 1.0 where the label is present and 0.0 elsewhere.

    Raises:
        KeyError: if a label name is not present in ``class2id``.
    """
    raw_text = str(example['text'])

    # Start from an all-zero target and switch on the slot of each label.
    multi_hot = [0.0] * len(classes)
    for label_name in example['all_labels']:
        multi_hot[class2id[label_name]] = 1.0

    # No padding is requested here; only truncation is applied.
    encoded = tokenizer(raw_text, truncation=True)
    encoded['labels'] = multi_hot
    return encoded

# Apply preprocess_function to every split of the DatasetDict. The function
# takes a single example, and the original columns are kept alongside the
# tokenizer fields and the new 'labels' column (no remove_columns is passed).
tokenized_dataset = dataset.map(preprocess_function)


Map:   0%|          | 0/28515 [00:00<?, ? examples/s]

Map:   0%|          | 0/7129 [00:00<?, ? examples/s]

In [50]:
from transformers import DataCollatorWithPadding

# Padding is deferred to batch-assembly time: preprocess_function tokenizes
# without padding, so this collator is what pads the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [51]:
import evaluate
import numpy as np

# Bundle accuracy, F1, precision and recall into one metric object so that a
# single compute() call (see compute_metrics) reports all four.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of ``x``.

    ``x`` may be a scalar or a NumPy array; arrays are transformed
    element-wise.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def compute_metrics(eval_pred):
    """Turn raw model outputs into the combined classification metrics.

    Args:
        eval_pred: a pair ``(predictions, labels)`` of arrays; predictions
            are raw scores that are passed through ``sigmoid``.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the flattened inputs.

    Note:
        Both arrays are flattened to 1-D before scoring, so every
        (example, class) cell is scored as an independent binary decision.
    """
    raw_scores, references = eval_pred

    # Scores -> probabilities -> hard 0/1 decisions at a 0.5 threshold.
    probabilities = sigmoid(raw_scores)
    hard_predictions = (probabilities > 0.5).astype(int).reshape(-1)

    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=hard_predictions,
        references=flat_references,
    )


In [52]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained encoder with a classification head sized to the label
# vocabulary. problem_type marks the task as multi-label, which matches the
# float multi-hot targets built in preprocess_function and the per-class
# sigmoid thresholding in compute_metrics.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    problem_type="multi_label_classification",
    num_labels=len(classes),
    label2id=class2id,
    id2label=id2class,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
import os
# Project under which the Trainer's Weights & Biases runs are grouped.
os.environ["WANDB_PROJECT"] = "Dilbazlar"

# SECURITY: the API key must never be written into the notebook. It is taken
# from the WANDB_API_KEY environment variable (the variable wandb itself
# consults), so it is None when the variable is unset and wandb falls back to
# its own login flow. The key that was previously committed here should be
# treated as leaked and revoked in the W&B account settings.
wandb_api_key = os.environ.get("WANDB_API_KEY")

In [54]:
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="depression_disorders_model",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Experiment tracking. Wandb = https://docs.wandb.ai/guides/integrations/huggingface
    report_to=["wandb"],
    run_name="depression-specific-augmented-model",
)

# Wire the model, data, batching and metrics together.
# NOTE(review): the "test" split doubles as the evaluation set here, so the
# per-epoch metrics (and the best-checkpoint choice) are measured on it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5873,0.638254,0.789592,0.789562,0.789673,0.789452


Checkpoint destination directory depression_disorders_model\checkpoint-9505 already exists and is non-empty.Saving will proceed but saved results may be invalid.

KeyboardInterrupt

[34m[1mwandb[0m: Encountered an error while tearing down the service manager: [WinError 10054] An existing connection was forcibly closed by the remote host
Exception in thread IntMsgThr:


In [42]:
!huggingface-cli login --token=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

trainer.model.push_to_hub("halilibr/dilbazlar-depression-disorders-recognition-tr-model-acc-82")
trainer.tokenizer.push_to_hub("halilibr/dilbazlar-depression-disorders-recognition-tr-model-acc-82")
print("It was published :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


model.safetensors:   0%|          | 0.00/737M [00:00<?, ?B/s]

It was published :)
