## Create Dataset (Only run when datasets are changed)

In [1]:
import pandas as pd
import os
import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [23]:
depression_data = pd.read_excel('../Data/Depression_Disorders_Data/depression_total_cleaned.xlsx')
depression_data['labels'] = "Depression"

normal_data = pd.read_excel('../Data/normal_dataset_new.xlsx').sample(30000)
normal_data.drop('Text', axis=1, inplace=True)
normal_data = normal_data.rename(columns={'Translated_Text': 'text'})
normal_data['labels'] = 'Normal'

dataset = pd.concat([depression_data, normal_data], axis=0)
dataset = dataset.dropna().reset_index(drop=True)

In [24]:
dataset

Unnamed: 0,text,labels
0,Çok yoğun ve düzensiz geçen günlerin ardından ...,Depression
1,Gerçekten çok kötü hissediyorum ve sakin kalma...,Depression
2,"Her ay, kendime bunun hormonlar olduğunu ve ge...",Depression
3,Ağrım o kadar şiddetli ki bazen kendimi öldürm...,Depression
4,Bu bebeği gerçekten istiyorum ama aynı zamanda...,Depression
...,...,...
57583,"Bu olağanüstü bir akşam yemeği partisi olurdu,...",Normal
57584,Ağabeyim 28 yaşında ve ailenin en büyüğü kim?,Normal
57585,"Şimdi, bu çeki nakde çevirmeye ne dersiniz, ha...",Normal
57586,Her yönetmenle anlaşmazlık yaşadım. Ben 'Ve il...,Normal


### Push Dataset to HuggingFace

In [27]:
from datasets import Dataset, DatasetDict

train, test = train_test_split(dataset, test_size = 0.3, random_state=42)

train_data = Dataset.from_pandas(train, preserve_index=False)
test_data = Dataset.from_pandas(test, preserve_index=False)

hg_data = DatasetDict({
    "train": train_data,
    "test": test_data
})

In [28]:
hg_data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 40311
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 17277
    })
})

In [15]:
!huggingface-cli login --token=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

hg_data.push_to_hub("halilibr/dilbazlar-depression-binary-tr-dataset")
print("Data was pushed :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Data was pushed :)


In [30]:
dataset = hg_data

## Start

In [31]:
import json
import os
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import BertModel, BertTokenizer
import warnings
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm

# Huggingface
from datasets import load_dataset
import huggingface_hub

warnings.filterwarnings('ignore')

# specify GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Device: ", device)

!huggingface-cli login --token=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

Device:  cuda
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


In [32]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")


def tokenize_function(examples):
   label_mapping = {
        "Normal": 0,
        "Depression": 1
    }
   inputs = tokenizer(examples["text"], padding="max_length", max_length=150, truncation=True)

   labels = [label_mapping[label] for label in examples["labels"]]
   labels = torch.tensor(labels)

   return {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
    "labels": labels,
   }

tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)

Map:   0%|          | 0/40311 [00:00<?, ? examples/s]

Map:   0%|          | 0/17277 [00:00<?, ? examples/s]

In [33]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Trainer Approach

In [25]:
id2label = {
    0: "Normal",
    1: "Disorder"
}
label2id = {
    "Normal": 0,
    "Disorder": 1
}

In [26]:
from transformers import AutoModelForTokenClassification 

model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-base-turkish-128k-uncased", num_labels=2)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

### Evaluation Metrics

In [60]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [61]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [62]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="temp_disorder_detect",
    learning_rate=0.00005,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to='tensorboard'
)

In [63]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [64]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Expected input batch_size (4400) to match target batch_size (16).

## Native Pytorch 

In [30]:
# del model
# del trainer
# torch.cuda.empty_cache()

In [34]:
tokenized_datasets = tokenized_dataset.remove_columns(["text"])

In [35]:
tokenized_datasets.set_format("torch")

In [36]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8) 

In [37]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-128k-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [39]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [40]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/15117 [00:00<?, ?it/s]

In [41]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.9839671239219772}

In [42]:
model.push_to_hub('halilibr/dilbazlar-depression-binary-detection-model-acc-98.3')

model.safetensors:   0%|          | 0.00/737M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/halilibr/dilbazlar-depression-binary-detection-model-acc-98.3/commit/f1cc3eb235bf33096e2f10a7cf8c3e86056ac849', commit_message='Upload BertForSequenceClassification', commit_description='', oid='f1cc3eb235bf33096e2f10a7cf8c3e86056ac849', pr_url=None, pr_revision=None, pr_num=None)

In [43]:
tokenizer.push_to_hub("halilibr/dilbazlar-depression-binary-detection-model-acc-98.3")

CommitInfo(commit_url='https://huggingface.co/halilibr/dilbazlar-depression-binary-detection-model-acc-98.3/commit/7dbb0aaa49d1e295da2a83d8d917c3d73fe53768', commit_message='Upload tokenizer', commit_description='', oid='7dbb0aaa49d1e295da2a83d8d917c3d73fe53768', pr_url=None, pr_revision=None, pr_num=None)

## Inference

In [45]:
test_data[100]

{'Text': 'Sosyal ortamlarda rahat bir şekilde iletişim kurabiliyorum.',
 'Label': 'Normal'}

In [42]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

loaded_model = AutoModelForSequenceClassification.from_pretrained("halilibr/dilbazlar-binary-disorder-detection-model-acc-98.5", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("halilibr/dilbazlar-binary-disorder-detection-model-acc-98.5")

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

# Ensure model is in evaluation mode
loaded_model.eval()

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/737M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.30M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [44]:
# Example input


def predict(input_text):
    # Tokenize the input (ensure the tokenizer is appropriate for your model)
    inputs = tokenizer(input_text, max_length=150, padding="max_length", truncation=True, return_tensors="pt")
    
    # Move the inputs to the appropriate device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Disable gradient computation for inference
    with torch.no_grad():
        # Forward pass to get outputs
        outputs = loaded_model(**inputs)
        
        # Get the prediction
        # Note: `AutoModel` might not include logits. Ensure you use the appropriate model class for your task.
        if hasattr(outputs, 'logits'):
            preds = torch.argmax(outputs.logits, dim=-1)
        else:
            # Handle the case where the model does not have logits (e.g., outputs are raw hidden states)
            preds = torch.argmax(outputs[0], dim=-1)
    
    # Convert prediction to numpy array and print (if needed)
    prediction = preds.cpu().numpy()[0]
    print(outputs)
    return prediction

In [47]:
input_text = "geçen annemlerle oturuyorum"

predict(input_text)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.8659,  1.5566]], device='cuda:0'), hidden_states=None, attentions=None)


1

In [39]:
daily_phrases = [
    "Bugün hava çok güzel.",
    "İşe gitmem gerekiyor.",
    "Kahvaltı yaptın mı?",
    "Sinemaya gitmek ister misin?",
    "Bu hafta sonu ne yapıyorsun?",
    "Yeni bir kitap okuyorum.",
    "Bu yemek harika olmuş.",
    "Arkadaşlarla buluşacağım.",
    "Yürüyüşe çıkalım mı?",
    "Yeni bir hobi edindim.",
    "Bu filmi izledin mi?",
    "Tatilde nereye gitmek istersin?",
    "Hangi sporu yapıyorsun?",
    "Ders çalışmam gerekiyor.",
    "Bu hafta sonu bir parti var.",
    "Alışverişe çıkmak istiyorum.",
    "Spor salonuna gidiyorum.",
    "Yeni bir tarif denedim.",
    "Bu hafta yoğun geçecek.",
    "Yeni bir projeye başladım.",
    "Yarın toplantım var.",
    "Bahçede çalışmak istiyorum.",
    "Yeni bir müzik albümü keşfettim.",
    "Bu kitabı önerir misin?",
    "Yarın sabah erken kalkmalıyım.",
    "Ailemi ziyaret edeceğim.",
    "Yeni bir dil öğreniyorum.",
    "Bu dizi çok güzel.",
    "Yoga yapmaya başladım.",
    "İşe erken gitmem gerekiyor.",
    "Bu hafta sonu dinlenmek istiyorum.",
    "Yeni bir restoran denedim.",
    "Hafta sonu planın nedir?",
    "Yeni bir arkadaş edindim.",
    "Bu film harikaydı.",
    "Bisiklet sürmeye çıkacağım.",
    "Bugün spor yapmalıyım.",
    "Yeni bir bilgisayar aldım.",
    "Bu kitabı bitirdim.",
    "Yeni bir iş görüşmem var.",
    "Evin dekorasyonunu değiştirdim.",
    "Bu sabah yürüyüşe çıktım.",
    "Yeni bir telefon aldım.",
    "Bu hafta yoğun çalışacağım.",
    "Hafta sonu tatile çıkıyorum.",
    "Yeni bir proje üzerinde çalışıyorum.",
    "İşler yolunda gidiyor.",
    "Bu kitabı sevdim.",
    "Yemek yapmayı çok seviyorum."
]



0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
