## Create Dataset (Only run when datasets are changed)

In [37]:
import pandas as pd
import os
import glob
from tqdm import tqdm

In [49]:
disorder_texts = []
for expurgated_data_path in glob.glob('../Data/Expurgated_Data/Reddit/*'):
    #file_name = expurgated_data_path.split('/')[-1].split('.')[0]
    disorder_df = pd.read_excel(expurgated_data_path)
    disorder_texts.extend(disorder_df['Translated_Text'].values)

disorder_df = pd.DataFrame({
    "Text": disorder_texts,
    "Label": "Disorder"
})

disorder_df.head()

Unnamed: 0,Text,Label
0,"Sıkışmış hissetmek (yerine tekrar giriş yok, y...",Disorder
1,Yakın zamanda başka bir şehre taşındım ve nele...,Disorder
2,"Panik atak geçirmenin eşiğindeydim, sadece bun...",Disorder
3,Geçen hafta maruz kalma terapimi yaparken soka...,Disorder
4,Bazen özgüven ve benlik imajıyla çok mücadele ...,Disorder


In [39]:
normal_data = pd.read_csv('../Data/Disorder_Detection_Datasets/normal_dataset_translated_chunk_1.csv')
normal_data.head()

Unnamed: 0,Text,Label,Translated_Text
0,So let's be clear here. I'm totally fine.\n\nI...,Normal,Yani burada açık olalım. Ben tamamen iyiyim. B...
1,"I’m a 35 year old man, the sole income of a ho...",Normal,"Ben 35 yaşında bir adamım, 3 kişilik bir evin ..."
2,My life is over I’m about to be 21 and I alrea...,Normal,Hayatım bitti 21 olmak üzereyim ve zaten ölü h...
3,I just want to outlet. I’m running out of opti...,Normal,Sadece çıkış yapmak istiyorum. Seçeneklerim tü...
4,Every time I start to feel depressed (It comes...,Normal,Ne zaman depresif hissetmeye başlasam (Dalgala...


In [40]:
total_df = pd.DataFrame()

total_df['Text'] = normal_data['Translated_Text']
total_df['Label'] = normal_data['Label']

total_df = pd.concat([total_df, disorder_df], axis=0).reset_index(drop=True)

In [41]:
total_df['Label'].value_counts()

Label
Disorder    8850
Normal      5000
Name: count, dtype: int64

In [43]:
total_df.dropna(inplace=True)

total_df.to_excel('../Data/Disorder_Detection_Datasets/main_dataset.xlsx', index=False)
print("Saved")

Saved


### Push Dataset to HuggingFace

In [52]:
from datasets import Dataset, DatasetDict

train, test = train_test_split(total_df, test_size = 0.3, random_state=42)

train_data = Dataset.from_pandas(train, preserve_index=False)
test_data = Dataset.from_pandas(test, preserve_index=False)

hg_data = DatasetDict({
    "train": train_data,
    "test": test_data
})

In [53]:
hg_data

DatasetDict({
    train: Dataset({
        features: ['Text', 'Label'],
        num_rows: 9694
    })
    test: Dataset({
        features: ['Text', 'Label'],
        num_rows: 4155
    })
})

In [54]:
!huggingface-cli login --token=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

hg_data.push_to_hub("halilibr/dilbazlar-binary-disorder-detection-dataset", private=True)
print("Data was pushed :)")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/407 [00:00<?, ?B/s]

Data was pushed :)


## Start

In [6]:
import json
import os
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import BertModel, BertTokenizer
import warnings
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm

# Huggingface
from datasets import load_dataset
import huggingface_hub

warnings.filterwarnings('ignore')

# specify GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Device: ", device)


!huggingface-cli login --token=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

Device:  cuda
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\halilibrahim.hatun\.cache\huggingface\token
Login successful


In [7]:
dataset = load_dataset("halilibr/dilbazlar-binary-disorder-detection-dataset")
dataset["train"][100]

{'Text': 'Kaygılarım neredeyse yok oluyor.', 'Label': 'Disorder'}

In [8]:
dataset = dataset.rename_column("Label", "labels")
dataset = dataset.rename_column("Text", "text")

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")


def tokenize_function(examples):
   label_mapping = {
        "Normal": 0,
        "Disorder": 1
    }
   inputs = tokenizer(examples["text"], max_length=150, padding="max_length", truncation=True)

   labels = [label_mapping[label] for label in examples["labels"]]
   labels = torch.tensor(labels)

   return {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
    "labels": labels,
   }

tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)

In [11]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Trainer Approach

In [57]:
id2label = {
    0: "Normal",
    1: "Disorder"
}
label2id = {
    "Normal": 0,
    "Disorder": 1
}

In [1]:
from transformers import AutoModelForTokenClassification 

model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-base-turkish-128k-uncased", num_labels=2)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

### Evaluation Metrics

In [60]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [61]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [62]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="temp_disorder_detect",
    learning_rate=0.00005,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to='tensorboard'
)

In [63]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [64]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Expected input batch_size (4400) to match target batch_size (16).

## Native Pytorch 

In [12]:
# del model
# del trainer
# torch.cuda.empty_cache()

NameError: name 'model' is not defined

In [12]:
tokenized_datasets = tokenized_dataset.remove_columns(["text"])

In [13]:
tokenized_datasets.set_format("torch")

In [14]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8)

In [15]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-128k-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [70]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [71]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3636 [00:00<?, ?it/s]

In [134]:
for batch in eval_dataloader:
    print(batch)

{'labels': tensor([1, 1, 1, 0, 0, 0, 1, 0]), 'input_ids': tensor([[    2, 16428, 14779,  ...,     0,     0,     0],
        [    2,  5389,    16,  ...,     0,     0,     0],
        [    2, 75577,  2257,  ...,     0,     0,     0],
        ...,
        [    2, 13336,  1023,  ...,     0,     0,     0],
        [    2, 39348, 75582,  ...,     0,     0,     0],
        [    2,  2299, 11497,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
{'labels': tensor([1, 1, 1, 1, 0, 0, 0, 1]), 'input_ids': tensor([[     2,   7747,   1947,  ...,      0,      0,      0],
        [     2,  73427,   2884,  ...,      0,      0,      0],
        [     2, 101061,   2537,  ...,      0,      0,      0],
        ...,
        [     2,   1964,  40395,  ...,      0,      0,      0],
        [     2

In [72]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.9237063778580024}

In [104]:
model.push_to_hub('halilibr/dilbazlar-binary-disorder-detection-model-acc-92')

CommitInfo(commit_url='https://huggingface.co/halilibr/dilbazlar-binary-disorder-detection-model-acc-92/commit/c1a11ffcbaab88cf45414db2e0dbf1679d394869', commit_message='Upload BertForSequenceClassification', commit_description='', oid='c1a11ffcbaab88cf45414db2e0dbf1679d394869', pr_url=None, pr_revision=None, pr_num=None)

In [101]:
tokenizer.push_to_hub("halilibr/dilbazlar-binary-disorder-detection-model-acc-92")

CommitInfo(commit_url='https://huggingface.co/halilibr/dilbazlar-binary-disorder-detection-model-acc-92/commit/5e74160ed141476645ca0a0e9a9c2c6c703b36ce', commit_message='Upload tokenizer', commit_description='', oid='5e74160ed141476645ca0a0e9a9c2c6c703b36ce', pr_url=None, pr_revision=None, pr_num=None)

In [124]:
from transformers import AutoModelForSequenceClassification

loaded_model = AutoModelForSequenceClassification.from_pretrained("halilibr/dilbazlar-binary-disorder-detection-model-acc-92", num_labels=2)

In [125]:
loaded_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [115]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("halilibr/dilbazlar-binary-disorder-detection-model-acc-92")
# loaded_model = AutoModel.from_pretrained("halilibr/dilbazlar-binary-disorder-detection-model-acc-92", num_labels=2)

In [140]:
total_df['Text'][0]

'Yani burada açık olalım. Ben tamamen iyiyim. Bu yıl benim için oldukça havalı ve sık sık bir olay olmayan birkaç arkadaş edindim. Birkaçı benimle takılmak istedi ama lanet olasıca bir şey daha oldu. Ama tabii ki, geri itmeye başladılar. Çok yakınlaşamazsın çünkü hepimiz onların ayrılacağını biliyoruz. Aynada gülümsemeye çalıştım ama doğru görünmedi. Sonra kendimi rahatsız etmek için düşündüm.'

In [141]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

loaded_model = AutoModelForSequenceClassification.from_pretrained("halilibr/dilbazlar-binary-disorder-detection-model-acc-92", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("halilibr/dilbazlar-binary-disorder-detection-model-acc-92")

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

# Ensure model is in evaluation mode
loaded_model.eval()

# Example input
input_text = "geçen agalarla bir muhabbet ediyoruz görmen lazım"

# Tokenize the input (ensure the tokenizer is appropriate for your model)
inputs = tokenizer(input_text, max_length=150, padding="max_length", truncation=True, return_tensors="pt")

# Move the inputs to the appropriate device
inputs = {k: v.to(device) for k, v in inputs.items()}

print(inputs)

# Disable gradient computation for inference
with torch.no_grad():
    # Forward pass to get outputs
    outputs = loaded_model(**inputs)
    
    # Get the prediction
    # Note: `AutoModel` might not include logits. Ensure you use the appropriate model class for your task.
    if hasattr(outputs, 'logits'):
        preds = torch.argmax(outputs.logits, dim=-1)
    else:
        # Handle the case where the model does not have logits (e.g., outputs are raw hidden states)
        preds = torch.argmax(outputs[0], dim=-1)

# Convert prediction to numpy array and print (if needed)
prediction = preds.cpu().numpy()[0]

# Print the predicted class
print("Predicted class:", prediction)


{'input_ids': tensor([[     2,  24727,  26581,   2638,   1947,  10545,   5423, 106282,   1929,
           4506,      3,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              

In [136]:
total_df = pd.read_excel('../Data/Disorder_Detection_Datasets/main_dataset.xlsx')

In [138]:
total_df['Text'][0]

'Yani burada açık olalım. Ben tamamen iyiyim. Bu yıl benim için oldukça havalı ve sık sık bir olay olmayan birkaç arkadaş edindim. Birkaçı benimle takılmak istedi ama lanet olasıca bir şey daha oldu. Ama tabii ki, geri itmeye başladılar. Çok yakınlaşamazsın çünkü hepimiz onların ayrılacağını biliyoruz. Aynada gülümsemeye çalıştım ama doğru görünmedi. Sonra kendimi rahatsız etmek için düşündüm.'