In [1]:
import torch, os
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset

In [2]:
from torch import cuda
# Pick the compute device: use the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [3]:
# Read the labelled corpus and shuffle all rows with a fixed seed, so the
# positional train/validation/test slices taken later are reproducible.
data = (
    pd.read_csv("depression_anxiety_final_data_for_training.csv")
    .sample(frac=1.0, random_state=42)
)

data.head()

Unnamed: 0,Label,Text
11292,Anksiyete,Kelimenin tam anlamıyla bazen ağız kaslarım ha...
6752,Anksiyete,Birini almaktan korkuyorum çünkü ilaçları aşır...
4695,Anksiyete,"Evde yalnızım, pencerelerin olmadığı ve dışarı..."
6146,Anksiyete,Endişem nedeniyle okul çalışmalarımın çoğunu z...
1413,Depresyon,Gerçek ve saf mutluluğun aslında nasıl bir his...


In [4]:
# Summarise column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11669 entries, 11292 to 7270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   11669 non-null  object
 1   Text    11668 non-null  object
dtypes: object(2)
memory usage: 273.5+ KB


In [5]:
# Keep only rows whose Text is a real Python string (this drops the missing value reported
# by data.info()). The explicit .copy() makes the result an independent frame, so adding
# columns to it in later cells cannot raise SettingWithCopyWarning.
data = data[data['Text'].apply(lambda x: isinstance(x, str))].copy()

In [6]:
# Collect the distinct class names. Whitespace is stripped BEFORE de-duplicating, so variants
# such as "Depresyon " and "Depresyon" collapse into one class instead of inflating the label
# count. dict.fromkeys keeps the order of first appearance.
labels = list(dict.fromkeys(s.strip() for s in data['Label']))
labels

['Anksiyete', 'Depresyon']

In [7]:
# Print each class name on its own line (the enumerate index was never used).
for value in labels:
    print(value)

Anksiyete
Depresyon


In [8]:
# Number of target classes.
NUM_LABELS = len(labels)

# Lookup tables between integer class ids and class names (both directions);
# they are handed to the model configuration when the model is created.
id2label = dict(enumerate(labels))

label2id = {name: idx for idx, name in id2label.items()}

In [9]:
# Display the class-name -> class-id mapping.
label2id

{'Anksiyete': 0, 'Depresyon': 1}

In [10]:
# Display the class-id -> class-name mapping.
id2label

{0: 'Anksiyete', 1: 'Depresyon'}

In [11]:
# Encode every (whitespace-stripped) class name as its integer id in a new "labels" column.
data["labels"] = [label2id[name.strip()] for name in data["Label"]]

In [12]:
# Check that the new integer "labels" column lines up with the textual Label column.
data.head()

Unnamed: 0,Label,Text,labels
11292,Anksiyete,Kelimenin tam anlamıyla bazen ağız kaslarım ha...,0
6752,Anksiyete,Birini almaktan korkuyorum çünkü ilaçları aşır...,0
4695,Anksiyete,"Evde yalnızım, pencerelerin olmadığı ve dışarı...",0
6146,Anksiyete,Endişem nedeniyle okul çalışmalarımın çoğunu z...,0
1413,Depresyon,Gerçek ve saf mutluluğun aslında nasıl bir his...,1


In [13]:
# Load the tokenizer that matches the Turkish BERT checkpoint. The sequence limit is set via
# `model_max_length` (the constructor-level setting that `truncation=True` falls back to),
# so inputs are clipped to the 512 positions the model's position embeddings support.
tokenizer = BertTokenizerFast.from_pretrained("dbmdz/bert-base-turkish-uncased", model_max_length=512)

In [14]:
# Load the Turkish BERT encoder with a newly initialised classification head sized for our
# classes, then move it to the selected device. The bare call on the last line also echoes
# the module tree as the cell output.
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# The frame was shuffled when it was loaded, so contiguous positional slices give random splits.
SIZE = data.shape[0]
train_end = int(0.7 * SIZE)   # end of the training slice
val_end = int(0.85 * SIZE)    # end of the validation slice

# Training set (first 70%)
train_texts = list(data["Text"].iloc[:train_end])
train_labels = list(data["labels"].iloc[:train_end])

# Validation set (next 15%)
val_texts = list(data["Text"].iloc[train_end:val_end])
val_labels = list(data["labels"].iloc[train_end:val_end])

# Test set (last 15%)
test_texts = list(data["Text"].iloc[val_end:])
test_labels = list(data["labels"].iloc[val_end:])

In [16]:
# Number of examples in the training split.
len(train_texts)

8167

In [17]:
# Sizes of the train / validation / test splits.
len(train_texts), len(val_texts), len(test_texts)

(8167, 1750, 1751)

In [18]:
# Tokenize each split with identical settings: truncate over-long texts and pad every
# sequence in a split to that split's longest sequence.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [19]:
class DataLoader(Dataset):
    """
    Map-style dataset that pairs tokenizer encodings with integer class labels.

    Note: despite its name, this is a ``torch.utils.data.Dataset`` subclass (one item per
    index), not a batching data loader.
    """

    def __init__(self, encodings, labels):
        """
        Stores the encodings and labels without copying them.

        Args:
            encodings (dict-like): Maps each tokenizer field
                                   (e.g., 'input_ids', 'token_type_ids', 'attention_mask')
                                   to a per-example sequence of values.
            labels (list): Integer class label of each example.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """
        Reports the dataset size.

        Returns:
            (int): Number of labels, i.e. number of examples.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Builds the tensors of a single example.

        Args:
            idx (int): Position of the example to fetch.

        Returns:
            sample (dict): One tensor per tokenizer field, plus the example's
                           class label under the key 'labels'.
        """
        sample = {}
        # Convert the idx-th entry of every tokenizer field into a tensor
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Attach the target label under the key 'labels'
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [20]:
# Wrap each split's encodings and labels in the custom dataset class defined above.
train_dataloader = DataLoader(encodings=train_encodings, labels=train_labels)
val_dataloader = DataLoader(encodings=val_encodings, labels=val_labels)
test_dataset = DataLoader(encodings=test_encodings, labels=test_labels)

In [21]:
from transformers import TrainingArguments, Trainer

In [22]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Computes accuracy and macro-averaged precision, recall and F1 for a set of predictions.

    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array with one row per observation and one
              column per class, holding per-class scores. The Trainer passes the model's
              raw logits rather than probabilities; the argmax is the same either way.

    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score (per-class F1, averaged with equal weight
              per class).
            - Precision (float): The macro precision (per-class TP / (TP + FP), averaged).
            - Recall (float): The macro recall (per-class TP / (TP + FN), averaged).
    """
    # True class ids
    labels = pred.label_ids

    # Predicted class id = column with the highest score in each row
    preds = pred.predictions.argmax(-1)

    # Macro-averaged precision / recall / F1. zero_division=0 scores a class that was never
    # predicted as 0 (the same value the default yields) without an UndefinedMetricWarning
    # at every evaluation step early in training.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )

    # Fraction of exact matches
    acc = accuracy_score(labels, preds)

    # Metric names are used as-is by the Trainer's logging (prefixed with "eval_")
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [23]:
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='./TTC4900Model',
    do_train=True,
    do_eval=True,
    # Number of passes over the training set
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Number of steps used for the linear learning-rate warmup
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
    # TensorBoard log directory
    logging_dir='./multi-class-logs',
    logging_steps=50,
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version before upgrading.
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    # fp16 mixed precision is only available on CUDA. Enabling it unconditionally makes
    # TrainingArguments fail on CPU-only machines, so it is switched on only when a GPU exists.
    fp16=torch.cuda.is_available(),
    # Reload the best checkpoint (lowest validation loss by default) when training finishes
    load_best_model_at_end=True
)



In [24]:
# Assemble the Trainer: the model to fine-tune, the hyper-parameters defined above,
# the train/validation datasets, and the metric callback run at every evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics,
)

In [25]:
# Fine-tune the model; evaluation on the validation set runs every `eval_steps` steps.
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the trained model on each split and tabulate the first five reported metrics,
# one row per split.
q = [
    trainer.evaluate(eval_dataset=split)
    for split in (train_dataloader, val_dataloader, test_dataset)
]

pd.DataFrame(q, index=["train","val","test"]).iloc[:,:5]

In [None]:
def predict(text):
    """
    Predicts the class of one or more texts with the fine-tuned single-label classifier.

    The model is trained with one integer label per example (mutually exclusive classes),
    so probabilities come from a softmax over the logits and exactly one class — the most
    probable one — is returned per input.

    Args:
        text (str or list of str): The input text(s) to classify.

    Returns:
        probs (torch.Tensor): Class probabilities of shape (n_inputs, num_labels);
            each row sums to 1.
        pred_labels (list of str): The predicted class name for each input, in order.
    """
    # Tokenize and move the tensors to the device the model actually lives on,
    # instead of assuming a GPU is present.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(model.device)

    # Inference only: disable dropout and skip gradient bookkeeping
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax turns the logits into a probability distribution over the classes
    probs = torch.softmax(outputs.logits, dim=-1)

    # Index of the most probable class for every input row
    pred_label_indices = probs.argmax(dim=-1)

    # Map the predicted class indices to the actual class labels
    pred_labels = [model.config.id2label[idx.item()] for idx in pred_label_indices]

    return probs, pred_labels

# Example usage: classify one Turkish sentence and show both returned values.
text = "Bu örnek bir cümledir"
probs, pred_labels = predict(text)
print(f"Probabilities: {probs}\nPredicted Labels: {pred_labels}")


In [None]:
# Test with an example text in Turkish
text = "gerçekten çok kötü hissediyorum."
predict(text)