In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the pipe-separated training file, then in one chained step:
#   * add a `pred` column pre-filled with the placeholder string "nan"
#     (it is overwritten later with out-of-fold predictions), and
#   * replace `text` with its lower-cased version.
# NOTE(review): `str.lower()` applies Python's locale-independent case mapping,
# not Turkish casing rules (e.g. "I" becomes "i", not "ı") — confirm this is
# what the uncased tokenizer expects.
df = (
    pd.read_csv("../data/raw/teknofest_train_final.csv", sep="|")
    .assign(
        pred="nan",
        text=lambda frame: frame["text"].str.lower(),
    )
)
df.head()

Unnamed: 0,id,text,is_offensive,target,pred
0,81c11060-a240-4d54-841b-9e2916039e85,çürük dişli,1,INSULT,
1,be80ebbf-b322-4c3b-afa1-94932ea80731,bu adamın islama ve müslümanlara verdiği zarar...,1,RACIST,
2,f99e2513-83ed-4076-ac72-b9e2cff3f049,erkekler zora gelmez,1,SEXIST,
3,83ed2b2e-b815-4f36-9fc4-80a9050cf2d0,utanmazın götüne kazık sokmuşlar bu tıkırtı ne...,1,PROFANITY,
4,d93e05f7-bfdd-4cdb-99d8-3048761b30ff,otomasyon< sistemlerine= doğrudan bağlanabilir,0,OTHER,


In [3]:
# One-hot encoder for the `target` labels, configured to return a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# releases removed the old name, so prefer the new keyword and fall back to the
# old one only when the installed version does not know it yet.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [4]:
# Fit the encoder on the label column (reshaped to a single-feature 2-D array)
# and store each row's one-hot vector as a plain Python list.
target_column = df["target"].to_numpy().reshape(-1, 1)
df["target_encoded"] = enc.fit_transform(target_column).tolist()



In [5]:
# Five shuffled folds, stratified on `target`; the fixed seed keeps the fold
# assignment reproducible across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)

# Materialise the (train_idx, val_idx) pairs so they can be iterated repeatedly.
splits = [
    (train_part, val_part)
    for train_part, val_part in skf.split(df, df["target"])
]

In [6]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; reused below for both the
# tokenizer and the classification model.
model_name = "dbmdz/bert-base-turkish-128k-uncased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [7]:
import numpy as np
import torch


class TeknofestDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Each item is a dict holding the tokenizer outputs converted to tensors plus
    a ``"label"`` entry with the row's ``target_encoded`` vector as a list of
    float32-rounded Python floats.

    Args:
        df: DataFrame with a ``text`` column and a ``target_encoded`` column.
            Rows are addressed positionally via ``iloc``.
        num_classes: Stored on the instance; not used by this class itself.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True)``
            and returning a mapping of lists. When ``None`` (the default) the
            module-level ``tokenizer`` is looked up at item-access time, which
            is the original behaviour.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, num_classes=5, tokenizer=None, max_length=128):
        self.df = df
        self.num_classes = num_classes
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its features together with its label."""
        row = self.df.iloc[idx]
        text, label = row["text"], row["target_encoded"]

        # Resolve the tokenizer lazily so instances built without an explicit
        # tokenizer keep using the notebook-global one.
        tokenize = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = tokenize(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        item = {key: torch.tensor(val) for key, val in encoding.items()}
        # The label stays a list of floats (not a tensor), as in the original.
        item["label"] = np.array(label).astype(np.float32).tolist()
        return item

In [8]:
from transformers import AutoModelForSequenceClassification
from datasets import load_metric
from transformers import TrainingArguments, Trainer
from torch import nn

def multiclass_logit2label(logits):
    """Turn per-class logits into one-hot rows marking the highest-scoring class.

    The arg-max is taken on the raw logits. Applying a sigmoid first is
    unnecessary because it is monotonic, and it is harmful in float32: large
    logits all saturate to exactly 1.0 (and very negative ones to 0.0), which
    creates ties and can make ``argmax`` select the wrong class.

    Args:
        logits: 2-D array-like of shape ``(n_samples, n_classes)`` — a NumPy
            array, a nested list, or a CPU tensor.

    Returns:
        ``numpy.ndarray`` of dtype float32 with the same shape as ``logits``,
        holding a single 1.0 per row at the position of the largest logit.
    """
    scores = torch.as_tensor(logits, dtype=torch.float32)

    max_idx = torch.argmax(scores, dim=1, keepdim=True)
    # zeros_like allocates a new tensor, so the caller's input is never mutated.
    one_hot = torch.zeros_like(scores)
    one_hot.scatter_(1, max_idx, 1.0)
    return one_hot.numpy()

In [14]:
# Per-device batch size, shared by training and evaluation.
batch_size = 64

# Number of output classes, taken from the fitted one-hot encoder so the model
# head always matches the width of the label vectors.
num_labels = len(enc.categories_[0])

# Column position of "pred", used for purely positional writes below.
pred_col = df.columns.get_loc("pred")

# Out-of-fold cross-validation: for every fold, fine-tune a fresh copy of the
# pretrained model on the training part, predict the held-out part and store
# those predictions in df["pred"].
# tqdm wraps the `splits` list (not the `enumerate` generator) so it knows the
# total number of folds and can render a real progress bar.
for split_id, (train_idx, val_idx) in enumerate(tqdm(splits, desc="folds")):
    train_dataset = TeknofestDataset(df.iloc[train_idx].reset_index(drop=True))
    val_dataset = TeknofestDataset(df.iloc[val_idx].reset_index(drop=True))

    # Reload the pretrained checkpoint each fold so no weights leak across folds.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
    )

    # One output directory per fold; evaluate and checkpoint after every epoch.
    # The commented-out options were left disabled by the author, so the
    # library defaults apply for them.
    training_args = TrainingArguments(
        f"teknofest23_{model_name.split('/')[-1]}_fold{split_id}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # learning_rate=3e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=5,
        # weight_decay=0.01,
        load_best_model_at_end=False,
        # metric_for_best_model="f1",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    # load_best_model_at_end is False, so these predictions come from the
    # weights as they are at the end of training.
    fold_output = trainer.predict(val_dataset)

    # logits -> one-hot rows -> original label strings (1-D array).
    preds = enc.inverse_transform(
        multiclass_logit2label(fold_output.predictions)
    ).flatten()

    # `val_idx` holds row positions, so write positionally; label-based `.loc`
    # would only be correct while df keeps its default RangeIndex.
    df.iloc[val_idx, pred_col] = preds

0it [00:00, ?it/s]loading configuration file config.json from cache at /container_cache/huggingface/hub/models--dbmdz--bert-base-turkish-128k-uncased/snapshots/f5287aecee60f0c597c11c34341cb92d31c0e71b/config.json
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-turkish-128k-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use

Epoch,Training Loss,Validation Loss
1,No log,0.098842
2,No log,0.095592
3,No log,0.097167
4,0.109100,0.091989
5,0.109100,0.090763


***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold0/checkpoint-158
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold0/checkpoint-158/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold0/checkpoint-158/pytorch_model.bin
tokenizer config file saved in teknofest23_bert-base-turkish-128k-uncased_fold0/checkpoint-158/tokenizer_config.json
Special tokens file saved in teknofest23_bert-base-turkish-128k-uncased_fold0/checkpoint-158/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold0/checkpoint-316
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold0/checkpoint-316/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold0/checkpoint-316/pytorch_model.bin
tokenizer config file saved in tek

1it [02:46, 166.92s/it]loading configuration file config.json from cache at /container_cache/huggingface/hub/models--dbmdz--bert-base-turkish-128k-uncased/snapshots/f5287aecee60f0c597c11c34341cb92d31c0e71b/config.json
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-turkish-128k-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
 

Epoch,Training Loss,Validation Loss
1,No log,0.099233
2,No log,0.075663
3,No log,0.079283
4,0.097800,0.078478
5,0.097800,0.076619


***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold1/checkpoint-158
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold1/checkpoint-158/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold1/checkpoint-158/pytorch_model.bin
tokenizer config file saved in teknofest23_bert-base-turkish-128k-uncased_fold1/checkpoint-158/tokenizer_config.json
Special tokens file saved in teknofest23_bert-base-turkish-128k-uncased_fold1/checkpoint-158/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold1/checkpoint-316
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold1/checkpoint-316/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold1/checkpoint-316/pytorch_model.bin
tokenizer config file saved in tek

2it [05:33, 166.43s/it]loading configuration file config.json from cache at /container_cache/huggingface/hub/models--dbmdz--bert-base-turkish-128k-uncased/snapshots/f5287aecee60f0c597c11c34341cb92d31c0e71b/config.json
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-turkish-128k-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
 

Epoch,Training Loss,Validation Loss
1,No log,0.094977
2,No log,0.08321
3,No log,0.081473
4,0.105100,0.090126
5,0.105100,0.087209


***** Running Evaluation *****
  Num examples = 2523
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold2/checkpoint-158
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold2/checkpoint-158/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold2/checkpoint-158/pytorch_model.bin
tokenizer config file saved in teknofest23_bert-base-turkish-128k-uncased_fold2/checkpoint-158/tokenizer_config.json
Special tokens file saved in teknofest23_bert-base-turkish-128k-uncased_fold2/checkpoint-158/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2523
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold2/checkpoint-316
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold2/checkpoint-316/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold2/checkpoint-316/pytorch_model.bin
tokenizer config file saved in tek

3it [08:13, 163.75s/it]loading configuration file config.json from cache at /container_cache/huggingface/hub/models--dbmdz--bert-base-turkish-128k-uncased/snapshots/f5287aecee60f0c597c11c34341cb92d31c0e71b/config.json
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-turkish-128k-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
 

Epoch,Training Loss,Validation Loss
1,No log,0.088655
2,No log,0.085967
3,No log,0.083617
4,0.091100,0.076185
5,0.091100,0.079169


***** Running Evaluation *****
  Num examples = 2523
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold3/checkpoint-158
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold3/checkpoint-158/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold3/checkpoint-158/pytorch_model.bin
tokenizer config file saved in teknofest23_bert-base-turkish-128k-uncased_fold3/checkpoint-158/tokenizer_config.json
Special tokens file saved in teknofest23_bert-base-turkish-128k-uncased_fold3/checkpoint-158/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2523
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold3/checkpoint-316
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold3/checkpoint-316/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold3/checkpoint-316/pytorch_model.bin
tokenizer config file saved in tek

4it [10:54, 162.55s/it]loading configuration file config.json from cache at /container_cache/huggingface/hub/models--dbmdz--bert-base-turkish-128k-uncased/snapshots/f5287aecee60f0c597c11c34341cb92d31c0e71b/config.json
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-turkish-128k-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
 

Epoch,Training Loss,Validation Loss
1,No log,0.089254
2,No log,0.081264
3,No log,0.080185
4,0.102400,0.077118
5,0.102400,0.077072


***** Running Evaluation *****
  Num examples = 2523
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold4/checkpoint-158
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold4/checkpoint-158/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold4/checkpoint-158/pytorch_model.bin
tokenizer config file saved in teknofest23_bert-base-turkish-128k-uncased_fold4/checkpoint-158/tokenizer_config.json
Special tokens file saved in teknofest23_bert-base-turkish-128k-uncased_fold4/checkpoint-158/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2523
  Batch size = 64
Saving model checkpoint to teknofest23_bert-base-turkish-128k-uncased_fold4/checkpoint-316
Configuration saved in teknofest23_bert-base-turkish-128k-uncased_fold4/checkpoint-316/config.json
Model weights saved in teknofest23_bert-base-turkish-128k-uncased_fold4/checkpoint-316/pytorch_model.bin
tokenizer config file saved in tek

5it [13:32, 162.50s/it]


In [11]:
# Per-class precision / recall / F1 of the out-of-fold predictions against the
# true labels, printed with four decimal places.
oof_report = classification_report(df["target"], df["pred"], digits=4)
print(oof_report)

              precision    recall  f1-score   support

      INSULT     0.9051    0.8991    0.9021      2419
       OTHER     0.9538    0.9422    0.9480      3616
   PROFANITY     0.9582    0.9462    0.9522      2398
      RACIST     0.9335    0.9551    0.9442      2072
      SEXIST     0.9387    0.9574    0.9480      2112

    accuracy                         0.9394     12617
   macro avg     0.9379    0.9400    0.9389     12617
weighted avg     0.9394    0.9394    0.9394     12617

