In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,f1_score

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:21"

In [3]:
#external data with sources
url = 'https://drive.google.com/file/d/1tjOIxm-59iNot5AadrHOQg0pryZ5GDJ2/view?usp=share_link'
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
df = pd.read_csv(path)
####

df = df.dropna(subset=['text']).reset_index(drop=True)
df["pred"] = "nan"
df["text"] = df["text"].str.lower()
df.head()

Unnamed: 0,text,label,source,pred
0,en güzel uyuyan insan ödülü jeon jungkook'a gi...,0,https://coltekin.github.io/offensive-turkish/,
1,"mekanı cennet olsun, saygılar sayın avukatımız...",0,https://coltekin.github.io/offensive-turkish/,
2,kızlar aranızda kas yığını beylere düşenler ol...,0,https://coltekin.github.io/offensive-turkish/,
3,biraz ders çalışayım. tembellik ve uyku düşman...,0,https://coltekin.github.io/offensive-turkish/,
4,trezeguet yerine el sharawy daha iyi olmaz mı,0,https://coltekin.github.io/offensive-turkish/,


In [4]:
df.label.value_counts(normalize=True)

0    0.763434
1    0.236566
Name: label, dtype: float64

In [5]:
def assign_split_ids(input_df: pd.DataFrame,
                     fold_name: str,
                     fold_count: int,
                     seed: int):
    """
    Split data for training and evaluation purposes.
    
    ---------
    :param input_df: Competition dataframe with text and labels.
    :param fold_name: Fold column name to be assigned.
    :param fold_count: Split count for cross-validation.
    :return: Competition dataframe with local or private CV folds.
    """
    skf = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=seed)
    splits = list(skf.split(input_df, input_df["label"]))
    input_df[fold_name] = 0
    for split_id, split in enumerate(splits):
        input_df.loc[split[1], fold_name] = split_id
    return input_df

In [6]:
df = assign_split_ids(df,'local_cv',10,123)
#df["target_encoded"] = enc.fit_transform(df["label"].values.reshape(-1,1)).tolist()

In [7]:
from transformers import AutoTokenizer

model_name = "dbmdz/bert-base-turkish-128k-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
    
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels=2,
                                                           ignore_mismatched_sizes=True)  

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio

In [8]:
X_train = df[df.local_cv != 0]#['text']
X_val = df[df.local_cv == 0]#['text']

#y_train = df[df.local_cv != 0]['label']
#y_val = df[df.local_cv == 0]['label']

In [9]:
train_dataset = Dataset.from_pandas(X_train)
val_dataset = Dataset.from_pandas(X_val)

In [10]:
def encode_dataset(data):
    return tokenizer(data['text'], padding=True, truncation=True,max_length=128)

train_dataset = train_dataset.map(encode_dataset, batched=True)
val_dataset = val_dataset.map(encode_dataset, batched=True)

  0%|          | 0/57 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [11]:
def modify_labels(data):
    data['label'] = np.where(data['label'] == 1, 1, 0)
    return data

train_dataset = train_dataset.map(modify_labels)
val_dataset = val_dataset.map(modify_labels)

  0%|          | 0/56628 [00:00<?, ?ex/s]

  0%|          | 0/6293 [00:00<?, ?ex/s]

In [12]:
training_args = TrainingArguments(
    output_dir=f"teknofest23_v2_{model_name.split('/')[-1]}_fold0",         
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    #evaluation_strategy="steps",
    #eval_steps=100,
    learning_rate=3e-5,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,   
    num_train_epochs=10,              
    weight_decay=0.01,               
    push_to_hub=False,               
    #logging_dir='./logs',            
    #logging_steps=500,
    load_best_model_at_end=True,     
    metric_for_best_model="roc_auc",
)

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()
    return {"roc_auc": roc_auc_score(labels, predictions)}

In [None]:
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,            
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__, source, pred, local_cv. If text, __index_level_0__, source, pred, local_cv are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 56628
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 17700
  Number of trainable parameters = 184346882
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,0.3163,0.285645,0.927726
2,0.2206,0.306875,0.928025
3,0.1392,0.415569,0.924259


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__, source, pred, local_cv. If text, __index_level_0__, source, pred, local_cv are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 6293
  Batch size = 64
Saving model checkpoint to teknofest23_v2_bert-base-turkish-128k-uncased_fold0/checkpoint-1770
Configuration saved in teknofest23_v2_bert-base-turkish-128k-uncased_fold0/checkpoint-1770/config.json
Model weights saved in teknofest23_v2_bert-base-turkish-128k-uncased_fold0/checkpoint-1770/pytorch_model.bin
tokenizer config file saved in teknofest23_v2_bert-base-turkish-128k-uncased_fold0/checkpoint-1770/tokenizer_config.json
Special tokens file saved in teknofest23_v2_bert-base-turkish-128k-uncased_fold0/checkpoint-1770/special_tokens_map.json
The following columns in the evaluat

In [17]:
preds = trainer.predict(val_dataset)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: source, pred, __index_level_0__, text, local_cv. If source, pred, __index_level_0__, text, local_cv are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 6293
  Batch size = 64


In [38]:
pd.Series(np.concatenate(preds.label_ids))#.value_counts()

0       0
1       0
2       0
3       0
4       0
       ..
6288    0
6289    0
6290    0
6291    0
6292    0
Length: 6293, dtype: int64

In [52]:
val_pred_probs = torch.nn.functional.softmax(torch.Tensor(preds.predictions), dim=-1).numpy()

In [56]:
roc_auc_score(X_val['label'],val_pred_probs[:,1])

0.9311952654185089

In [79]:
f1_score(X_val['label'], (val_pred_probs[:,1]>0.3).astype(int))

0.7460937500000001