In [48]:
import os

import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset, load_metric


# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Read the source CSV.
df = pd.read_csv('output v6.csv')

# Integer class id for each textual label.
# NOTE(review): the None key presumably serves rows that end up without a
# textual label (codes not listed below map to NaN) — confirm downstream.
label_mapping = {
    None: 0,
    "positive": 1,
    "negative": 2,
    "positive|negative": 3,
}

# Translate the numeric `value` codes into textual labels, then keep only the
# columns that are needed afterwards.
value_to_label = {1: 'positive', 2: 'negative', 3: 'positive|negative'}
df = df.assign(label=df['value'].map(value_to_label))[['text', 'label']]

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
texts, labels = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [10]:
# NOTE(review): the checkpoint name ends in "-uncased" while lower-casing is
# switched off here — confirm this combination is intentional.
tokenizer = BertTokenizer.from_pretrained(
    'dbmdz/bert-base-turkish-128k-uncased',
    do_lower_case=False,
)

In [49]:
# Turkish BERT checkpoint with a classification head sized to the label set.
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-128k-uncased",
    num_labels=len(label_mapping),
    # output_attentions = False,
    # output_hidden_states = False,
)

# NOTE: To continue training from previously saved weights, un-comment the two
# lines below; they load the saved state dict into the freshly built model.
#
# state_dict = torch.load('./results/checkpoint-final/pytorch_model.bin', map_location=device)
# model.load_state_dict(state_dict)

# Last expression of the cell, so the notebook also displays the module repr.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [38]:
from datasets import Dataset

# Dataset preparation: coerce every text entry in both splits to str.
X_train, X_test = X_train.astype(str), X_test.astype(str)

In [39]:
# Preview the first rows of each split (train first, then test).
for split_texts in (X_train, X_test):
    print(split_texts.head())

13244    Telefon faturamı incelediğimde iletişim ücretl...
5421     Türk Telekom alt yapısındaki Pttcell müşterisi...
15258                              Rezaletin bini bir para
29066                                     yönlendiriyorlar
28667                              asansörde mahsur kaldık
Name: text, dtype: object
13066    #vakifbank kredi başvurum hala sonuçlanmadı. A...
27521                                boş yere açık kalıyor
3842                               beklediğimden çok geldi
2767                                    borcunuz yok diyor
19632    Turk Telekom ev adresimi ev ve cep telefonu bi...
Name: text, dtype: object


In [40]:
# Pair each split's texts with its labels, then wrap the frames as Datasets.
train_frame = pd.DataFrame({'text': X_train, 'label': y_train})
test_frame = pd.DataFrame({'text': X_test, 'label': y_test})

train_dataset = Dataset.from_pandas(train_frame)
test_dataset = Dataset.from_pandas(test_frame)

In [41]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Over-long sequences are truncated. No padding is added here: padding every
    example to the maximum length up front makes each batch as wide as the
    longest sequence the model accepts. Padding is instead left to batch
    collation — a Trainer that is given the tokenizer pads each batch to its
    longest member.

    Args:
        examples: Batch mapping that contains a ``text`` entry.

    Returns:
        The tokenizer output for ``examples['text']``.
    """
    return tokenizer(examples['text'], truncation=True)

# Tokenize both splits batch-wise, then rebind the dataset names to the results.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)
train_dataset, test_dataset = tokenized_train, tokenized_test

Map:   0%|          | 0/24461 [00:00<?, ? examples/s]

Map:   0%|          | 0/6116 [00:00<?, ? examples/s]

In [42]:
def add_labels(example):
    """Attach the integer class id of an example under the ``labels`` key.

    The id is looked up in ``label_mapping`` using the example's ``label``
    value; the updated example is returned.
    """
    class_id = label_mapping[example['label']]
    example['labels'] = class_id
    return example

# Add the integer `labels` column, then drop the textual `label` column it replaces.
train_dataset = train_dataset.map(add_labels).remove_columns(['label'])
test_dataset = test_dataset.map(add_labels).remove_columns(['label'])

# Expose only these columns, formatted for PyTorch.
torch_columns = ['input_ids', 'attention_mask', 'labels']
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format('torch', columns=torch_columns)

Map:   0%|          | 0/24461 [00:00<?, ? examples/s]

Map:   0%|          | 0/6116 [00:00<?, ? examples/s]

In [60]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer
import torch
from torch.utils.data import DataLoader

# Training configuration: evaluate once per epoch, log every 10 steps.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # fp16 mixed precision is rejected when no CUDA device is present, so
    # enable it only when CUDA is available (the notebook otherwise falls back
    # to the CPU).
    fp16=torch.cuda.is_available(),
)

# Assemble the Trainer from the model, its arguments, both dataset splits and
# the tokenizer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)


In [61]:
# Train starting from the weights currently held by `model`.
trainer.train()
# Alternative: continue from a saved checkpoint directory instead.
# trainer.train(resume_from_checkpoint='./results/checkpoint-final')

  0%|          | 0/9174 [00:00<?, ?it/s]

{'loss': 0.3427, 'grad_norm': 0.9962624311447144, 'learning_rate': 4.9945498146937e-05, 'epoch': 0.0}
{'loss': 0.8495, 'grad_norm': 11.76801872253418, 'learning_rate': 4.99018966644866e-05, 'epoch': 0.01}


KeyboardInterrupt: 

: 

In [None]:
# Save the model weights (only the weights) with contiguous tensors
def save_model_with_contiguous_tensors(model, output_dir):
    """Save the model's weights as ``pytorch_model.bin`` inside ``output_dir``.

    Every tensor of ``model.state_dict()`` is made contiguous and moved to the
    CPU before being written with ``torch.save``. Only the model weights are
    saved; no optimizer state is written.

    Args:
        model: Object whose ``state_dict()`` returns a name -> tensor mapping.
        output_dir: Destination directory; created (with parents) when it does
            not exist yet, so saving into a fresh checkpoint folder works.
    """
    # torch.save does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)
    contiguous_state_dict = {
        name: tensor.contiguous().to('cpu')
        for name, tensor in model.state_dict().items()
    }
    torch.save(contiguous_state_dict, os.path.join(output_dir, "pytorch_model.bin"))


In [None]:
# Write the model weights, then the trainer state, into the final checkpoint folder.
save_model_with_contiguous_tensors(model=model, output_dir='./results/checkpoint-final')
trainer.state.save_to_json('./results/checkpoint-final/trainer_state.json')


In [None]:
# Evaluate on the Trainer's evaluation split and report the metrics.
eval_results = trainer.evaluate()
print("Evaluation results: " + str(eval_results))

# Keep the raw prediction output for the metric cells that follow.
predictions = trainer.predict(test_dataset)

In [None]:
# Predicted class = index of the largest score along the last axis.
predictions_labels = predictions.predictions.argmax(-1)

# Accuracy via scikit-learn's accuracy_score (already imported at the top)
# instead of the deprecated datasets.load_metric, which also has to fetch a
# metric script. The result stays wrapped in a dict so later cells can keep
# reading accuracy["accuracy"].
accuracy = {"accuracy": float(accuracy_score(predictions.label_ids, predictions_labels))}
print(f"Accuracy: {accuracy['accuracy']}")

In [None]:
# Weighted F1 via scikit-learn's f1_score instead of the deprecated
# datasets.load_metric. The result stays wrapped in a dict so later cells can
# keep reading f1["f1"].
f1 = {
    "f1": float(
        f1_score(predictions.label_ids, predictions_labels, average='weighted')
    )
}
print(f"F1 Score: {f1['f1']}")

In [None]:
# Persist the headline metrics and the full evaluation dict, one entry per line.
report_lines = [
    f'Accuracy: {accuracy["accuracy"]}\n',
    f'F1 Score: {f1["f1"]}\n',
    f'Evaluation results: {eval_results}\n',
]
with open('results.txt', 'w') as report:
    report.writelines(report_lines)
