In [1]:
%%capture
!pip install transformers datasets evaluate accelerate
!pip install -U bitsandbytes

# Loading the Dataset

In [2]:
from datasets import Dataset, concatenate_datasets
import pandas as pd
import os

def load_data_from_dir(dir_path, label):
    data = []
    for filename in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, filename)
        
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='ISO-8859-9') as f:
                text = f.read().strip()
                
        data.append({'sentence': text, 'label': label})
    return data

neg_dir   = "/kaggle/input/neg-pos-financial-sentiment-analysis/sorted_news/neg"
pos_dir   = "/kaggle/input/neg-pos-financial-sentiment-analysis/sorted_news/pos"
neg_data  = load_data_from_dir(neg_dir, 0)
pos_data  = load_data_from_dir(pos_dir, 1)
all_data  = neg_data + pos_data

seed = 35
full_dataset = Dataset.from_pandas(pd.DataFrame(all_data))
full_dataset = full_dataset.shuffle(seed = seed)

print(full_dataset[2:5])
# for i in full_dataset:
#    print(i)

{'sentence': ['Cuma günü tarihi zirvesini 4.552,44 seviyesine taşıyan BIST-100 Endeksi kapanışa doğru etkili olan kâr satışlarıyla gün içi kazançlarını geri verdi. Yaşanan güçlü yükselişlerin ardından son günlerde endekste ve ana hisselerde gözlenen yorulma emareleri ve teknik indikatörlerdeki negatif uyuşmazlıklar olası bir düzeltme ihtimalini artırıyor. Endekste kısa vadede 4.400 seviyesi kısa vadeli destek olarak izlenecek olup, bu seviye altında 4.333 \x96 4.297 \x96 4.234 - 4.150 ve 4.100 seviyeleri destek olarak takip edilebilir. Endekste kısa vadede 4.400 üzerinde kalıcılığın korunması yükselişlerin devamlılığı açısından önem taşımaktadır. Endekste 4.400 üzerindeki tutunmanın korunması ve 4.500 üzerinde kapanışların yaşanması durumunda ise 4.552 \x96 4.575 ve 4.600 seviyeleri direnç konumunda bulunmaktadır.', 'Haftaya global eğilime paralel sınırlı zayıf eğilimle başlangıç beklediğimiz  endekste ilk aşamada 3500-3490 destek, 3610-3630 direnç bölgelerinin son 6 işlem gününde  olu

In [3]:
train_test_split = full_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['sentence', 'label'],
    num_rows: 1364
})
Dataset({
    features: ['sentence', 'label'],
    num_rows: 342
})


# Loading Tokenizer

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')

tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]



# Tokenize Dataset

In [5]:
def tokenize_function(example):
    return tokenizer(example['sentence'], padding='max_length', truncation=True, max_length=256)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1364 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

In [6]:
print(tokenized_train_dataset[2]['input_ids'])
print(tokenized_test_dataset[23]['attention_mask'])

[2, 7111, 5064, 10643, 3077, 12514, 16777, 2983, 24337, 11427, 5235, 3626, 9623, 7443, 12350, 2353, 3498, 2107, 9373, 22565, 9168, 2959, 12620, 16, 2497, 3077, 4566, 21033, 23782, 2046, 2054, 1996, 4687, 12806, 1991, 7079, 11603, 18, 7111, 2579, 2342, 23990, 6717, 3282, 2665, 4380, 6095, 25938, 1006, 3617, 3508, 6717, 6036, 4395, 2903, 7347, 18, 2999, 11844, 9386, 2054, 5399, 12806, 1991, 1996, 7079, 3869, 18, 4638, 8400, 28841, 16777, 12605, 19400, 3283, 7761, 16, 7200, 1992, 18592, 9362, 28841, 2171, 11427, 1996, 7079, 16413, 18, 2123, 4581, 10380, 7150, 23782, 2185, 2054, 23311, 6036, 6851, 3180, 10489, 18, 21774, 5033, 2072, 2012, 5399, 3705, 12441, 6851, 4674, 10156, 2040, 12291, 1973, 26345, 3926, 10457, 18, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation Metrics

In [13]:
import evaluate
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)

    return {
        'accuracy': accuracy_score['accuracy'],
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


In [26]:
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    'dbmdz/bert-base-turkish-cased',
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1}
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluate Before

In [27]:
trainer = Trainer(
    model=model,
    tokenizer=new_tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test_dataset
)

results = trainer.evaluate()
print(results)

{'eval_loss': 0.7298983931541443, 'eval_model_preparation_time': 0.0068, 'eval_accuracy': 0.4269005847953216, 'eval_precision': 0.7563067002795355, 'eval_recall': 0.4269005847953216, 'eval_f1': 0.2588085752478215, 'eval_runtime': 141.3316, 'eval_samples_per_second': 2.42, 'eval_steps_per_second': 0.304}


# Training

In [19]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="Finansal_Haber_Classification",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)


In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [22]:
for param in model.parameters():
    param.data = param.data.contiguous()

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6945,0.463401,0.868421,0.87303,0.868421,0.869053
2,0.517,0.456946,0.903509,0.903897,0.903509,0.903633


TrainOutput(global_step=1364, training_loss=0.5468247153542258, metrics={'train_runtime': 4763.9148, 'train_samples_per_second': 0.573, 'train_steps_per_second': 0.286, 'total_flos': 358883479511040.0, 'train_loss': 0.5468247153542258, 'epoch': 2.0})

# New Model Evaluation

In [23]:
new_tokenizer = AutoTokenizer.from_pretrained('/kaggle/working/Finansal_Haber_Classification/checkpoint-1364')

new_model = AutoModelForSequenceClassification.from_pretrained(
    '/kaggle/working/Finansal_Haber_Classification/checkpoint-1364',
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1}
)

In [25]:
accuracy = evaluate.load("accuracy")

trainer = Trainer(
    model=new_model,
    tokenizer=new_tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test_dataset
)

# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.4569464921951294, 'eval_model_preparation_time': 0.0061, 'eval_accuracy': 0.9035087719298246, 'eval_precision': 0.9038973862630943, 'eval_recall': 0.9035087719298246, 'eval_f1': 0.9036325833744124, 'eval_runtime': 141.0362, 'eval_samples_per_second': 2.425, 'eval_steps_per_second': 0.305}
