In [4]:
import pandas as pd
df = pd.read_csv('e-ticaret_urun_yorumlari.csv',sep=';')
df.head()

Unnamed: 0,Metin,Durum
0,evet anlatıldığı gibi,1
1,Daha öncede almıştım bu cihazdan ense ve sakal...,1
2,Ürün gayet başarılı sakal kesmede başlık sayıs...,1
3,Daha öncede aynısını almıştım çok güzel ve kal...,1
4,Erkek kuaförüyüm ense ve sıfır sakal traşı içi...,1


In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [6]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Metin"], df["Durum"], test_size=0.2, random_state=42
)

In [7]:
# Tokenizer'ı yükle
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')

# Metinleri token'lara çevirme fonksiyonu
def tokenize_function(texts):
    return tokenizer(
        texts.tolist(),
        padding=True,           # Tüm dizileri aynı uzunluğa getirir
        truncation=True,        # Maksimum uzunluğu aşanları keser
        max_length=128,         # Maksimum token sayısı (ayarlanabilir)
        return_tensors="pt"     # PyTorch tensorları döndürür
    )

# Eğitim ve test verilerini token'ize et
train_encodings = tokenize_function(np.array(train_texts))
test_encodings = tokenize_function(np.array(test_texts))

# Etiketleri tensor'a çevir
train_labels = torch.tensor(train_labels.values)
test_labels = torch.tensor(test_labels.values)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [8]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = SentimentDataset(train_encodings, train_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [9]:
model = BertForSequenceClassification.from_pretrained(
    'dbmdz/bert-base-turkish-cased',
    num_labels=3  # Olumsuz, olumlu, nötr için 3 sınıf
)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Optimizer'ı tanımla
optimizer = AdamW(model.parameters(), lr=2e-5)  # Önerilen öğrenme oranı

# Eğitim hiperparametreleri
epochs = 4



In [11]:
# Eğitim döngüsü
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        # Batch'i GPU'ya taşı
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass ve optimizasyon
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}")

Epoch 1/4, Average Loss: 0.3209
Epoch 2/4, Average Loss: 0.1773
Epoch 3/4, Average Loss: 0.1248
Epoch 4/4, Average Loss: 0.0893


In [12]:
# Değerlendirme döngüsü
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

# Doğruluğu hesapla
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9173


In [13]:
# Modeli ve tokenizer'ı kaydet
model.save_pretrained("bert_turkish_sentiment_model")
tokenizer.save_pretrained("bert_turkish_sentiment_model")

('bert_turkish_sentiment_model/tokenizer_config.json',
 'bert_turkish_sentiment_model/special_tokens_map.json',
 'bert_turkish_sentiment_model/vocab.txt',
 'bert_turkish_sentiment_model/added_tokens.json')

In [15]:
# Modeli ve tokenizer'ı yükle
model = BertForSequenceClassification.from_pretrained("bert_turkish_sentiment_model")
tokenizer = BertTokenizer.from_pretrained("bert_turkish_sentiment_model")
model.to(device)
model.eval()

# Tahmin fonksiyonu
def predict_sentiment(text):
    encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).item()
    return prediction  # 0: olumsuz, 1: olumlu, 2: nötr

# Örnek tahmin
text = "sevmedim diyemem"
print(predict_sentiment(text))  # Çıktı: 1 (olumlu)

0
