In [None]:
!pip install pandas
!pip install transformers
!pip install torch
!pip install torchvision

from google.colab import drive
drive.mount('/content/drive')

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
import os
import torch.optim as optim

# Data paths: one plain-text article per file, one folder per class.
fake_news_dir = '/content/drive/MyDrive/collab1/SON_DATASET/random10_90(960-8756)/90-train/fake'
real_news_dir = '/content/drive/MyDrive/collab1/SON_DATASET/random10_90(960-8756)/90-train/real'

# Fixed seed so the freshly initialised classifier head and the DataLoader
# shuffling are repeatable across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)


def read_labeled_texts(directory, label):
    """Read every regular file in ``directory`` as UTF-8 text.

    Args:
        directory: Folder containing one document per file.
        label: Integer class label assigned to every document that is read.

    Returns:
        A ``(documents, labels)`` pair of equal-length lists.
    """
    documents = []
    # Sorted so the document order does not depend on filesystem listing order.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints); open() cannot read them.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            documents.append(f.read())
    return documents, [label] * len(documents)


# Read and label the data: 0 = fake news, 1 = real news.
fake_texts, fake_labels = read_labeled_texts(fake_news_dir, 0)
real_texts, real_labels = read_labeled_texts(real_news_dir, 1)
texts = fake_texts + real_texts
labels = fake_labels + real_labels

if not texts:
    # Fail early with a clear message instead of an opaque tokenizer error or
    # a division by zero when averaging the loss.
    raise ValueError(f'No training files found in {fake_news_dir} or {real_news_dir}')

# Load the DistilBERT tokenizer and model. Truncation/padding are tokenizer
# *call* options, so they are passed where the texts are encoded below.
tokenizer = DistilBertTokenizer.from_pretrained('dbmdz/distilbert-base-turkish-cased')
model = DistilBertForSequenceClassification.from_pretrained('dbmdz/distilbert-base-turkish-cased', num_labels=2)

# Convert the texts to the DistilBERT input format.
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
labels_tensor = torch.tensor(labels)

# Wrap the encoded inputs and labels in a TensorDataset.
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels_tensor)

# Feed the dataset through a shuffling DataLoader.
batch_size = 8
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Train the model.
print("model eğitim")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode before fine-tuning.
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        # Use batch-local names so the module-level `labels` list and the
        # tokenized `inputs` are not overwritten by per-batch tensors.
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)

        # Passing `labels` makes the model compute the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    average_loss = total_loss / len(dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}')

# Save the fine-tuned model, and the tokenizer next to it so the directory can
# be reloaded on its own with from_pretrained().
model_save_path = '/content/drive/MyDrive/collab1/bertdataset_teslim4'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
# Output recorded from an earlier run of this cell (before model.train() and
# the fixed seed were added), kept for reference:
#   model eğitim
#   Epoch 1/3, Average Loss: 0.10400852958361308
#   Epoch 2/3, Average Loss: 0.014659038514481458
#   Epoch 3/3, Average Loss: 0.010205479700011457
print("Eğitim tamamlandı.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/410 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/273M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/distilbert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model eğitim
Epoch 1/3, Average Loss: 0.10400852958361308
Epoch 2/3, Average Loss: 0.014659038514481458
Epoch 3/3, Average Loss: 0.010205479700011457
Eğitim tamamlandı.
