In [4]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [5]:

data_path = r"C:\Users\Melek\yapayZeka\karakterAnaliziProje\İşlenmişVeriler\tum_filmler_etiketli.csv"
df = pd.read_csv(data_path)

In [6]:

label_map = {label:idx for idx, label in enumerate(df['Etiket'].unique())}
df['label'] = df['Etiket'].map(label_map)

In [7]:

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

In [8]:

from torch.utils.data import Dataset  # Bu satırı ekleyin
import torch

class FilmDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Hyperparameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Türkçe için alternatif model
model_name = "ytu-ce-cosmos/turkish-base-bert-uncased"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ytu-ce-cosmos/turkish-base-bert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [11]:

from torch.utils.data import DataLoader  
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = FilmDataset(
        texts=df['Processed_Sentence'].values,
        labels=df['label'].values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size)

train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

In [12]:
from torch.optim import Adam
optimizer = Adam(model.parameters(), lr=2e-5)

In [13]:

def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    losses = []
    
    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        losses.append(loss.item())
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    return np.mean(losses)

In [14]:

def eval_model(model, data_loader, device):
    model = model.eval()
    predictions = []
    actual_labels = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            _, preds = torch.max(outputs.logits, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())
    
    return classification_report(actual_labels, predictions, target_names=label_map.keys()), accuracy_score(actual_labels, predictions)

In [15]:
# Eğitim döngüsü
from tqdm import tqdm
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_loss = train_epoch(model, train_data_loader, optimizer, device)
    print(f'Train loss: {train_loss}')
    
    report, acc = eval_model(model, test_data_loader+, device)
    print(f'Test Accuracy: {acc:.4f}')
    print(report)

Epoch 1/3


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Training: 100%|██████████| 3390/3390 [42:32<00:00,  1.33it/s]


Train loss: 2.733259539829243


Evaluating: 100%|██████████| 848/848 [04:20<00:00,  3.26it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Accuracy: 0.2184
              precision    recall  f1-score   support

   distoptik       0.23      0.01      0.03       659
       müzik       0.42      0.18      0.25       708
   animasyon       0.80      0.31      0.45       843
       savaş       0.12      0.03      0.04       613
     gerilim       0.13      0.13      0.13       883
         suç       0.14      0.38      0.21      1086
 bilim kurgu       0.19      0.50      0.28      1163
     aksiyon       0.00      0.00      0.00       542
    romantik       0.29      0.16      0.20      1060
        dram       0.00      0.00      0.00       814
        spor       0.71      0.43      0.53       894
       tarih       0.20      0.28      0.23       962
    polisiye       0.14      0.06      0.08       834
   fantastik       0.22      0.24      0.23       842
      komedi       0.17      0.37      0.23      1044
       korku       0.00      0.00      0.00       612

    accuracy                           0.22     13559
   m

Training: 100%|██████████| 3390/3390 [41:31<00:00,  1.36it/s]


Train loss: 2.3419695844340818


Evaluating: 100%|██████████| 848/848 [04:21<00:00,  3.25it/s]


Test Accuracy: 0.3087
              precision    recall  f1-score   support

   distoptik       0.31      0.14      0.19       659
       müzik       0.46      0.25      0.32       708
   animasyon       0.57      0.45      0.50       843
       savaş       0.38      0.17      0.24       613
     gerilim       0.27      0.24      0.26       883
         suç       0.18      0.47      0.26      1086
 bilim kurgu       0.26      0.53      0.35      1163
     aksiyon       0.41      0.06      0.10       542
    romantik       0.38      0.31      0.34      1060
        dram       0.33      0.12      0.18       814
        spor       0.63      0.48      0.54       894
       tarih       0.39      0.29      0.33       962
    polisiye       0.22      0.18      0.20       834
   fantastik       0.33      0.32      0.33       842
      komedi       0.27      0.39      0.32      1044
       korku       0.45      0.13      0.20       612

    accuracy                           0.31     13559
   m

Training: 100%|██████████| 3390/3390 [41:23<00:00,  1.36it/s]


Train loss: 2.4058269057653647


Evaluating: 100%|██████████| 848/848 [04:24<00:00,  3.21it/s]

Test Accuracy: 0.0858
              precision    recall  f1-score   support

   distoptik       0.00      0.00      0.00       659
       müzik       0.00      0.00      0.00       708
   animasyon       0.00      0.00      0.00       843
       savaş       0.00      0.00      0.00       613
     gerilim       0.00      0.00      0.00       883
         suç       0.00      0.00      0.00      1086
 bilim kurgu       0.09      1.00      0.16      1163
     aksiyon       0.00      0.00      0.00       542
    romantik       0.00      0.00      0.00      1060
        dram       0.00      0.00      0.00       814
        spor       0.00      0.00      0.00       894
       tarih       0.00      0.00      0.00       962
    polisiye       0.00      0.00      0.00       834
   fantastik       0.00      0.00      0.00       842
      komedi       0.00      0.00      0.00      1044
       korku       0.00      0.00      0.00       612

    accuracy                           0.09     13559
   m


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
