In [None]:
!pip install torch
!pip install transformers
!pip install datasets
!pip install tqdm

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from datasets import load_dataset
from tqdm import tqdm

# Load small subsets of the IMDB dataset.
# The IMDB splits are stored grouped by label, so a leading slice such as
# 'train[:2000]' contains a single class: the model then only has to predict
# that class and the reported accuracy is meaningless. Shuffle each split with
# a fixed seed *before* taking the first N rows, so both labels are present
# and the subsets are reproducible across runs.
SUBSET_SEED = 42
TRAIN_SUBSET_SIZE = 2000
TEST_SUBSET_SIZE = 10000

dataset = load_dataset("imdb", split={'train': 'train', 'test': 'test'}).shuffle(seed=SUBSET_SEED)
dataset['train'] = dataset['train'].select(range(TRAIN_SUBSET_SIZE))
dataset['test'] = dataset['test'].select(range(TEST_SUBSET_SIZE))

# Guard against silently training or evaluating on a single class.
for split_name, split_data in dataset.items():
    assert len(set(split_data['label'])) == 2, f"{split_name} subset does not contain both classes"

# Load the DistilBERT tokenizer (uncased base checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    """Tokenize the 'text' field of `examples` into fixed-length sequences.

    Each sequence is truncated and padded to exactly 64 tokens. The
    tokenizer's output is returned unchanged.
    """
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': 64}
    return tokenizer(examples['text'], **encoding_options)

# Tokenize the dataset in batches, using 4 worker processes
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
)

# Expose only the model inputs and the label, formatted as PyTorch tensors
tensor_columns = ['input_ids', 'attention_mask', 'label']
tokenized_datasets.set_format(type='torch', columns=tensor_columns)

# Build the data loaders; only the training loader shuffles its examples
loader_batch_size = 64
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=loader_batch_size,
)
eval_dataloader = DataLoader(
    tokenized_datasets['test'],
    batch_size=loader_batch_size,
)

# Initialise the model: pretrained DistilBERT weights plus a newly
# initialised classification head that is trained below.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Configure the optimiser.
# - torch.optim.AdamW replaces transformers.AdamW, which is deprecated upstream
#   in favour of the PyTorch implementation. Note that the PyTorch version
#   applies weight_decay=0.01 by default.
# - 5e-4 is roughly 10x above the usual fine-tuning range for BERT-style models
#   (about 2e-5 to 5e-5) and tends to make training diverge or collapse to a
#   single class, so the top of the recommended range is used instead.
LEARNING_RATE = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Loss function: cross-entropy over the classification logits
loss_function = torch.nn.CrossEntropyLoss()

# Training loop: run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}")
    for raw_batch in progress_bar:
        # Move every tensor of the batch to the target device, then split the
        # labels off so only the model inputs are passed as keyword arguments.
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        targets = inputs.pop('label')
        logits = model(**inputs).logits
        loss = loss_function(logits, targets)
        loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next step
        optimizer.zero_grad()

# Final evaluation: accuracy over every batch of the evaluation loader
model.eval()
correct = 0
total = 0
with torch.no_grad():  # inference only, so no gradients are tracked
    for raw_batch in tqdm(eval_dataloader, desc="Final Evaluation"):
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        targets = inputs.pop('label')
        # The predicted class is the index of the largest logit
        predicted_classes = model(**inputs).logits.argmax(dim=-1)
        correct += predicted_classes.eq(targets).sum().item()
        total += targets.shape[0]

final_accuracy = correct / total
print(f"Final Accuracy: {final_accuracy:.4f}")

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 32/32 [09:55<00:00, 18.61s/it]
Final Evaluation: 100%|██████████| 4/4 [00:17<00:00,  4.44s/it]

Final Accuracy: 1.0000



