<a href="https://colab.research.google.com/github/JangJoonHa/Project_2/blob/main/FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import torch.nn.functional as F  # 확률 계산을 위한 임포트

# 1. Device selection: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2. Load the IMDB dataset and draw small, reproducible subsets.
# The subset sizes are deliberately tiny so the notebook runs quickly;
# raise them for a more meaningful accuracy figure.
SHUFFLE_SEED = 42
NUM_TRAIN_SAMPLES = 100
NUM_TEST_SAMPLES = 50

print("Loading dataset...")
dataset = load_dataset("imdb")
# Shuffle with a fixed seed before selecting so the subset is the same on every run.
train_dataset = dataset['train'].shuffle(seed=SHUFFLE_SEED).select(range(NUM_TRAIN_SAMPLES))  # 100 training examples

test_dataset = dataset['test'].shuffle(seed=SHUFFLE_SEED).select(range(NUM_TEST_SAMPLES))   # 50 test examples

# 3. Initialise the tokenizer and the two-class DistilBERT classifier.
print("Initializing model and tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)
# Move the freshly loaded model onto the selected device.
model = model.to(device)

# 4. Preprocessing function and dataset conversion
def preprocess_function(examples):
    """Tokenize the ``'text'`` field of ``examples``.

    Every sequence is truncated or padded to exactly 128 tokens so that the
    resulting tensors can be stacked into fixed-size batches.

    Args:
        examples: Mapping with a ``'text'`` entry, passed straight to the
            module-level ``tokenizer``.

    Returns:
        The tokenizer's output for the given text.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    return encoded

# Tokenize each split in batches, rename "label" to "labels" (the keyword the
# model's forward pass is called with), and expose the columns as torch tensors.
train_dataset = (
    train_dataset
    .map(preprocess_function, batched=True)
    .rename_column("label", "labels")
    .with_format("torch")
)

test_dataset = (
    test_dataset
    .map(preprocess_function, batched=True)
    .rename_column("label", "labels")
    .with_format("torch")
)

# 5. DataLoader setup
def collate_fn(batch):
    """Stack per-example tensors into a single batch dict.

    Only the ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` entries of
    each example are kept; any other entries are dropped.

    Args:
        batch: Sequence of per-example dicts whose values for the three kept
            keys are tensors of matching shape.

    Returns:
        dict: The three keys above, each mapped to the stacked tensor with a
        new leading batch dimension.
    """
    kept_fields = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.stack([example[field] for example in batch])
        for field in kept_fields
    }

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn, num_workers=2)

# 6. Optimizer (no learning-rate scheduler is used; the rate stays at 5e-5).
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values the transformers implementation used
# by default so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# 7. Training loop
def train_model(model, train_loader, optimizer, num_epochs):
    """Train ``model`` on ``train_loader`` and report the mean loss per epoch.

    Args:
        model: Module whose forward pass accepts each batch dict as keyword
            arguments and returns an object exposing ``.loss``.
        train_loader: Sized iterable of batches, each a dict of tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        list[float]: Mean training loss of each epoch, in order. An epoch
        over an empty loader is reported as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    # Send batches to wherever the model actually lives instead of relying on
    # a module-level device variable.
    target_device = next(model.parameters()).device
    num_batches = len(train_loader)
    epoch_losses = []

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in train_loader:
            batch = {key: val.to(target_device) for key, val in batch.items()}

            # Clear gradients first so nothing left over from earlier work
            # leaks into this update.
            optimizer.zero_grad()

            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        mean_loss = total_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss:.4f}")

    return epoch_losses

# 8. Evaluation loop
def evaluate_model(model, test_loader):
    """Compute and print the classification accuracy of ``model``.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: Module whose forward pass accepts each batch dict as keyword
            arguments and returns an object exposing ``.logits``.
        test_loader: Iterable of batches, each a dict of tensors that
            includes a ``'labels'`` entry.

    Returns:
        The accuracy computed by ``accuracy_score`` over all batches, as a
        fraction (it is multiplied by 100 only for the printed message).
    """
    # Send batches to wherever the model actually lives instead of relying on
    # a module-level device variable.
    target_device = next(model.parameters()).device

    model.eval()
    predictions, labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            batch = {key: val.to(target_device) for key, val in batch.items()}
            outputs = model(**batch)
            # The predicted class is the index of the largest logit.
            preds = torch.argmax(outputs.logits, dim=-1)

            predictions.extend(preds.cpu().tolist())
            labels.extend(batch['labels'].cpu().tolist())

    accuracy = accuracy_score(labels, predictions)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# 9. Train for three epochs, then measure accuracy on the test subset.
print("Training model...")
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    num_epochs=3,
)
print("Evaluating model...")
evaluate_model(model=model, test_loader=test_loader)

# 10. Prediction helper
def predict_sentiment(text):
    """Classify ``text`` as positive or negative with the module-level model.

    Args:
        text: Input passed to the module-level ``tokenizer``; it is truncated
            or padded to 128 tokens.

    Returns:
        tuple: ``(sentiment, probs)`` where ``sentiment`` is ``"positive"``
        when the predicted class index is 1 and ``"negative"`` otherwise, and
        ``probs`` is a NumPy array holding the softmax of the model's logits.
    """
    # Make sure dropout is disabled even when this is called before
    # evaluate_model() or after further training; otherwise the returned
    # probabilities would vary from call to call.
    model.eval()

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        prediction = torch.argmax(probs, dim=-1).item()

    sentiment = "positive" if prediction == 1 else "negative"
    return sentiment, probs.cpu().numpy()

# 11. Try the classifier on one clearly positive and one clearly negative review.
example_text_positive = "The movie was an incredible experience, with a captivating storyline and beautiful performances by the cast."
example_text_negative = "I couldn't stand the movie, it was slow, boring, and lacked any real character development."

print("\nPredicting example sentences...")
for text in (example_text_positive, example_text_negative):
    sentiment, probs = predict_sentiment(text)
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}, Probabilities: {probs}\n")


Loading dataset...
Initializing model and tokenizer...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Training model...




Epoch 1/3, Loss: 0.6920
Epoch 2/3, Loss: 0.6111
Epoch 3/3, Loss: 0.4422
Evaluating model...
Test Accuracy: 62.00%

Predicting example sentences...
Text: The movie was an incredible experience, with a captivating storyline and beautiful performances by the cast.
Predicted Sentiment: positive, Probabilities: [[0.17231593 0.8276841 ]]

Text: I couldn't stand the movie, it was slow, boring, and lacked any real character development.
Predicted Sentiment: negative, Probabilities: [[0.6052498  0.39475012]]

