<a href="https://colab.research.google.com/github/Kwonjihan/ML-teamproject/blob/develop/SeongyeomByeon/Transformer_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel, BertConfig
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes review texts on access.

    Indexing returns a dict with the keys ``'input_ids'``,
    ``'attention_mask'`` and ``'labels'``.
    """

    def __init__(self, texts, labels, tokenizer, max_length = 512):
        # The tokenizer is kept so that each text is encoded on demand
        # in __getitem__ rather than up front.
        self.tokenizer = tokenizer
        # Every encoded sequence is padded or truncated to this length.
        self.max_length = max_length
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the text at ``idx`` and return it together with its label."""
        text, label = self.texts[idx], self.labels[idx]

        # Tokenize and encode in a single call.
        encoding = self.tokenizer.encode_plus(
            text,
            None,  # no second sequence
            add_special_tokens=True,  # e.g. [CLS] / [SEP] around the text
            max_length=self.max_length,
            padding='max_length',  # pad sequences shorter than max_length
            truncation=True,  # cut sequences longer than max_length
            return_attention_mask=True,  # mask lets the model ignore padding
            return_tensors='pt',  # return PyTorch tensors
        )

        # Flatten each encoded tensor to 1-D before handing it out.
        sample = {
            key: encoding[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

In [14]:
# Hyperparameter values to try for the BERT classifier.
param_grid = dict(
    # Number of attention heads per transformer layer.
    num_attention_heads=[4, 8, 12],
    # Dropout probability applied after the attention computation.
    attention_probs_dropout_prob=[0.1, 0.2, 0.3],
)

In [16]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, len_reviews):
    """Train ``model`` for one pass over ``data_loader`` and return its accuracy.

    Args:
        model: Module called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        loss_fn: Unused; kept so existing callers keep working. The loss is
            read from ``outputs.loss``.
        optimizer: Optimizer that is stepped once per batch.
        device: Device the batch tensors are moved to.
        len_reviews: Denominator of the accuracy (total number of samples).

    Returns:
        A 0-dim float64 tensor: number of correct predictions divided by
        ``len_reviews``. Predictions come from the training forward passes,
        so each batch is scored before the update computed from it.
    """
    model = model.train()

    # Accumulate in a tensor from the start so that an empty loader yields
    # 0 instead of failing on ``int.double()``.
    correct_predictions = torch.zeros((), dtype=torch.float64, device=device)

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss  # loss of the current batch

        # The predicted label is the index of the largest logit.
        preds = torch.argmax(outputs.logits, dim=1)
        correct_predictions += (preds == labels).sum()

        optimizer.zero_grad()  # clear gradients left from the previous batch
        loss.backward()  # backpropagate
        optimizer.step()  # update the parameters

    return correct_predictions / len_reviews  # correct / total samples

# Seed shared by the train/validation split and the per-configuration torch RNG.
SEED = 42

# Load the data
data = pd.read_csv('6000_IMDB_Dataset.csv')
texts = data['review'].tolist()
# Map the 'positive' sentiment string to 1 and everything else to 0, then convert to a list.
labels = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).tolist()

# Split the data (10% held out for validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=SEED)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Datasets and data loaders
train_dataset = ReviewDataset(train_texts, train_labels, tokenizer)
val_dataset = ReviewDataset(val_texts, val_labels, tokenizer)
train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Training setup
# Use the GPU when one is available, otherwise fall back to the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_fn = torch.nn.CrossEntropyLoss().to(device)  # passed to train_epoch

# Run training for every combination in param_grid.
# NOTE(review): the accuracy printed below is the training accuracy returned by
# train_epoch; val_data_loader is not used in this cell.
models = []
for num_heads in param_grid['num_attention_heads']:
    for dropout_prob in param_grid['attention_probs_dropout_prob']:
        # Reset the torch RNG so every configuration starts from the same
        # random state (classifier init, shuffle order) and the comparison
        # is less noisy.
        torch.manual_seed(SEED)

        # Build each model only when it is about to be trained, so the host
        # never has to hold all untrained models at the same time.
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=2,
            num_attention_heads=num_heads,
            attention_probs_dropout_prob=dropout_prob,
        )
        model = model.to(device)
        optimizer = AdamW(model.parameters(), lr=2e-5)
        train_accuracy = train_epoch(model, train_data_loader, loss_fn, optimizer, device, len(train_dataset))
        print(f"하이퍼파라미터 : \n- num_attention_heads : {model.config.num_attention_heads}\n- attention_probs_dropout_prob : {model.config.attention_probs_dropout_prob}")
        print(f"정확도: {train_accuracy}\n")

        # Drop the gradients of the finished model; only its weights are kept.
        optimizer.zero_grad(set_to_none=True)
        # Trained models stay on `device`, collected in the same order as before.
        models.append(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are

하이퍼파라미터 : 
- num_attention_heads : 4
- attention_probs_dropout_prob : 0.1
정확도: 0.8183333333333332

하이퍼파라미터 : 
- num_attention_heads : 4
- attention_probs_dropout_prob : 0.2
정확도: 0.5403703703703704

하이퍼파라미터 : 
- num_attention_heads : 4
- attention_probs_dropout_prob : 0.3
정확도: 0.7775925925925925

하이퍼파라미터 : 
- num_attention_heads : 8
- attention_probs_dropout_prob : 0.1
정확도: 0.8222222222222222

하이퍼파라미터 : 
- num_attention_heads : 8
- attention_probs_dropout_prob : 0.2
정확도: 0.835

하이퍼파라미터 : 
- num_attention_heads : 8
- attention_probs_dropout_prob : 0.3
정확도: 0.8288888888888889

하이퍼파라미터 : 
- num_attention_heads : 12
- attention_probs_dropout_prob : 0.1
정확도: 0.8662962962962962

하이퍼파라미터 : 
- num_attention_heads : 12
- attention_probs_dropout_prob : 0.2
정확도: 0.8581481481481481

하이퍼파라미터 : 
- num_attention_heads : 12
- attention_probs_dropout_prob : 0.3
정확도: 0.8533333333333333

