<a href="https://colab.research.google.com/github/Hayeonggg/CAU-IIPL-2024-Internship/blob/main/BERT_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets torch

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Name of the pre-trained checkpoint shared by the tokenizer and the model
PRETRAINED_CHECKPOINT = 'bert-base-uncased'

# Load the IMDB dataset
dataset = load_dataset('imdb')

# Load the pre-trained BERT tokenizer and a two-label sequence classification model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=2,
)

In [None]:
# Data preprocessing
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry holding the raw review text(s);
            it is called through ``dataset.map(..., batched=True)``.

    Returns:
        The output of the module-level ``tokenizer`` called with truncation
        enabled and ``padding='max_length'``.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

# Tokenize every split in batches, then drop the raw 'text' column
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
).remove_columns(['text'])

# Have the datasets return torch tensors
tokenized_datasets.set_format('torch')


In [None]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score
from torch.nn.functional import cross_entropy

# Training and evaluation settings
training_args = TrainingArguments(
    # Where outputs go and how the run is tracked
    output_dir='./results',
    run_name='BERT-IMDB-2',
    report_to='wandb',  # report metrics to W&B
    # Optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate once per epoch, log every 50 steps
    eval_strategy='epoch',
    logging_strategy='steps',
    logging_steps=50,
)

def compute_metrics(p):
    """Compute accuracy and mean cross-entropy loss for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (the model's raw scores, presumably
            shaped (num_examples, num_labels) -- the arg-max is taken over
            axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict with keys 'accuracy' and 'loss'.
    """
    raw_scores = p.predictions
    reference = p.label_ids

    # Accuracy of the highest-scoring class against the reference labels
    predicted_classes = np.argmax(raw_scores, axis=1)
    acc = accuracy_score(reference, predicted_classes)

    # Cross-entropy of the raw scores against the reference labels, as a Python float
    ce_loss = cross_entropy(torch.tensor(raw_scores), torch.tensor(reference)).item()

    return {'accuracy': acc, 'loss': ce_loss}

# Build the Trainer: train on the 'train' split, evaluate on the 'test' split
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)


In [None]:
import wandb
# Authenticate with Weights & Biases without embedding a secret in the notebook.
# Called without a key, wandb.login() uses the WANDB_API_KEY environment variable
# or a previously stored credential, and otherwise prompts for the key.
# NOTE: the API key that was previously hard-coded on this line was published with
# the notebook and should be revoked in the W&B account settings.
wandb.login()

# Start the run that the Trainer (report_to='wandb') logs into
wandb.init(
    project='huggingface',
    entity='bluebarry37',
    name='BERT-IMDB-2',
    config={
        "learning_rate": 2e-5,
        "batch_size": 8,
        "epochs": 3,
    },
)


In [None]:
# Fine-tune the model with the Trainer configured above.
# NOTE(review): a later cell builds a new Trainer around the same `model` object and
# calls train() again, so running both cells trains the model twice -- confirm this
# is intended.
trainer.train()

In [None]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score
from torch.nn.functional import cross_entropy
from transformers import Trainer, TrainingArguments

# Training and evaluation settings
training_args = TrainingArguments(
    # Where outputs go and how the run is tracked
    output_dir='./results',
    run_name='BERT-IMDB-2',
    report_to='wandb',  # report metrics to W&B
    # Optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation and logging cadence
    eval_strategy='epoch',  # evaluate at the end of every epoch
    logging_strategy='steps',
    logging_steps=50,  # log every 50 steps
)

def compute_metrics(p):
    """Compute accuracy and mean cross-entropy loss for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (the model's raw scores, presumably
            shaped (num_examples, num_labels) -- the arg-max is taken over
            axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict with keys 'accuracy' and 'loss'.
    """
    preds = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(p.label_ids, preds)

    logits = torch.tensor(p.predictions)
    labels = torch.tensor(p.label_ids)
    loss = cross_entropy(logits, labels).item()

    # Return un-prefixed metric names. Trainer.evaluate() adds its own "eval_" prefix,
    # so these surface as 'eval_accuracy' / 'eval_loss' -- the keys the rest of this
    # notebook reads. The former 'val_' names were turned into 'eval_val_accuracy' /
    # 'eval_val_loss', which made the eval_results['eval_accuracy'] lookups fail.
    return {'accuracy': accuracy, 'loss': loss}

# Trainer setup: train on the 'train' split, evaluate on the 'test' split
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Run training and keep the summary object it returns
train_results = trainer.train()

# Record the training loss reported by the run in W&B
train_loss = train_results.training_loss
wandb.log(dict(train_loss=train_loss))

# Run evaluation on the Trainer's evaluation dataset
eval_results = trainer.evaluate()

# Record the evaluation metrics in W&B under validation names
val_metrics = {
    "val_accuracy": eval_results['eval_accuracy'],
    "val_loss": eval_results['eval_loss'],
}
wandb.log(val_metrics)


In [None]:
# Evaluate on both the train and test splits to compare loss and accuracy.
# Note: this rebinds `train_results`, which previously held the output of
# trainer.train().
train_results, test_results = (
    trainer.evaluate(eval_dataset=tokenized_datasets[split_name])
    for split_name in ('train', 'test')
)

In [None]:
# Report accuracy and loss for the train and test evaluations, one metric per line
report_lines = [
    f"Train Accuracy: {train_results['eval_accuracy']}",
    f"Train Loss: {train_results['eval_loss']}",
    f"Test Accuracy: {test_results['eval_accuracy']}",
    f"Test Loss: {test_results['eval_loss']}",
]
print("\n".join(report_lines))

In [None]:
# Prediction
def sentiment_predict(new_sentence):
    """Print the sentiment the fine-tuned BERT classifier predicts for a review.

    Uses the module-level ``tokenizer`` and ``model`` defined earlier in this
    notebook. The review is tokenized (truncated to the model's maximum input
    length), scored by the classifier, and the softmax probability of class 1
    is treated as the "positive" probability (the IMDB dataset's labels are
    0 = negative, 1 = positive).

    Args:
        new_sentence: Raw review text. No manual cleaning is needed; the
            'bert-base-uncased' tokenizer lower-cases and splits the text itself.

    Returns:
        None. The verdict and its confidence are printed.
    """
    # Tokenize and move the tensors to whichever device the model lives on
    # (the model may sit on a GPU after training).
    encoded = tokenizer(new_sentence, truncation=True, return_tensors='pt')
    encoded = encoded.to(model.device)

    # Inference only: disable dropout and gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    # Probability of the positive class for the single input sentence
    score = torch.softmax(logits, dim=-1)[0, 1].item()

    if score > 0.5:
        print("{:.2f}% 확률로 긍정 리뷰입니다.".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.".format((1 - score) * 100))


In [None]:
# Review expected to be classified as negative
test_input = "This movie was just way too overrated. The fighting was not professional and in slow motion. I was expecting more from a 200 million budget movie. The little sister of T.Challa was just trying too hard to be funny. The story was really dumb as well. Don't watch this movie if you are going because others say its great unless you are a Black Panther fan or Marvels fan."

sentiment_predict(test_input)


In [None]:
# Review expected to be classified as positive
test_input = " I was lucky enough to be included in the group to see the advanced screening in Melbourne on the 15th of April, 2012. And, firstly, I need to say a big thank-you to Disney and Marvel Studios. \
Now, the film... how can I even begin to explain how I feel about this film? It is, as the title of this review says a 'comic book triumph'. I went into the film with very, very high expectations and I was not disappointed. \
Seeing Joss Whedon's direction and envisioning of the film come to life on the big screen is perfect. The script is amazingly detailed and laced with sharp wit a humor. The special effects are literally mind-blowing and the action scenes are both hard-hitting and beautifully choreographed."

sentiment_predict(test_input)
