In [None]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertTokenizer, BertModel
from transformers.optimization import get_cosine_schedule_with_warmup
from tqdm.notebook import tqdm
from kobert_tokenizer import KoBERTTokenizer
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

In [2]:
##GPU 사용 시
device = torch.device("cuda:0")

In [3]:
def evaluate_model(model, tokenizer, dataloader, device):
    """
    Evaluate the model using Precision, Recall, and F1-Score.

    Args:
        model: Trained PyTorch model.
        tokenizer: Tokenizer for preprocessing.
        dataloader: DataLoader for evaluation data.
        device: Device to run the model (GPU or CPU).

    Returns:
        None. Prints evaluation metrics.
    """
    model.eval()
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            # Assuming batch[3] contains labels
            texts, labels = batch[0], batch[3]

            # Ensure texts is a List[str]
            if isinstance(texts, torch.Tensor):
                texts = [str(t) for t in texts]

            # Tokenize input text
            inputs = tokenizer(
                texts,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=128
            )

            # Move inputs and labels to the specified device
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)

            # Get predictions
            outputs = model(**inputs)
            preds = torch.argmax(outputs, dim=1)

            # Collect true and predicted labels
            true_labels.extend(labels.cpu().numpy())
            pred_labels.extend(preds.cpu().numpy())

        # Convert multi-dimensional true_labels to single dimension if needed
        if len(true_labels) > 0 and isinstance(true_labels[0], (list, np.ndarray)):
            true_labels = [label.argmax() for label in true_labels]

    # Debugging outputs
    print(f"True Labels (example): {true_labels[:10]}")
    print(f"Predicted Labels (example): {pred_labels[:10]}")
    print(f"Unique classes in true_labels: {set(true_labels)}")
    print(f"Unique classes in pred_labels: {set(pred_labels)}")

    unique_classes = set(true_labels + pred_labels)

    if len(unique_classes) == 1:
        print(f"Warning: Only one class ({list(unique_classes)[0]}) detected.")
        print("Cannot generate complete classification report.")
    else:
        print("Classification Report:")
        print(classification_report(true_labels, pred_labels, target_names=['ham', 'spam'], labels=[0, 1]))
        print(f"Precision: {precision_score(true_labels, pred_labels, average='binary'):.4f}")
        print(f"Recall: {recall_score(true_labels, pred_labels, average='binary'):.4f}")
        print(f"F1-Score: {f1_score(true_labels, pred_labels, average='binary'):.4f}")

In [4]:
# 파일 로드
train_df = pd.read_csv('/home/maroco/dataset/model/train_set.csv')
test_df = pd.read_csv('/home/maroco/dataset/model/test_set.csv')

In [5]:
# 레이블 변환 (예: 'spam' → 1, 'ham' → 0)
label_map = {'spam': 1, 'ham': 0}
train_df['label'] = train_df['label'].map(label_map)
test_df['label'] = test_df['label'].map(label_map)

In [6]:
# 데이터셋 정의
class KoBERTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenization
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_len
        )

        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        token_type_ids = inputs['token_type_ids'].squeeze(0)

        return input_ids, attention_mask, token_type_ids, torch.tensor(label, dtype=torch.long)

In [None]:
# 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self, bert):
        super(BERTClassifier, self).__init__()
        self.bert = bert  # Pre-trained BERT 모델
        # 개선된 분류층 추가
        self.classifier = nn.Sequential(
            nn.Linear(bert.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)  # 이진 분류를 위해 클래스 수는 2
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        # BERT 모델의 출력을 받아 분류층으로 전달
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs[1]  # [CLS] 토큰의 임베딩
        return self.classifier(pooled_output)

In [8]:
# Hyperparameters
max_len = 128
batch_size = 64
learning_rate = 5e-5
num_epochs = 3

In [9]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("monologg/kobert")
bert_model = BertModel.from_pretrained("monologg/kobert")
model = BERTClassifier(bert_model).to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [None]:
# Dataset and DataLoader
train_texts = train_df['text'].tolist()  # 텍스트 열에서 리스트로 변환
train_labels = train_df['label'].tolist()  # 레이블 열에서 리스트로 변환

test_texts = test_df['text'].tolist()
test_labels = test_df['label'].tolist()

# 레이블 인코딩 ('ham'과 'spam'을 각각 0과 1로 변환)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)  # 'ham' -> 0, 'spam' -> 1

# Dataset 생성
train_dataset = KoBERTDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=max_len
)

test_dataset = KoBERTDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
    max_len=max_len
)

# DataLoader 생성
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [None]:
# 손실 함수 및 옵티마이저 설정 (클래스 가중치 포함)
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight='balanced', classes=[0, 1], y=y_train)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [None]:
# # Optimizer and Scheduler
# optimizer = AdamW(model.parameters(), lr=learning_rate)
# scheduler = get_cosine_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=len(train_loader) * 2,
#     num_training_steps=len(train_loader) * num_epochs
# )



In [None]:
# Loss function
# loss_fn = nn.CrossEntropyLoss()

In [13]:
# Accuracy calculation
def calc_accuracy(preds, labels):
    _, pred_classes = preds.max(dim=1)
    return (pred_classes == labels).sum().item()

In [14]:
# Training and Validation loop
for epoch in range(num_epochs):
    model.train()
    total_acc, total_loss = 0, 0

    # Training phase
    for input_ids, attention_mask, token_type_ids, labels in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        total_acc += calc_accuracy(outputs, labels)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss / len(train_loader):.4f} | Train Accuracy: {total_acc / len(train_dataset):.4f}")

    # Validation phase
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, labels in tqdm(test_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            total_acc += calc_accuracy(outputs, labels)

    print(f"Epoch {epoch+1} | Val Loss: {total_loss / len(test_loader):.4f} | Val Accuracy: {total_acc / len(test_dataset):.4f}")

  0%|          | 0/7414 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 0.0881 | Train Accuracy: 0.9615


  0%|          | 0/1854 [00:00<?, ?it/s]

Epoch 1 | Val Loss: 0.0543 | Val Accuracy: 0.9746


  0%|          | 0/7414 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 0.1291 | Train Accuracy: 0.9439


  0%|          | 0/1854 [00:00<?, ?it/s]

Epoch 2 | Val Loss: 1.0464 | Val Accuracy: 0.2019


  0%|          | 0/7414 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 0.1790 | Train Accuracy: 0.9209


  0%|          | 0/1854 [00:00<?, ?it/s]

Epoch 3 | Val Loss: 1.3240 | Val Accuracy: 0.2019


In [15]:
# 가중치만 저장
torch.save(model.state_dict(), "/home/maroco/dataset/model/sentiment_model.pt")

In [16]:
for batch in test_loader:
    print(f"Batch type: {type(batch)}")
    print(f"Batch content: {batch}")
    break  # 한 배치만 확인

Batch type: <class 'list'>
Batch content: [tensor([[   2,    0,    0,  ...,    1,    1,    1],
        [   2,    0,    0,  ...,    1,    1,    1],
        [   2,    0,    0,  ...,    1,    1,    1],
        ...,
        [   2,    0,    0,  ...,    1,    1,    1],
        [   2,    0, 5782,  ...,    1,    1,    1],
        [   2,    0,    0,  ...,  365,  365,    3]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), tensor([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 0, 1, 

In [17]:
all_labels = []
for batch in test_loader:
    labels = batch[3]  # 네 번째 텐서에서 레이블 추출
    all_labels.extend(labels.numpy())

print(f"Unique labels in test dataset: {set(all_labels)}")

Unique labels in test dataset: {0, 1}


In [18]:
# 모델 평가
evaluate_model(model, tokenizer, test_loader, device)

True Labels (example): [0, 1, 1, 1, 1, 1, 0, 1, 0, 1]
Predicted Labels (example): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Unique classes in true_labels: {0, 1}
Unique classes in pred_labels: {0}
Classification Report:
              precision    recall  f1-score   support

         ham       0.20      1.00      0.34     23943
        spam       0.00      0.00      0.00     94671

    accuracy                           0.20    118614
   macro avg       0.10      0.50      0.17    118614
weighted avg       0.04      0.20      0.07    118614

Precision: 0.0000
Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


F1-Score: 0.0000


In [19]:
# GPU 사용 중지 함수
def cleanup_gpu():
    """
    Releases GPU memory and clears PyTorch GPU cache.
    """
    # Synchronize to ensure all GPU operations are complete
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # Waits for all kernels to finish
        torch.cuda.empty_cache()  # Releases all unused cached memory
        print("GPU memory has been released.")
    else:
        print("No GPU detected. No action taken.")

In [20]:
cleanup_gpu()

GPU memory has been released.
