In [20]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertTokenizer, BertModel, get_scheduler
from transformers.optimization import get_cosine_schedule_with_warmup
from tqdm.notebook import tqdm
from kobert_tokenizer import KoBERTTokenizer
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

In [42]:
##GPU 사용 시
device = torch.device("cuda:0")

In [22]:
def evaluate_model(model, tokenizer, dataloader, device):
    """
    Evaluate the model using Precision, Recall, and F1-Score.

    Args:
        model: Trained PyTorch model.
        tokenizer: Tokenizer for preprocessing.
        dataloader: DataLoader for evaluation data.
        device: Device to run the model (GPU or CPU).

    Returns:
        None. Prints evaluation metrics.
    """
    model.eval()
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            # Assuming batch[3] contains labels
            texts, labels = batch[0], batch[3]

            # Ensure texts is a List[str]
            if isinstance(texts, torch.Tensor):
                texts = [str(t) for t in texts]

            # Tokenize input text
            inputs = tokenizer(
                texts,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=128
            )

            # Move inputs and labels to the specified device
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)

            # Get predictions
            outputs = model(**inputs)
            preds = torch.argmax(outputs, dim=1)

            # Collect true and predicted labels
            true_labels.extend(labels.cpu().numpy())
            pred_labels.extend(preds.cpu().numpy())

        # Convert multi-dimensional true_labels to single dimension if needed
        if len(true_labels) > 0 and isinstance(true_labels[0], (list, np.ndarray)):
            true_labels = [label.argmax() for label in true_labels]

    # Debugging outputs
    print(f"True Labels (example): {true_labels[:10]}")
    print(f"Predicted Labels (example): {pred_labels[:10]}")
    print(f"Unique classes in true_labels: {set(true_labels)}")
    print(f"Unique classes in pred_labels: {set(pred_labels)}")

    unique_classes = set(true_labels + pred_labels)

    if len(unique_classes) == 1:
        print(f"Warning: Only one class ({list(unique_classes)[0]}) detected.")
        print("Cannot generate complete classification report.")
    else:
        print("Classification Report:")
        print(classification_report(true_labels, pred_labels, target_names=['ham', 'spam'], labels=[0, 1]))
        print(f"Precision: {precision_score(true_labels, pred_labels, average='binary'):.4f}")
        print(f"Recall: {recall_score(true_labels, pred_labels, average='binary'):.4f}")
        print(f"F1-Score: {f1_score(true_labels, pred_labels, average='binary'):.4f}")

In [23]:
# 파일 로드
train_df = pd.read_csv('/home/maroco/dataset/model/train_set.csv')
test_df = pd.read_csv('/home/maroco/dataset/model/test_set.csv')

In [24]:
# 레이블 변환 (예: 'spam' → 1, 'ham' → 0)
label_map = {'spam': 1, 'ham': 0}
train_df['label'] = train_df['label'].map(label_map)
test_df['label'] = test_df['label'].map(label_map)

In [25]:
# 데이터셋 정의
class KoBERTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenization
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_len
        )

        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        token_type_ids = inputs['token_type_ids'].squeeze(0)

        return input_ids, attention_mask, token_type_ids, torch.tensor(label, dtype=torch.long)

In [26]:
# 모델 정의
class BERTClassifier(nn.Module):
    def __init__(self, bert):
        super(BERTClassifier, self).__init__()
        self.bert = bert  # Pre-trained BERT 모델
        self.classifier = nn.Sequential(
            nn.Linear(bert.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)  # 이진 분류
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs.pooler_output  # [CLS] 토큰의 임베딩
        return self.classifier(pooled_output)

In [27]:
# Hyperparameters
max_len = 128
batch_size = 64
learning_rate = 1e-5
num_epochs = 3

In [28]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("monologg/kobert")
bert_model = BertModel.from_pretrained("monologg/kobert")
model = BERTClassifier(bert_model).to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [29]:
# Dataset and DataLoader
train_texts = train_df['text'].tolist()  # 텍스트 열에서 리스트로 변환
train_labels = train_df['label'].tolist()  # 레이블 열에서 리스트로 변환

test_texts = test_df['text'].tolist()
test_labels = test_df['label'].tolist()

# 레이블 인코딩 ('ham'과 'spam'을 각각 0과 1로 변환)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)  # 'ham' -> 0, 'spam' -> 1

# Dataset 생성
train_dataset = KoBERTDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=max_len
)

test_dataset = KoBERTDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
    max_len=max_len
)

# DataLoader 생성
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [30]:
# 손실 함수 및 옵티마이저 설정 (클래스 가중치 포함)
from sklearn.utils.class_weight import compute_class_weight

# 클래스 가중치 계산
classes = np.unique(y_train)  # [0, 1]을 numpy 배열로 변환
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# 손실 함수 정의
# loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()

# 손실 함수로 Focal Loss 사용
loss_fn = FocalLoss(alpha=2.0, gamma=2.0)

optimizer = AdamW(model.parameters(), lr=1e-5)



In [31]:
# scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * num_epochs  # 전체 학습 스텝 수
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [32]:
# # Optimizer and Scheduler
# optimizer = AdamW(model.parameters(), lr=learning_rate)
# scheduler = get_cosine_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=len(train_loader) * 2,
#     num_training_steps=len(train_loader) * num_epochs
# )

In [33]:
# Loss function
# loss_fn = nn.CrossEntropyLoss()

In [34]:
# Accuracy calculation
def calc_accuracy(preds, labels):
    _, pred_classes = preds.max(dim=1)
    return (pred_classes == labels).sum().item()

In [35]:
# Training and Validation loop
for epoch in range(num_epochs):
    model.train()
    total_acc, total_loss = 0, 0

    # Training phase
    for batch in train_loader:
        optimizer.zero_grad()

        # Extract inputs and labels
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        token_type_ids = batch[2].to(device)
        labels = batch[3].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        total_acc += calc_accuracy(outputs, labels)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss / len(train_loader):.4f} | Train Accuracy: {total_acc / len(train_dataset):.4f}")

    # Validation phase
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for batch in test_loader:
            # Extract inputs and labels
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            labels = batch[3].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            total_acc += calc_accuracy(outputs, labels)

        print(f"Epoch {epoch+1} | Val Loss: {total_loss / len(test_loader):.4f} | Val Accuracy: {total_acc / len(test_dataset):.4f}")

# Debugging logits
for batch in test_loader:
    texts, labels = batch[0], batch[3]

    # Ensure texts is a list of strings
    if isinstance(texts, torch.Tensor):
        texts = [str(t) for t in texts]

    # Tokenizer conversion
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    )

    # Move inputs to GPU
    inputs = {key: value.to(device) for key, value in inputs.items()}
    labels = labels.to(device)

    # Model prediction
    outputs = model(**inputs)
    print("Logits:", outputs)

Epoch 1 | Train Loss: 0.0748 | Train Accuracy: 0.9348
Epoch 1 | Val Loss: 0.0578 | Val Accuracy: 0.9478
Epoch 2 | Train Loss: 0.0586 | Train Accuracy: 0.9460
Epoch 2 | Val Loss: 0.0557 | Val Accuracy: 0.9486
Epoch 3 | Train Loss: 0.0557 | Train Accuracy: 0.9477
Epoch 3 | Val Loss: 0.0549 | Val Accuracy: 0.9490
Logits: tensor([[-0.9629,  0.7249],
        [ 0.1181, -0.1825],
        [-1.2809,  0.9747],
        [ 0.1652, -0.2290],
        [-0.6960,  0.5169],
        [ 0.0541, -0.1188],
        [-0.0022, -0.0618],
        [-0.8083,  0.6054],
        [-1.4204,  1.0698],
        [-1.5072,  1.1291],
        [-1.1616,  0.8899],
        [-0.6003,  0.4433],
        [ 0.0936, -0.1564],
        [ 0.1652, -0.2290],
        [-1.2213,  0.9301],
        [ 0.2107, -0.2727],
        [ 0.1773, -0.2406],
        [-1.4810,  1.1111],
        [-1.3884,  1.0452],
        [-1.4007,  1.0556],
        [ 0.0541, -0.1188],
        [ 0.0747, -0.1370],
        [ 0.3664, -0.4301],
        [ 0.3774, -0.4397],
        

In [36]:
# 가중치만 저장
torch.save(model.state_dict(), "/home/maroco/dataset/model/sentiment_model.pt")

In [37]:
for batch in test_loader:
    print(f"Batch type: {type(batch)}")
    print(f"Batch content: {batch}")
    break  # 한 배치만 확인

Batch type: <class 'list'>
Batch content: [tensor([[   2,    0,    0,  ...,    0,    0,    3],
        [   2,    0,    0,  ...,    1,    1,    1],
        [   2,    0,    0,  ...,    0,    0,    3],
        ...,
        [   2,    0,    0,  ...,    1,    1,    1],
        [   2,    0,    0,  ...,    1,    1,    1],
        [   2,    0, 7028,  ...,    1,    1,    1]]), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), tensor([1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
        1, 0, 1, 1, 1, 0, 1, 1, 

In [38]:
all_labels = []
for batch in test_loader:
    labels = batch[3]  # 네 번째 텐서에서 레이블 추출
    all_labels.extend(labels.numpy())

print(f"Unique labels in test dataset: {set(all_labels)}")

Unique labels in test dataset: {0, 1}


In [39]:
# 모델 평가
evaluate_model(model, tokenizer, test_loader, device)

True Labels (example): [1, 0, 1, 0, 1, 0, 1, 1, 1, 1]
Predicted Labels (example): [1, 0, 1, 0, 1, 0, 0, 1, 1, 1]
Unique classes in true_labels: {0, 1}
Unique classes in pred_labels: {0, 1}
Classification Report:
              precision    recall  f1-score   support

         ham       0.76      0.98      0.86     23943
        spam       0.98      0.74      0.84     28521

    accuracy                           0.85     52464
   macro avg       0.87      0.86      0.85     52464
weighted avg       0.88      0.85      0.85     52464

Precision: 0.9751
Recall: 0.7424
F1-Score: 0.8430


In [40]:
# GPU 사용 중지 함수
def cleanup_gpu():
    """
    Releases GPU memory and clears PyTorch GPU cache.
    """
    # Synchronize to ensure all GPU operations are complete
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # Waits for all kernels to finish
        torch.cuda.empty_cache()  # Releases all unused cached memory
        print("GPU memory has been released.")
    else:
        print("No GPU detected. No action taken.")

In [41]:
cleanup_gpu()

GPU memory has been released.
