<a href="https://colab.research.google.com/github/Kwonjihan/ML-teamproject/blob/develop/SeongYeomByeon/Bert_pretraining_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

In [2]:
import math
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import random
import torch.optim as optim
from torch.utils.data import RandomSampler, DataLoader, random_split
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, DataCollatorForLanguageModeling
from typing import Optional, Tuple
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from transformers.modeling_outputs import ModelOutput, MaskedLMOutput

In [3]:
# Mount Google Drive so that files under /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Activation functions used by the model
def gelu(x):
    """Apply the exact (erf-based) GELU activation element-wise to tensor ``x``."""
    one_plus_erf = 1.0 + torch.erf(x / math.sqrt(2.0))
    return x * 0.5 * one_plus_erf

# Lookup table from an activation name (the value stored in Config.hidden_act) to the callable implementing it.
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": torch.nn.functional.silu}

In [5]:
class Config:
    """Hyperparameters for the BERT model defined in this notebook.

    All values are plain class attributes; the model classes read them via
    attribute access (``config.hidden_size`` etc.).
    """

    vocab_size = 30522  # rows of the word-embedding table and width of the MLM output layer
    hidden_size = 512  # width of the embeddings and of every encoder layer
    num_hidden_layers = 8  # number of BertLayer blocks stacked in BertEncoder
    num_attention_heads = 8  # attention heads per layer; BertSelfAttention requires hidden_size % heads == 0
    intermediate_size = 512  # output width of the BertIntermediate projection
    hidden_act = "gelu"  # key into ACT2FN
    hidden_dropout_prob = 0.1  # dropout applied to embeddings and sub-layer outputs
    attention_probs_dropout_prob = 0.1  # dropout applied to the attention probabilities
    max_position_embeddings = 512  # number of rows in the position-embedding table
    type_vocab_size = 2  # number of rows in the token-type embedding table
    initializer_range = 0.02  # NOTE(review): not read by any code visible in this notebook
    layer_norm_eps = 1e-12  # eps passed to every nn.LayerNorm
    pad_token_id = 0  # padding_idx of the word-embedding table
    gradient_checkpointing = False  # NOTE(review): not read by any code visible in this notebook
    position_embedding_type = "absolute"  # "absolute", "relative_key" or "relative_key_query"
    use_cache = True  # NOTE(review): not read by any code visible in this notebook
    is_decoder = False  # when True, BertSelfAttention also returns its key/value tensors

# Builds the BERT input embeddings
class BertEmbeddings(nn.Module):
    """Combine word, token-type and (optionally) absolute position embeddings.

    The summed embeddings are passed through LayerNorm and dropout.
    """

    def __init__(self, config: Config):
        super().__init__()
        width = config.hidden_size
        # Embedding tables: words, positions, token types
        self.word_embeddings = nn.Embedding(config.vocab_size, width, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, width)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, width)
        # Layer normalisation and dropout
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # Non-persistent buffers holding default position ids and all-zero token-type ids
        default_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", default_positions, persistent=False)
        default_token_types = torch.zeros(self.position_ids.size(), dtype=torch.long)
        self.register_buffer("token_type_ids", default_token_types, persistent=False)

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0):
        """Return the embedded inputs.

        Either ``input_ids`` or ``inputs_embeds`` supplies the tokens; missing
        ``position_ids`` / ``token_type_ids`` fall back to the registered buffers.
        """
        input_shape = input_ids.size() if input_ids is not None else inputs_embeds.size()[:-1]
        seq_length = input_shape[1]

        if position_ids is None:
            first = past_key_values_length
            position_ids = self.position_ids[:, first: first + seq_length]

        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                # Slice the zero buffer to the sequence length and broadcast over the batch
                token_type_ids = self.token_type_ids[:, :seq_length].expand(input_shape[0], seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # Sum the individual embeddings
        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
        if self.position_embedding_type == "absolute":
            embeddings += self.position_embeddings(position_ids)
        return self.dropout(self.LayerNorm(embeddings))

# Multi-head attention (self-attention, with optional cross-attention and key/value caching)
class BertSelfAttention(nn.Module):
    """Scaled dot-product multi-head attention.

    Supports absolute positions as well as the ``relative_key`` and
    ``relative_key_query`` relative-position variants.
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # hidden_size must be a multiple of num_attention_heads (unless the config has embedding_size)
        not_divisible = config.hidden_size % config.num_attention_heads != 0
        if not_divisible and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        # Head count, per-head width and total width
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Query / key / value projections
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout on the attention probabilities
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # Explicit argument wins over the config value
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        # Relative variants need an embedding per possible signed distance
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(
                2 * config.max_position_embeddings - 1, self.attention_head_size)

        # Decoder mode additionally returns the key/value tensors
        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into (heads, head_size) and move heads to dim 1."""
        leading_dims = x.size()[:-1]
        split = x.view(leading_dims + (self.num_attention_heads, self.attention_head_size))
        # Resulting layout: [batch_size, num_heads, seq_len, head_size]
        return split.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Compute attention and return ``(context[, probs][, (key, value)])``.

        ``probs`` is included when ``output_attentions`` is truthy and the
        ``(key, value)`` pair is appended when the module is in decoder mode.
        """
        # Queries always come from the current hidden states
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        is_cross_attention = encoder_hidden_states is not None
        use_cache = past_key_value is not None

        if is_cross_attention:
            if use_cache:
                # Cross-attention: reuse the cached keys and values
                key_layer, value_layer = past_key_value[0], past_key_value[1]
            else:
                # Cross-attention: keys and values come from the encoder states
                key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
                value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        else:
            # Self-attention: keys and values come from the current hidden states
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            if use_cache:
                # Prepend the cached keys and values along the sequence dimension
                key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
                value_layer = torch.cat([past_key_value[1], value_layer], dim=2)

        # Raw scores: dot product of queries and keys
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            target_device = hidden_states.device
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=target_device).view(-1, 1)
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=target_device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=target_device).view(1, -1)
            distance = position_ids_l - position_ids_r

            # Look up the embedding of every (shifted, non-negative) distance
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            # The query-side term is shared by both relative variants
            attention_scores = attention_scores + torch.einsum(
                "bhld,lrd->bhlr", query_layer, positional_embedding)
            if self.position_embedding_type == "relative_key_query":
                # This variant adds a key-side term as well
                attention_scores = attention_scores + torch.einsum(
                    "bhrd,lrd->bhlr", key_layer, positional_embedding)

        # Scale, then add the (additive) attention mask if one was given
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalise to probabilities and apply dropout
        attention_probs = self.dropout(nn.functional.softmax(attention_scores, dim=-1))

        # Optional head mask
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of the values, then merge the heads back into one dimension
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        merged_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(merged_shape)

        outputs = (context_layer,)
        if output_attentions:
            outputs = outputs + (attention_probs,)
        if self.is_decoder:
            # Decoder mode: expose the keys and values so the caller can cache them
            outputs = outputs + ((key_layer, value_layer),)
        return outputs

# Post-processing of the self-attention output
class BertSelfOutput(nn.Module):
    """Linear projection, dropout, then LayerNorm over the residual sum."""

    def __init__(self, config: Config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, add ``input_tensor`` as the residual and normalise."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

# Attention sub-layer: self-attention followed by its output projection
class BertAttention(nn.Module):
    """Wrap ``BertSelfAttention`` together with ``BertSelfOutput``."""

    def __init__(self, config: Config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_value=None, output_attentions=False):
        """Run attention on ``input_tensor`` and return ``(attention_output, *extras)``.

        ``extras`` are whatever additional items the self-attention module returned.
        """
        context, *extras = self.self(
            input_tensor,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # The un-attended input serves as the residual branch
        attention_output = self.output(context, input_tensor)
        return (attention_output, *extras)

# Intermediate (feed-forward) projection with activation
class BertIntermediate(nn.Module):
    """Linear projection to ``intermediate_size`` followed by the configured activation."""

    def __init__(self, config: Config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Resolve the activation by name
        self.intermediate_act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        """Return ``activation(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))

# Post-processing of the intermediate (feed-forward) output
class BertOutput(nn.Module):
    """Project from ``intermediate_size`` back to ``hidden_size``, apply dropout,
    then LayerNorm over the residual sum."""

    def __init__(self, config: Config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # The class previously carried a second, truncated ``forward`` definition ahead of
    # this one (it ended in ``self.Layer`` and returned nothing). It was dead code that
    # this definition silently overrode, so it has been removed.
    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``.

        Args:
            hidden_states: output of the intermediate projection.
            input_tensor: residual branch added before normalisation.
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

# A single BERT encoder layer
class BertLayer(nn.Module):
    """Attention sub-layer followed by the feed-forward sub-layer."""

    def __init__(self, config: Config):
        super().__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_value=None, output_attentions=False):
        """Return ``(layer_output, *extras)``; ``extras`` are forwarded from the attention sub-layer."""
        attention_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions=output_attentions,
        )
        attended = attention_results[0]
        # Feed-forward block; the attention output is its residual branch
        expanded = self.intermediate(attended)
        layer_output = self.output(expanded, attended)
        return (layer_output,) + attention_results[1:]

# Encoder: a stack of BERT layers
class BertEncoder(nn.Module):
    """Apply ``num_hidden_layers`` ``BertLayer`` modules in sequence."""

    def __init__(self, config: Config):
        super().__init__()
        layers = [BertLayer(config) for _ in range(config.num_hidden_layers)]
        self.layer = nn.ModuleList(layers)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None, use_cache=None, output_attentions=False, output_hidden_states=False, return_dict=True):
        """Return ``(last_hidden_state, all_hidden_states, all_attentions)``.

        The two collections are ``None`` unless the matching ``output_*`` flag is
        truthy. ``all_hidden_states`` holds the input of each layer (the output of
        the final layer is not appended). ``use_cache`` and ``return_dict`` are
        accepted but not used.
        """
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for index, block in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            block_outputs = block(
                hidden_states,
                attention_mask,
                None if head_mask is None else head_mask[index],
                encoder_hidden_states,
                encoder_attention_mask,
                None if past_key_values is None else past_key_values[index],
                output_attentions,
            )
            hidden_states = block_outputs[0]

            if output_attentions:
                all_attentions += (block_outputs[1],)

        return (hidden_states, all_hidden_states, all_attentions)

# Pools the sequence by transforming the first token's hidden state
class BertPooler(nn.Module):
    """Dense layer plus tanh applied to the hidden state at sequence index 0."""

    def __init__(self, config: Config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return ``tanh(dense(hidden_states[:, 0]))``."""
        return self.activation(self.dense(hidden_states[:, 0]))

# The complete BERT model: embeddings -> encoder -> pooler
class BertModel(nn.Module):
    """Bare BERT encoder returning the sequence output and the pooled output."""

    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        """Encode the inputs and return ``(sequence_output, pooled_output)``.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given, otherwise
        ``ValueError`` is raised. The ``head_mask`` argument is overwritten with a
        list of ``None`` (one entry per layer), so a caller-supplied value has no effect.
        """
        # Validate the inputs
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("input_ids 혹은 inputs_embeds 둘 중 하나의 형식으로만 입력해야 합니다.")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("input_ids 또는 inputs_embeds의 형식이어야 합니다.")

        if input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        # Defaults: attend to every position, all token types zero
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Turn the 0/1 mask into an additive mask: 0 where attended, -10000 where masked
        additive_mask = attention_mask[:, None, None, :]
        additive_mask = additive_mask.to(dtype=next(self.parameters()).dtype)
        additive_mask = (1.0 - additive_mask) * -10000.0

        head_mask = [None] * self.config.num_hidden_layers

        # Embeddings
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        # Encoder stack
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=additive_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        return sequence_output, self.pooler(sequence_output)

class BertPredictionHeadTransform(nn.Module):
    """Dense -> activation -> LayerNorm transform applied before the vocabulary decoder."""

    def __init__(self, config: Config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.transform_act_fn = ACT2FN[config.hidden_act]
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        """Return ``LayerNorm(activation(dense(hidden_states)))``."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)

class BertLMPredictionHead(nn.Module):
    """Map hidden states to vocabulary logits for masked-language modelling."""

    def __init__(self, config: Config):
        super().__init__()
        vocab = config.vocab_size
        self.transform = BertPredictionHeadTransform(config)
        # The decoder is created without a bias; a separately registered parameter is attached below
        self.decoder = nn.Linear(config.hidden_size, vocab, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab))

        # Share the same Parameter object between the head and its decoder
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return the vocabulary logits for ``hidden_states``."""
        return self.decoder(self.transform(hidden_states))


class BertOnlyMLMHead(nn.Module):
    """Thin wrapper exposing the LM prediction head under the ``predictions`` attribute."""

    def __init__(self, config: Config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        """Return the prediction scores for ``sequence_output``."""
        return self.predictions(sequence_output)

class BertForPreTraining(nn.Module):
    """BERT encoder with a masked-language-modelling head on top."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert = BertModel(config)
        self.cls = BertOnlyMLMHead(config)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        """Compute MLM logits and, when ``labels`` is given, the cross-entropy loss.

        Returns a ``MaskedLMOutput`` (``hidden_states`` and ``attentions`` are always
        ``None``) unless ``return_dict`` is falsy, in which case a tuple
        ``([loss,] logits, ...)`` is returned. ``return_dict=None`` is treated as ``True``.
        """
        if return_dict is None:
            return_dict = True

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        prediction_scores = self.cls(outputs[0])

        masked_lm_loss = None
        if labels is not None:
            # Flatten to (tokens, vocab) vs (tokens,) for the loss
            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
            masked_lm_loss = CrossEntropyLoss()(flat_scores, labels.view(-1))

        if return_dict:
            return MaskedLMOutput(
                loss=masked_lm_loss,
                logits=prediction_scores,
                hidden_states=None,
                attentions=None,
            )

        output = (prediction_scores,) + outputs[2:]
        if masked_lm_loss is None:
            return output
        return (masked_lm_loss,) + output

In [6]:
class CSVDataset(Dataset):
    """Dataset of fixed-size text chunks read from the ``text`` column of a CSV file.

    All rows are joined with single spaces, the resulting string is cut into
    consecutive chunks of ``block_size`` characters, and every chunk is tokenized
    and padded/truncated to ``block_size`` tokens.

    Args:
        file_path: path of the CSV file; it must contain a ``text`` column.
        tokenizer: callable tokenizer returning a mapping with ``input_ids``.
        block_size: chunk length in characters, also used as ``max_length`` (in
            tokens) for the tokenizer.

    NOTE(review): chunks are measured in characters while ``max_length`` is
    measured in tokens, so examples are presumably mostly padding - confirm
    that this is intended.
    """

    def __init__(self, file_path, tokenizer, block_size=512):
        # Load the raw rows
        df = pd.read_csv(file_path)
        # Drop missing cells and coerce to str so NaN / numeric rows cannot break the join
        text = " ".join(df["text"].dropna().astype(str).tolist())
        self.examples = []

        # Tokenize every chunk; each entry keeps a leading batch dimension of 1
        for chunk in tqdm(self._chunk_text(text, block_size), desc="Tokenizing text"):
            inputs = tokenizer(chunk, add_special_tokens=True, max_length=block_size, truncation=True, return_tensors="pt", padding="max_length")
            inputs['labels'] = inputs.input_ids.clone()
            self.examples.append(inputs)

    @staticmethod
    def _chunk_text(text, block_size):
        """Split ``text`` into consecutive chunks of exactly ``block_size`` characters.

        A trailing remainder shorter than ``block_size`` is dropped. The ``+ 1`` in
        the range bound keeps the last full block, which was previously lost.

        Raises:
            ValueError: if ``block_size`` is not positive.
        """
        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")
        last_start = len(text) - block_size
        return [text[start:start + block_size] for start in range(0, last_start + 1, block_size)]

    def __len__(self):
        """Number of tokenized chunks."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example ``i`` as a dict of tensors with the batch dimension removed."""
        return {key: val.squeeze(0) for key, val in self.examples[i].items()}

# Persist a dataset to disk
def save_dataset(dataset, file_path):
    """Serialise ``dataset`` to ``file_path`` with ``torch.save``."""
    torch.save(obj=dataset, f=file_path)

# Load a dataset previously written by save_dataset
def load_dataset(file_path):
    """Load and return the object stored at ``file_path`` by ``torch.save``.

    The file holds a fully pickled Python object (not just tensors), so
    ``weights_only=False`` is passed explicitly: PyTorch releases that default to
    ``weights_only=True`` refuse to unpickle arbitrary classes.

    SECURITY: unpickling can execute arbitrary code. Only load files that you
    created yourself or otherwise trust.
    """
    return torch.load(file_path, weights_only=False)

# Entry point: build the tokenizer and the train/validation/test dataloaders
def main(
    data_path="/content/drive/MyDrive/bookcorpus_reduced.csv",
    processed_data_path="/content/drive/MyDrive/processed_dataset.pt",
    batch_size=8,
    seed=42,
):
    """Prepare the data pipeline for MLM pre-training.

    The tokenized dataset is cached at ``processed_data_path``: it is built from
    ``data_path`` and saved on the first run, and loaded from the cache afterwards.

    Args:
        data_path: CSV file with a ``text`` column (only read when no cache exists).
        processed_data_path: location of the cached, tokenized dataset.
        batch_size: batch size of all three dataloaders.
        seed: seed of the generator used for the train/validation/test split, so
            the split is identical on every run.

    Returns:
        ``(train_dataloader, validation_dataloader, test_dataloader, tokenizer)``.

    NOTE(review): the collator draws a fresh random mask each time a batch is
    built, so validation/test losses are not exactly repeatable between passes.
    """
    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Build and cache the dataset, or load the cached copy
    if not os.path.exists(processed_data_path):
        full_dataset = CSVDataset(data_path, tokenizer)
        save_dataset(full_dataset, processed_data_path)
        print("전처리된 데이터셋 저장 완료.")
    else:
        full_dataset = load_dataset(processed_data_path)
        print("전처리된 데이터셋 불러오기 완료.")

    # Split 80% / 10% / 10%; the test split absorbs the rounding remainder
    total = len(full_dataset)
    train_size = int(0.8 * total)
    val_size = int(0.1 * total)
    test_size = total - train_size - val_size
    # A dedicated, seeded generator makes the split reproducible across runs
    split_generator = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset, test_dataset = random_split(
        full_dataset, [train_size, val_size, test_size], generator=split_generator
    )

    # Dataloaders; the collator applies MLM masking to 15% of the tokens
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    # Only the training data is reshuffled every epoch; evaluation order stays fixed
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=data_collator)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=data_collator)

    return train_dataloader, validation_dataloader, test_dataloader, tokenizer

# Build the dataloaders and the tokenizer once; the training cell below uses these module-level names.
train_dataloader, validation_dataloader, test_dataloader, tokenizer = main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

전처리된 데이터셋 불러오기 완료.


In [10]:
def calculate_perplexity(loss):
    """Return the perplexity ``exp(loss)`` for a mean cross-entropy ``loss``.

    Args:
        loss: average loss as a Python number.

    Returns:
        ``math.exp(loss)``, or ``float("inf")`` when the result is too large to be
        represented as a float (instead of raising ``OverflowError``).
    """
    try:
        return math.exp(loss)
    except OverflowError:
        # A diverged loss should be reported, not abort the training loop
        return float("inf")

def train(model, train_dataloader, validation_dataloader, tokenizer, device, epochs=3):
    """Train ``model`` with AdamW (lr 5e-5) and validate after every epoch.

    Args:
        model: module whose forward pass returns an object with a ``loss`` attribute.
        train_dataloader: iterable of batches (dicts of tensors) used for training.
        validation_dataloader: iterable of batches used for validation.
        tokenizer: unused; kept so existing callers keep working.
        device: device the model and every batch are moved to.
        epochs: number of passes over ``train_dataloader``.

    Prints the running loss/perplexity every 10 batches and the epoch-level
    training and validation loss/perplexity. An empty dataloader yields ``nan``
    metrics instead of raising ``ZeroDivisionError``.
    """
    model.train()  # training mode
    model.to(device)  # move the model to the target device
    optimizer = AdamW(model.parameters(), lr=5e-5)

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        running_loss = 0.0
        train_batches = 0

        # One optimisation step per mini-batch
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
            inputs = {key: val.to(device) for key, val in batch.items()}

            outputs = model(**inputs)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            train_batches += 1

            # Report the running average every 10th batch
            if step % 10 == 0 and step != 0:
                current_loss = running_loss / (step + 1)
                current_perplexity = calculate_perplexity(current_loss)
                print(f"Batch {step}, Loss: {current_loss:.4f}, Perplexity: {current_perplexity:.4f}")

        # Epoch-level training metrics (nan when no batch was seen)
        epoch_loss = running_loss / train_batches if train_batches else float("nan")
        epoch_perplexity = calculate_perplexity(epoch_loss)
        print(f"Epoch {epoch + 1} Loss: {epoch_loss:.4f}, Perplexity: {epoch_perplexity:.4f}")

        # Validation pass: evaluation mode, no gradients
        model.eval()
        validation_loss = 0.0
        val_batches = 0

        with torch.no_grad():
            for batch in tqdm(validation_dataloader, desc="Validating"):
                inputs = {key: val.to(device) for key, val in batch.items()}

                outputs = model(**inputs)
                validation_loss += outputs.loss.item()
                val_batches += 1

        # Validation metrics (nan when the validation split is empty)
        epoch_val_loss = validation_loss / val_batches if val_batches else float("nan")
        epoch_val_perplexity = calculate_perplexity(epoch_val_loss)
        print(f"Validation Loss: {epoch_val_loss:.4f}, Perplexity: {epoch_val_perplexity:.4f}")

        model.train()  # back to training mode for the next epoch

def test(model, test_dataloader, device):
    """Evaluate `model` on `test_dataloader` and print mean loss and perplexity.

    Args:
        model: Module that accepts a batch as keyword arguments and returns an
            object exposing a `.loss` tensor.
        test_dataloader: Sized iterable of batches; each batch is a mapping of
            name -> tensor (every value is moved with `.to(device)`).
        device: Device the model and every batch are moved to.

    Raises:
        ValueError: If `test_dataloader` yields no batches (previously this
            surfaced as a ZeroDivisionError when averaging).
    """
    num_batches = len(test_dataloader)
    if num_batches == 0:
        raise ValueError("test_dataloader must contain at least one batch")

    # Batches are moved to `device` below, so the model has to live there too.
    # Without this the function only worked when train() had already moved it.
    model.to(device)
    model.eval()  # evaluation mode; the model is left in this mode on return
    test_loss = 0.0

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Testing"):
            inputs = {key: val.to(device) for key, val in batch.items()}

            outputs = model(**inputs)
            loss = outputs.loss

            test_loss += loss.item()

    # Mean per-batch loss over the test set and the perplexity derived from it.
    test_loss = test_loss / num_batches
    test_perplexity = calculate_perplexity(test_loss)
    print(f"Test Loss: {test_loss:.4f}, Perplexity: {test_perplexity:.4f}")

# Dataset and data loader setup
# (This part assumes the data loaders provided by the earlier code are used:
#  train_dataloader, validation_dataloader, test_dataloader and tokenizer are
#  not defined in this cell.)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build a fresh model from Config (Config and BertForPreTraining are not defined in this cell).
model = BertForPreTraining(Config)
# Train for 3 epochs with per-epoch validation, then report test-set metrics.
train(model, train_dataloader, validation_dataloader, tokenizer, device, epochs=3)
test(model, test_dataloader, device)

Epoch 1/3


Training:   4%|▎         | 11/294 [00:03<01:22,  3.43it/s]

Batch 10, Loss: 9.9531, Perplexity: 21018.1230


Training:   7%|▋         | 21/294 [00:06<01:22,  3.33it/s]

Batch 20, Loss: 9.7015, Perplexity: 16341.3863


Training:  11%|█         | 31/294 [00:09<01:17,  3.38it/s]

Batch 30, Loss: 9.5182, Perplexity: 13605.4283


Training:  14%|█▍        | 41/294 [00:12<01:13,  3.46it/s]

Batch 40, Loss: 9.3463, Perplexity: 11456.4402


Training:  17%|█▋        | 51/294 [00:15<01:12,  3.37it/s]

Batch 50, Loss: 9.1962, Perplexity: 9859.3216


Training:  21%|██        | 61/294 [00:18<01:10,  3.32it/s]

Batch 60, Loss: 9.0393, Perplexity: 8427.5275


Training:  24%|██▍       | 71/294 [00:21<01:08,  3.27it/s]

Batch 70, Loss: 8.9016, Perplexity: 7343.4447


Training:  28%|██▊       | 81/294 [00:24<01:03,  3.36it/s]

Batch 80, Loss: 8.7733, Perplexity: 6459.3717


Training:  31%|███       | 91/294 [00:27<01:00,  3.37it/s]

Batch 90, Loss: 8.6684, Perplexity: 5816.0925


Training:  34%|███▍      | 101/294 [00:30<00:57,  3.34it/s]

Batch 100, Loss: 8.5709, Perplexity: 5275.8567


Training:  38%|███▊      | 111/294 [00:33<00:56,  3.25it/s]

Batch 110, Loss: 8.4778, Perplexity: 4806.6248


Training:  41%|████      | 121/294 [00:36<00:52,  3.29it/s]

Batch 120, Loss: 8.3950, Perplexity: 4424.7010


Training:  45%|████▍     | 131/294 [00:39<00:49,  3.27it/s]

Batch 130, Loss: 8.3107, Perplexity: 4067.2189


Training:  48%|████▊     | 141/294 [00:42<00:46,  3.31it/s]

Batch 140, Loss: 8.2323, Perplexity: 3760.4705


Training:  51%|█████▏    | 151/294 [00:45<00:43,  3.28it/s]

Batch 150, Loss: 8.1721, Perplexity: 3540.6773


Training:  55%|█████▍    | 161/294 [00:48<00:41,  3.21it/s]

Batch 160, Loss: 8.0968, Perplexity: 3283.9575


Training:  58%|█████▊    | 171/294 [00:51<00:37,  3.32it/s]

Batch 170, Loss: 8.0481, Perplexity: 3127.9620


Training:  62%|██████▏   | 181/294 [00:54<00:33,  3.33it/s]

Batch 180, Loss: 7.9880, Perplexity: 2945.4126


Training:  65%|██████▍   | 191/294 [00:57<00:30,  3.35it/s]

Batch 190, Loss: 7.9457, Perplexity: 2823.4168


Training:  68%|██████▊   | 201/294 [01:00<00:28,  3.27it/s]

Batch 200, Loss: 7.9062, Perplexity: 2713.9263


Training:  72%|███████▏  | 211/294 [01:03<00:25,  3.30it/s]

Batch 210, Loss: 7.8632, Perplexity: 2599.7707


Training:  75%|███████▌  | 221/294 [01:06<00:21,  3.35it/s]

Batch 220, Loss: 7.8270, Perplexity: 2507.2837


Training:  79%|███████▊  | 231/294 [01:09<00:18,  3.35it/s]

Batch 230, Loss: 7.7948, Perplexity: 2427.8535


Training:  82%|████████▏ | 241/294 [01:12<00:15,  3.37it/s]

Batch 240, Loss: 7.7582, Perplexity: 2340.7134


Training:  85%|████████▌ | 251/294 [01:15<00:13,  3.29it/s]

Batch 250, Loss: 7.7279, Perplexity: 2270.9160


Training:  89%|████████▉ | 261/294 [01:18<00:09,  3.37it/s]

Batch 260, Loss: 7.6962, Perplexity: 2199.9525


Training:  92%|█████████▏| 271/294 [01:21<00:06,  3.35it/s]

Batch 270, Loss: 7.6675, Perplexity: 2137.7989


Training:  96%|█████████▌| 281/294 [01:24<00:03,  3.38it/s]

Batch 280, Loss: 7.6403, Perplexity: 2080.3319


Training:  99%|█████████▉| 291/294 [01:27<00:00,  3.32it/s]

Batch 290, Loss: 7.6219, Perplexity: 2042.3705


Training: 100%|██████████| 294/294 [01:28<00:00,  3.33it/s]


Epoch 1 Loss: 7.6145, Perplexity: 2027.3952


Validating: 100%|██████████| 37/37 [00:04<00:00,  8.90it/s]


Validation Loss: 6.9403, Perplexity: 1033.0464
Epoch 2/3


Training:   4%|▎         | 11/294 [00:03<01:24,  3.35it/s]

Batch 10, Loss: 6.9680, Perplexity: 1062.0856


Training:   7%|▋         | 21/294 [00:06<01:20,  3.38it/s]

Batch 20, Loss: 6.8814, Perplexity: 974.0323


Training:  11%|█         | 31/294 [00:09<01:19,  3.31it/s]

Batch 30, Loss: 6.8795, Perplexity: 972.0942


Training:  14%|█▍        | 41/294 [00:12<01:17,  3.27it/s]

Batch 40, Loss: 6.8846, Perplexity: 977.1231


Training:  17%|█▋        | 51/294 [00:15<01:12,  3.34it/s]

Batch 50, Loss: 6.8889, Perplexity: 981.3024


Training:  21%|██        | 61/294 [00:18<01:09,  3.36it/s]

Batch 60, Loss: 6.8957, Perplexity: 988.0271


Training:  24%|██▍       | 71/294 [00:21<01:06,  3.35it/s]

Batch 70, Loss: 6.9133, Perplexity: 1005.5798


Training:  28%|██▊       | 81/294 [00:24<01:04,  3.29it/s]

Batch 80, Loss: 6.9033, Perplexity: 995.5911


Training:  31%|███       | 91/294 [00:27<01:00,  3.34it/s]

Batch 90, Loss: 6.8978, Perplexity: 990.1131


Training:  34%|███▍      | 101/294 [00:30<00:57,  3.35it/s]

Batch 100, Loss: 6.9113, Perplexity: 1003.5563


Training:  38%|███▊      | 111/294 [00:33<00:54,  3.34it/s]

Batch 110, Loss: 6.9079, Perplexity: 1000.1821


Training:  41%|████      | 121/294 [00:36<00:52,  3.32it/s]

Batch 120, Loss: 6.9003, Perplexity: 992.5816


Training:  45%|████▍     | 131/294 [00:39<00:50,  3.25it/s]

Batch 130, Loss: 6.9029, Perplexity: 995.1646


Training:  48%|████▊     | 141/294 [00:42<00:45,  3.34it/s]

Batch 140, Loss: 6.9068, Perplexity: 999.0618


Training:  51%|█████▏    | 151/294 [00:45<00:42,  3.35it/s]

Batch 150, Loss: 6.9079, Perplexity: 1000.1475


Training:  55%|█████▍    | 161/294 [00:48<00:40,  3.32it/s]

Batch 160, Loss: 6.9072, Perplexity: 999.4559


Training:  58%|█████▊    | 171/294 [00:51<00:37,  3.28it/s]

Batch 170, Loss: 6.9059, Perplexity: 998.1313


Training:  62%|██████▏   | 181/294 [00:54<00:34,  3.29it/s]

Batch 180, Loss: 6.8988, Perplexity: 991.0684


Training:  65%|██████▍   | 191/294 [00:57<00:30,  3.35it/s]

Batch 190, Loss: 6.8992, Perplexity: 991.5124


Training:  68%|██████▊   | 201/294 [01:00<00:27,  3.36it/s]

Batch 200, Loss: 6.9019, Perplexity: 994.2046


Training:  72%|███████▏  | 211/294 [01:03<00:24,  3.34it/s]

Batch 210, Loss: 6.8976, Perplexity: 989.8664


Training:  75%|███████▌  | 221/294 [01:06<00:22,  3.24it/s]

Batch 220, Loss: 6.8967, Perplexity: 989.0162


Training:  79%|███████▊  | 231/294 [01:09<00:18,  3.35it/s]

Batch 230, Loss: 6.8958, Perplexity: 988.1514


Training:  82%|████████▏ | 241/294 [01:12<00:15,  3.33it/s]

Batch 240, Loss: 6.8911, Perplexity: 983.5301


Training:  85%|████████▌ | 251/294 [01:15<00:12,  3.35it/s]

Batch 250, Loss: 6.8875, Perplexity: 979.9803


Training:  89%|████████▉ | 261/294 [01:18<00:10,  3.29it/s]

Batch 260, Loss: 6.8836, Perplexity: 976.1403


Training:  92%|█████████▏| 271/294 [01:21<00:07,  3.24it/s]

Batch 270, Loss: 6.8820, Perplexity: 974.5742


Training:  96%|█████████▌| 281/294 [01:24<00:03,  3.35it/s]

Batch 280, Loss: 6.8767, Perplexity: 969.4452


Training:  99%|█████████▉| 291/294 [01:27<00:00,  3.35it/s]

Batch 290, Loss: 6.8785, Perplexity: 971.1448


Training: 100%|██████████| 294/294 [01:28<00:00,  3.33it/s]


Epoch 2 Loss: 6.8764, Perplexity: 969.1632


Validating: 100%|██████████| 37/37 [00:04<00:00,  8.97it/s]


Validation Loss: 6.8380, Perplexity: 932.6127
Epoch 3/3


Training:   4%|▎         | 11/294 [00:03<01:26,  3.28it/s]

Batch 10, Loss: 6.8497, Perplexity: 943.6016


Training:   7%|▋         | 21/294 [00:06<01:21,  3.35it/s]

Batch 20, Loss: 6.7457, Perplexity: 850.4226


Training:  11%|█         | 31/294 [00:09<01:18,  3.35it/s]

Batch 30, Loss: 6.7564, Perplexity: 859.5635


Training:  14%|█▍        | 41/294 [00:12<01:15,  3.35it/s]

Batch 40, Loss: 6.7292, Perplexity: 836.4855


Training:  17%|█▋        | 51/294 [00:15<01:14,  3.24it/s]

Batch 50, Loss: 6.6958, Perplexity: 808.9949


Training:  21%|██        | 61/294 [00:18<01:09,  3.33it/s]

Batch 60, Loss: 6.6919, Perplexity: 805.8586


Training:  24%|██▍       | 71/294 [00:21<01:06,  3.34it/s]

Batch 70, Loss: 6.6942, Perplexity: 807.7361


Training:  28%|██▊       | 81/294 [00:24<01:03,  3.35it/s]

Batch 80, Loss: 6.7025, Perplexity: 814.4719


Training:  31%|███       | 91/294 [00:27<01:01,  3.32it/s]

Batch 90, Loss: 6.7109, Perplexity: 821.3111


Training:  34%|███▍      | 101/294 [00:30<00:59,  3.25it/s]

Batch 100, Loss: 6.7295, Perplexity: 836.7083


Training:  38%|███▊      | 111/294 [00:33<00:54,  3.34it/s]

Batch 110, Loss: 6.7407, Perplexity: 846.1346


Training:  41%|████      | 121/294 [00:36<00:51,  3.35it/s]

Batch 120, Loss: 6.7421, Perplexity: 847.3635


Training:  45%|████▍     | 131/294 [00:39<00:48,  3.34it/s]

Batch 130, Loss: 6.7459, Perplexity: 850.6008


Training:  48%|████▊     | 141/294 [00:42<00:46,  3.26it/s]

Batch 140, Loss: 6.7426, Perplexity: 847.7219


Training:  51%|█████▏    | 151/294 [00:45<00:43,  3.28it/s]

Batch 150, Loss: 6.7449, Perplexity: 849.7062


Training:  55%|█████▍    | 161/294 [00:48<00:39,  3.34it/s]

Batch 160, Loss: 6.7388, Perplexity: 844.5821


Training:  58%|█████▊    | 171/294 [00:51<00:36,  3.34it/s]

Batch 170, Loss: 6.7442, Perplexity: 849.1082


Training:  62%|██████▏   | 181/294 [00:54<00:33,  3.34it/s]

Batch 180, Loss: 6.7461, Perplexity: 850.7157


Training:  65%|██████▍   | 191/294 [00:57<00:31,  3.25it/s]

Batch 190, Loss: 6.7539, Perplexity: 857.4069


Training:  68%|██████▊   | 201/294 [01:00<00:27,  3.32it/s]

Batch 200, Loss: 6.7574, Perplexity: 860.4118


Training:  72%|███████▏  | 211/294 [01:03<00:25,  3.31it/s]

Batch 210, Loss: 6.7616, Perplexity: 863.9864


Training:  75%|███████▌  | 221/294 [01:06<00:21,  3.34it/s]

Batch 220, Loss: 6.7598, Perplexity: 862.4365


Training:  79%|███████▊  | 231/294 [01:09<00:19,  3.29it/s]

Batch 230, Loss: 6.7589, Perplexity: 861.6585


Training:  82%|████████▏ | 241/294 [01:12<00:16,  3.25it/s]

Batch 240, Loss: 6.7561, Perplexity: 859.2545


Training:  85%|████████▌ | 251/294 [01:15<00:13,  3.30it/s]

Batch 250, Loss: 6.7623, Perplexity: 864.6546


Training:  89%|████████▉ | 261/294 [01:18<00:10,  3.23it/s]

Batch 260, Loss: 6.7615, Perplexity: 863.9716


Training:  92%|█████████▏| 271/294 [01:21<00:06,  3.34it/s]

Batch 270, Loss: 6.7599, Perplexity: 862.5566


Training:  96%|█████████▌| 281/294 [01:24<00:04,  3.25it/s]

Batch 280, Loss: 6.7544, Perplexity: 857.8253


Training:  99%|█████████▉| 291/294 [01:27<00:00,  3.31it/s]

Batch 290, Loss: 6.7537, Perplexity: 857.2307


Training: 100%|██████████| 294/294 [01:28<00:00,  3.31it/s]


Epoch 3 Loss: 6.7525, Perplexity: 856.1725


Validating: 100%|██████████| 37/37 [00:04<00:00,  9.08it/s]


Validation Loss: 6.8202, Perplexity: 916.1976


Testing: 100%|██████████| 37/37 [00:04<00:00,  9.13it/s]

Test Loss: 6.8058, Perplexity: 903.0878





In [8]:
def clear_cuda_memory():
    """Release cached CUDA memory held by PyTorch; do nothing without CUDA.

    The notebook falls back to the CPU when no GPU is present, and
    `torch.cuda.ipc_collect()` initialises CUDA (raising if it is unavailable),
    so the calls are guarded to keep this cell runnable on CPU-only runtimes.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Example usage
clear_cuda_memory()
