### 라이브러리 임포트

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## 모델 정의

### embedding layer (동결)

In [2]:
# Load the tokenizer that belongs to the Llama-3 Korean Bllossom checkpoint.
model_name = "MLP-KTLim/llama-3-Korean-Bllossom-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the embedding layer around an uninitialised (128256, 4096) weight
# matrix; the original author notes the size is an example.
# freeze=True is the default of from_pretrained and is spelled out here so the
# "frozen" intent is visible at the call site.
embedding_layer = nn.Embedding.from_pretrained(
    embeddings=torch.empty((128256, 4096)),
    freeze=True,
)

# Thin nn.Module wrapper so the extracted embedding layer can be saved and
# loaded through the regular state_dict machinery.
class EmbeddingsOnlyModel(nn.Module):
    """Module that holds a single embedding layer and applies it to token ids.

    Args:
        embedding_layer: the embedding module to wrap. It is stored under the
            attribute name ``embeddings``.
    """

    def __init__(self, embedding_layer):
        super().__init__()
        self.embeddings = embedding_layer

    def forward(self, input_ids):
        """Return the wrapped embedding layer's output for ``input_ids``."""
        token_vectors = self.embeddings(input_ids)
        return token_vectors

# Instantiate the wrapper around the placeholder embedding layer.
embeddings_only_model = EmbeddingsOnlyModel(embedding_layer)

# Restore the saved embedding weights.
# - map_location="cpu" lets a checkpoint that was saved from a GPU load on a
#   machine without one.
# - weights_only=True restricts unpickling to tensors/plain containers, so a
#   tampered .pth file cannot execute arbitrary code while loading.
model_path = "embeddings_only_model.pth"
embeddings_state = torch.load(model_path, map_location="cpu", weights_only=True)
# Kept as the last expression so the cell still displays the key-matching report.
embeddings_only_model.load_state_dict(embeddings_state)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<All keys matched successfully>

In [3]:
# Freeze every parameter of the embedding wrapper so it receives no gradients.
for frozen_param in embeddings_only_model.parameters():
    frozen_param.requires_grad_(False)

### Transformer is what I need

In [4]:
# Transformer encoder block definition.
class TransformerEncoderLayer(nn.Module):
    """Wrapper around ``nn.TransformerEncoder`` built from one layer template.

    ``nn.TransformerEncoder`` deep-copies the layer it is given, so the
    template is kept as a local variable instead of being registered as a
    submodule. Registering it would add a second, never-used set of layer
    weights to ``parameters()`` and ``state_dict()``.

    NOTE(review): checkpoints saved from a version that registered the
    template contain extra ``encoder_layer.*`` keys; load those with
    ``strict=False``.

    Args:
        embed_dim: feature size of the input vectors (``d_model``).
        num_heads: number of attention heads; must divide ``embed_dim``.
        num_layers: number of stacked encoder layers (default 1).
    """

    def __init__(self, embed_dim, num_heads, num_layers=1):
        super().__init__()
        # Template only: nn.TransformerEncoder clones it num_layers times.
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x):
        """Encode ``x``.

        The layer is built with the default ``batch_first=False``, so ``x`` is
        expected as (seq_len, batch, embed_dim); the output has the same shape.
        """
        return self.transformer_encoder(x)

### 최종 모델

In [5]:
class SentenceClassificationModel(nn.Module):
    """Sentence classifier: embedding -> transformer encoder -> mean pool -> MLP.

    Args:
        embedding_layer: module mapping token ids to vectors of size ``embed_dim``.
        embed_dim: feature size of the embedding output.
        num_heads: number of attention heads used by the encoder.
        num_classes: size of the output logit vector.
    """

    def __init__(self, embedding_layer, embed_dim, num_heads, num_classes):
        super().__init__()
        self.embeddings = embedding_layer
        self.transformer_encoder = TransformerEncoderLayer(embed_dim, num_heads)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, num_classes)
        )

    def forward(self, input_ids, attention_mask=None):
        """Return class logits for a batch of token-id sequences.

        Args:
            input_ids: integer tensor, expected as (batch, seq_len).
            attention_mask: optional (batch, seq_len) tensor that is non-zero
                for real tokens and zero for padding, as produced by Hugging
                Face tokenizers. When omitted, every position is treated as a
                real token.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        x = self.embeddings(input_ids)
        x = x.transpose(0, 1)  # encoder expects (seq_len, batch, embed_dim)

        if attention_mask is None:
            x = self.transformer_encoder(x)
            x = x.mean(dim=0)  # average over positions -> sentence vector
        else:
            padding_mask = attention_mask == 0  # True where the token is padding
            # The wrapper's forward() takes no mask argument, so call the
            # nn.TransformerEncoder it holds directly to keep padded positions
            # out of self-attention.
            x = self.transformer_encoder.transformer_encoder(
                x, src_key_padding_mask=padding_mask
            )
            # Average over real tokens only, so padding does not dilute the
            # sentence vector.
            keep = (~padding_mask).transpose(0, 1).unsqueeze(-1).to(x.dtype)
            x = (x * keep).sum(dim=0) / keep.sum(dim=0).clamp(min=1.0)

        x = self.fc(x)
        return x

In [6]:
# Build the classifier on top of the pretrained, frozen embedding that was
# loaded into embeddings_only_model. Creating a fresh nn.Embedding here would
# silently replace those weights with random, trainable ones.
embedding_layer = embeddings_only_model.embeddings
embed_dim = embedding_layer.embedding_dim  # taken from the layer so the two cannot drift apart
num_heads = 8
num_classes = 500
model = SentenceClassificationModel(embedding_layer, embed_dim, num_heads, num_classes)

# Tokenize the input text with the tokenizer loaded at the top of the notebook;
# loading it a second time is unnecessary.
input_text = "안녕 얘들아!! 오랬만이야"
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Run one forward pass in evaluation mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    output = model(input_ids)

# Show the raw classification logits.
print("Classification output:", output)

Classification output: tensor([[ 5.3601e-02, -3.3348e-03, -1.5144e-02, -5.3594e-02,  5.3084e-03,
          4.0774e-02,  2.1909e-01,  1.2301e-01, -4.7107e-02, -2.0732e-02,
          1.8216e-01, -4.8974e-02, -1.5258e-01, -1.5633e-04,  6.4647e-02,
         -3.5073e-02, -1.6617e-02, -4.3771e-02, -7.9014e-04, -7.9645e-02,
         -4.5365e-03,  1.0553e-02, -3.7300e-02, -1.0119e-01, -1.5137e-02,
          1.8919e-02,  1.3214e-01,  3.0318e-02,  2.9091e-02,  1.2471e-01,
         -6.4203e-02,  3.6150e-02, -2.1405e-01,  8.0541e-02,  1.4735e-01,
         -2.3981e-03,  3.0605e-02, -7.1291e-02,  1.2022e-01,  3.5830e-02,
          3.3254e-02, -5.4950e-02,  6.1543e-02, -1.1629e-01, -2.6629e-02,
          2.9524e-02,  2.6034e-02, -4.8102e-02,  2.7468e-02, -6.6219e-02,
          3.8945e-02,  1.3010e-01,  1.4149e-01,  5.3079e-02,  6.9672e-02,
         -1.2209e-01, -3.7857e-02, -7.9546e-02,  1.5473e-01,  1.4894e-01,
         -3.4785e-03,  2.8395e-02,  1.3767e-01,  1.8539e-02,  1.7168e-02,
         -9.655