In [None]:
!pip install transformers torch pandas numpy scikit-learn tqdm rank-bm25 -q


In [None]:
import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a fused QKV projection.

    Args:
        d_model: Feature width of the input and output.
        n_head: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``n_head``.
    """

    def __init__(self, d_model, n_head):
        super().__init__()
        # Fail at construction time instead of with an opaque reshape error
        # during the first forward pass.
        if d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_head ({n_head})"
            )
        self.d_model = d_model
        self.n_head = n_head
        self.head_dim = d_model // n_head

        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.o = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention.

        Args:
            x: Tensor of shape (B, L, D) with D == d_model.
            mask: Optional tensor broadcastable to (B, n_head, L, L); positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape (B, L, D).
        """
        B, L, D = x.shape
        qkv = self.qkv(x).reshape(B, L, 3, self.n_head, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, n_head, L, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            # Use the most negative finite value rather than -inf: masked
            # positions still get zero weight, but a row whose keys are all
            # masked yields a finite (uniform) softmax instead of NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, v)
        # (B, n_head, L, head_dim) -> (B, L, n_head * head_dim)
        out = out.transpose(1, 2).reshape(B, L, D)
        out = self.o(out)

        return out

class TransformerBlock(nn.Module):
    """Pre-LayerNorm encoder block: self-attention then a GELU feed-forward net.

    Each sub-layer is applied to the layer-normalised input and added back to
    the residual stream.

    Args:
        d_model: Feature width.
        n_head: Number of attention heads.
        dropout: Dropout probability used inside the feed-forward network.
            Defaults to 0.1, the value that was previously hard-coded.
    """

    def __init__(self, d_model, n_head, dropout=0.1):
        super().__init__()
        self.sa = SelfAttention(d_model, n_head)
        self.ln1 = nn.LayerNorm(d_model)
        # Keep the layer order: the Sequential indices (ff.0, ff.3) are part of
        # the state_dict keys used by saved checkpoints.
        self.ff = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout)
        )
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        x = x + self.sa(self.ln1(x), mask)
        x = x + self.ff(self.ln2(x))
        return x

class Encoder(nn.Module):
    """Token + learned-position embedding followed by a stack of Transformer blocks.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Feature width.
        max_len: Number of rows in the positional embedding table, i.e. the
            longest sequence the encoder accepts.
        n_head: Attention heads per block.
        n_layer: Number of Transformer blocks.
    """

    def __init__(self, vocab_size, d_model, max_len, n_head, n_layer):
        super().__init__()
        self.tok = nn.Embedding(vocab_size, d_model)
        self.pos = nn.Embedding(max_len, d_model)
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, n_head) for _ in range(n_layer)
        ])
        self.ln = nn.LayerNorm(d_model)
        self.d_model = d_model

    def forward(self, input_ids, attention_mask=None):
        """Encode a batch of token ids.

        Args:
            input_ids: Integer tensor of shape (B, L).
            attention_mask: Optional (B, L) tensor; entries equal to 0 mark
                positions that must not be attended to.

        Returns:
            Tensor of shape (B, L, d_model) after the final LayerNorm.

        Raises:
            IndexError: If L exceeds the size of the positional embedding table.
        """
        B, L = input_ids.shape

        # Guard explicitly: an out-of-range position index otherwise surfaces as
        # an opaque embedding error (a device-side assert on CUDA).
        max_len = self.pos.num_embeddings
        if L > max_len:
            raise IndexError(
                f"sequence length {L} exceeds max_len={max_len} of the positional embedding"
            )

        tok_emb = self.tok(input_ids)
        pos_ids = torch.arange(L, device=input_ids.device).unsqueeze(0).expand(B, -1)
        pos_emb = self.pos(pos_ids)

        x = tok_emb + pos_emb

        # Reshape the padding mask to (B, 1, 1, L) so it broadcasts over heads
        # and query positions inside the attention layers.
        if attention_mask is not None:
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        for block in self.blocks:
            x = block(x, attention_mask)

        x = self.ln(x)
        return x

class CustomModelForSequenceClassification(nn.Module):
    """Encoder with a linear classification head on the first token position.

    Args:
        vocab_size, d_model, max_len, n_head, n_layer: Forwarded to ``Encoder``.
        num_labels: Number of output classes (default 2).
    """

    def __init__(self, vocab_size, d_model, max_len, n_head, n_layer, num_labels=2):
        super().__init__()
        self.enc = Encoder(vocab_size, d_model, max_len, n_head, n_layer)
        self.classifier = nn.Linear(d_model, num_labels)
        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Classify a batch of sequences.

        Args:
            input_ids: Integer tensor of shape (B, L).
            attention_mask: Optional (B, L) mask passed to the encoder.
            labels: Optional class-index tensor of shape (B,). When given, the
                cross-entropy loss is computed as well.

        Returns:
            An object with a ``logits`` attribute of shape (B, num_labels) and,
            only when ``labels`` was provided, a ``loss`` attribute.
        """
        # Local import keeps this class self-contained within its cell.
        from types import SimpleNamespace

        # Encoder output: (B, L, d_model)
        encoder_output = self.enc(input_ids, attention_mask)

        # Pool by taking the first position ([CLS]-style)
        cls_output = encoder_output[:, 0, :]

        # Classification
        logits = self.classifier(cls_output)

        # Hugging Face-style output. A plain namespace avoids building a brand
        # new class object with type(...) on every forward pass.
        output = SimpleNamespace(logits=logits)
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            output.loss = loss_fct(logits, labels)
        return output

print("✅ 커스텀 모델 클래스 정의 완료!")

✅ 커스텀 모델 클래스 정의 완료!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
import datetime
import random
from sklearn.model_selection import train_test_split

# Seed setup for reproducibility
def set_seed(seed_value=42):
    """Seed every RNG used by the notebook and make cuDNN deterministic.

    Args:
        seed_value: Seed applied to ``random``, NumPy and PyTorch (CPU and all
            CUDA devices).
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Seeding alone does not make GPU runs repeatable: cuDNN may autotune and
    # pick non-deterministic kernels unless these flags are set.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)
print("✅ 라이브러리 임포트 완료!")

Mounted at /content/drive
✅ 라이브러리 임포트 완료!


In [None]:
import zipfile
import os

# Extract open.zip from Google Drive into the local runtime
zip_path = "/content/drive/MyDrive/Encoder-only Transformer/open.zip"
extract_path = "/content/data"

# Unpack every archive member under extract_path
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Verify the extraction by listing the target directory
!ls -la /content/data

print("✅ 데이터 압축 해제 완료!")

total 158776
drwxr-xr-x   3 root root      4096 Oct 20 07:28 .
drwxr-xr-x   1 root root      4096 Oct 20 07:28 ..
drwxr-xr-x 302 root root     12288 Oct 20 07:28 code
-rw-r--r--   1 root root   1685911 Oct 20 07:28 sample_submission.csv
-rw-r--r--   1 root root  14808195 Oct 20 07:28 sample_train.csv
-rw-r--r--   1 root root 146065471 Oct 20 07:28 test.csv
✅ 데이터 압축 해제 완료!


In [None]:
from tokenizers import Tokenizer

# Load the custom BPE tokenizer
tokenizer_path = "/content/drive/MyDrive/Encoder-only Transformer/tokenizer_py_bpe.json"
tokenizer = Tokenizer.from_file(tokenizer_path)

# Special token IDs
# NOTE(review): presumably token_to_id returns None for a token that is not in
# the vocabulary — confirm, and check the printed values before using them.
PAD = tokenizer.token_to_id("<pad>")
BOS = tokenizer.token_to_id("<bos>")
EOS = tokenizer.token_to_id("<eos>")
UNK = tokenizer.token_to_id("<unk>")

print(f"✅ 커스텀 BPE 토크나이저 로드 완료!")
print(f"✅ Vocab size: {tokenizer.get_vocab_size()}")
print(f"✅ PAD={PAD}, BOS={BOS}, EOS={EOS}, UNK={UNK}")

# Load the pretrained checkpoint file (onto CPU first)
model_path = "/content/drive/MyDrive/Encoder-only Transformer/pretrain_best_ep2_sh_pairs_clean_004.pt"
checkpoint = torch.load(model_path, map_location='cpu')

print("\n📦 체크포인트 키:", list(checkpoint.keys()))
print("\n🧩 cfg 내용:")
# The checkpoint stores its hyperparameters under the 'cfg' key
cfg = checkpoint['cfg']
print(cfg)

# Build a model instance from the checkpoint's hyperparameters
vocab_size = tokenizer.get_vocab_size()
model = CustomModelForSequenceClassification(
    vocab_size=vocab_size,
    d_model=cfg['D_MODEL'],
    max_len=cfg['MAX_LEN'],
    n_head=cfg['N_HEAD'],
    n_layer=cfg['N_LAYER'],
    num_labels=2
)

print(f"\n✅ 모델 생성 완료!")
print(f"✅ Model config: D_MODEL={cfg['D_MODEL']}, MAX_LEN={cfg['MAX_LEN']}, N_HEAD={cfg['N_HEAD']}, N_LAYER={cfg['N_LAYER']}")

✅ 커스텀 BPE 토크나이저 로드 완료!
✅ Vocab size: 12406
✅ PAD=0, BOS=2, EOS=3, UNK=1

📦 체크포인트 키: ['model', 'logit_scale', 'cfg']

🧩 cfg 내용:
{'MAX_LEN': 384, 'D_MODEL': 384, 'N_LAYER': 4, 'N_HEAD': 6}

✅ 모델 생성 완료!
✅ Model config: D_MODEL=384, MAX_LEN=384, N_HEAD=6, N_LAYER=4


In [None]:
# Load the pretrained encoder weights
pretrained_state = checkpoint['model']

# Determine the max_len the pretrained model was actually built with:
# it is the number of rows of the positional embedding weight.
pos_weight_shape = pretrained_state['enc.pos.weight'].shape
actual_max_len = pos_weight_shape[0]

print(f"⚠️  cfg의 MAX_LEN: {cfg['MAX_LEN']}")
print(f"✅ 실제 pos.weight의 MAX_LEN: {actual_max_len}")
print(f"→ 모델을 실제 MAX_LEN={actual_max_len}로 재생성합니다.\n")

# Re-create the model with the actual max_len so the positional table shape
# matches the checkpoint
vocab_size = tokenizer.get_vocab_size()
model = CustomModelForSequenceClassification(
    vocab_size=vocab_size,
    d_model=cfg['D_MODEL'],
    max_len=actual_max_len,  # use the actual length
    n_head=cfg['N_HEAD'],
    n_layer=cfg['N_LAYER'],
    num_labels=2
)

# Collect the encoder weights
# NOTE(review): model_state is assigned but not used anywhere in this cell.
model_state = model.state_dict()
encoder_state = {}

# Keep only the entries whose key starts with 'enc.'
for key, value in pretrained_state.items():
    if key.startswith('enc.'):
        encoder_state[key] = value

# Load the weights; strict=False tolerates keys absent from encoder_state
# (reported below as missing keys)
missing_keys, unexpected_keys = model.load_state_dict(encoder_state, strict=False)

print("✅ 사전학습된 Encoder 가중치 로드 완료!")
print(f"✅ 로드된 파라미터: {len(encoder_state)} keys")
print(f"✅ Missing keys (새로 학습될 부분): {missing_keys}")
print(f"✅ Unexpected keys: {unexpected_keys}")

⚠️  cfg의 MAX_LEN: 384
✅ 실제 pos.weight의 MAX_LEN: 2048
→ 모델을 실제 MAX_LEN=2048로 재생성합니다.

✅ 사전학습된 Encoder 가중치 로드 완료!
✅ 로드된 파라미터: 52 keys
✅ Missing keys (새로 학습될 부분): ['classifier.weight', 'classifier.bias']
✅ Unexpected keys: []


In [None]:
def format_time(elapsed):
    """Render a duration in seconds as an ``H:MM:SS`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: Array of per-class scores, classes along axis 1.
        labels: NumPy array of integer class labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = (predicted == expected).sum()
    return n_correct / len(expected)

print("✅ 유틸리티 함수 정의 완료!")

✅ 유틸리티 함수 정의 완료!


In [None]:
def _truncate_pair_ids(ids, sequence_ids, max_len):
    """Shrink a pair encoding to at most ``max_len`` ids, trimming the longer snippet first.

    ``sequence_ids`` holds, per token, 0 or 1 for the snippet it belongs to and
    None for special tokens. Special tokens are always kept; each snippet keeps
    its leading tokens.
    """
    n_first = sum(1 for s in sequence_ids if s == 0)
    n_second = sum(1 for s in sequence_ids if s == 1)
    n_special = sum(1 for s in sequence_ids if s is None)

    budget = max_len - n_special
    if budget < 0:
        # Not even the special tokens fit; fall back to plain head truncation.
        return ids[:max_len]

    half = budget // 2
    if n_first <= half:
        quota = [n_first, budget - n_first]
    elif n_second <= half:
        quota = [budget - n_second, n_second]
    else:
        quota = [budget - half, half]

    kept, used = [], [0, 0]
    for token_id, seq in zip(ids, sequence_ids):
        if seq is None:
            kept.append(token_id)
        elif seq in (0, 1) and used[seq] < quota[seq]:
            kept.append(token_id)
            used[seq] += 1
    return kept


def tokenize_code_pair(code1, code2, tokenizer, max_len=384, pad_id=0):
    """Encode two code snippets as one pair and pad/truncate to ``max_len``.

    The pair is encoded with a single ``tokenizer.encode(code1, code2)`` call.
    When the result is longer than ``max_len`` and the encoding exposes
    per-token ``sequence_ids``, both snippets are truncated in balance so the
    second snippet is not cut off entirely by a long first snippet. Without
    ``sequence_ids`` the ids are simply cut at ``max_len``.

    Args:
        code1: First code snippet.
        code2: Second code snippet.
        tokenizer: Object whose ``encode(a, b)`` returns an encoding with an
            ``ids`` list (and optionally ``sequence_ids``).
        max_len: Fixed output length.
        pad_id: Token id used for right-padding.

    Returns:
        (ids, attention_mask): two lists of length ``max_len``; the mask is 1
        for real tokens and 0 for padding.
    """
    # Encode both snippets as a pair
    encoding = tokenizer.encode(code1, code2)
    ids = list(encoding.ids)

    # Length limit
    if len(ids) > max_len:
        sequence_ids = getattr(encoding, "sequence_ids", None)
        if sequence_ids is not None and len(sequence_ids) == len(ids):
            ids = _truncate_pair_ids(ids, list(sequence_ids), max_len)
        # Safety net (and the fallback when sequence ids are unavailable)
        ids = ids[:max_len]

    # Right-pad to the fixed length
    n_tokens = len(ids)
    padding_length = max_len - n_tokens
    ids = ids + [pad_id] * padding_length
    attention_mask = [1] * n_tokens + [0] * padding_length

    return ids, attention_mask

print("✅ 토크나이징 함수 정의 완료!")

✅ 토크나이징 함수 정의 완료!


In [None]:
# Load the training data
data_path = "/content/data"
train_df = pd.read_csv(f"{data_path}/sample_train.csv")

print(f"Train data shape: {train_df.shape}")
print(train_df.head())
# 'similar' is the label column used for stratification and training below
print(f"\nLabel distribution:\n{train_df['similar'].value_counts()}")

# Train/validation split (80:20), stratified on the label with a fixed seed
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['similar']
)

print(f"\n✅ Train size: {len(train_data)}")
print(f"✅ Validation size: {len(val_data)}")

Train data shape: (17970, 3)
                                               code1  \
0  flag = "go"\ncnt = 0\nwhile flag == "go":\n   ...   
1   b, c = map(int, input().split())\n\nprint(b * c)   
2  import numpy as np\nimport sys\nread = sys.std...   
3   b, c = map(int, input().split())\n\nprint(b * c)   
4  s=input()\nt=input()\nans=0\nfor i in range(le...   

                                               code2  similar  
0  # Python 3+\n#--------------------------------...        1  
1  import numpy as np\n\nn = int(input())\na = np...        0  
2  N, M = map(int, input().split())\nif M%2 != 0:...        0  
3  n,m=map(int,input().split())\nh=list(map(int,i...        0  
4  import math\na,b,h,m=map(int,input().split())\...        0  

Label distribution:
similar
1    9005
0    8965
Name: count, dtype: int64

✅ Train size: 14376
✅ Validation size: 3594


In [None]:
MAX_LEN = 384  # sequence length used for fine-tuning (2048 is also possible, but kept small for memory)

print("Train 데이터 토크나이징 중...")

# Per-example id lists and attention masks, converted to tensors afterwards
train_input_ids = []
train_attention_masks = []

# Encode every (code1, code2) row to fixed-length ids plus an attention mask
for idx in tqdm(range(len(train_data)), desc="Tokenizing Train"):
    row = train_data.iloc[idx]
    ids, mask = tokenize_code_pair(
        str(row['code1']),
        str(row['code2']),
        tokenizer,
        max_len=MAX_LEN,
        pad_id=PAD
    )
    train_input_ids.append(ids)
    train_attention_masks.append(mask)

# Stack into long tensors; labels come from the 'similar' column
train_input_ids = torch.tensor(train_input_ids, dtype=torch.long)
train_attention_masks = torch.tensor(train_attention_masks, dtype=torch.long)
train_labels = torch.tensor(train_data['similar'].values, dtype=torch.long)

print(f"✅ Train input_ids shape: {train_input_ids.shape}")
print(f"✅ Train attention_masks shape: {train_attention_masks.shape}")
print(f"✅ Train labels shape: {train_labels.shape}")

Train 데이터 토크나이징 중...


Tokenizing Train:   0%|          | 0/14376 [00:00<?, ?it/s]

✅ Train input_ids shape: torch.Size([14376, 384])
✅ Train attention_masks shape: torch.Size([14376, 384])
✅ Train labels shape: torch.Size([14376])


In [None]:
print("Validation 데이터 토크나이징 중...")

# Per-example id lists and attention masks, converted to tensors afterwards
val_input_ids = []
val_attention_masks = []

# Same encoding as for the training split (same MAX_LEN and pad id)
for idx in tqdm(range(len(val_data)), desc="Tokenizing Validation"):
    row = val_data.iloc[idx]
    ids, mask = tokenize_code_pair(
        str(row['code1']),
        str(row['code2']),
        tokenizer,
        max_len=MAX_LEN,
        pad_id=PAD
    )
    val_input_ids.append(ids)
    val_attention_masks.append(mask)

# Stack into long tensors; labels come from the 'similar' column
val_input_ids = torch.tensor(val_input_ids, dtype=torch.long)
val_attention_masks = torch.tensor(val_attention_masks, dtype=torch.long)
val_labels = torch.tensor(val_data['similar'].values, dtype=torch.long)

print(f"✅ Validation input_ids shape: {val_input_ids.shape}")
print(f"✅ Validation attention_masks shape: {val_attention_masks.shape}")
print(f"✅ Validation labels shape: {val_labels.shape}")

Validation 데이터 토크나이징 중...


Tokenizing Validation:   0%|          | 0/3594 [00:00<?, ?it/s]

✅ Validation input_ids shape: torch.Size([3594, 384])
✅ Validation attention_masks shape: torch.Size([3594, 384])
✅ Validation labels shape: torch.Size([3594])


In [None]:
BATCH_SIZE = 16  # adjust to fit the available GPU memory

# Train DataLoader: batches are drawn in random order
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation DataLoader: batches are drawn in dataset order
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=BATCH_SIZE)

print(f"✅ Train batches: {len(train_dataloader)}")
print(f"✅ Validation batches: {len(val_dataloader)}")

✅ Train batches: 899
✅ Validation batches: 225


In [None]:
from transformers import get_linear_schedule_with_warmup

EPOCHS = 3
LEARNING_RATE = 2e-5

# Device setup: use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer over all model parameters (encoder and classifier head)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)

# Scheduler: one step per training batch, no warmup steps
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

print(f"✅ Device: {device}")
print(f"✅ Total training steps: {total_steps}")
print(f"✅ Learning rate: {LEARNING_RATE}")

✅ Device: cuda
✅ Total training steps: 2697
✅ Learning rate: 2e-05


In [None]:
# Loss function
# NOTE: not used by the loop below — the model computes its own cross-entropy
# loss when `labels` is passed. Kept so the name stays defined for other cells.
loss_fn = nn.CrossEntropyLoss()

# Per-epoch metric history
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

print("=" * 50)
print("파인튜닝 시작!")
print("=" * 50)

for epoch in range(EPOCHS):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print(f"{'='*50}")

    # ========== Training ==========
    print("\n[Training]")
    t0 = time.time()
    model.train()

    # Accumulate per-example sums (not per-batch means) so that a smaller
    # final batch does not skew the epoch averages.
    total_train_loss = 0.0
    total_train_correct = 0
    total_train_seen = 0

    for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        # Forward pass
        outputs = model(
            b_input_ids,
            attention_mask=b_attention_mask,
            labels=b_labels
        )

        loss = outputs.loss
        logits = outputs.logits

        # The loss is a batch mean; weight it by the batch size
        n_examples = b_labels.size(0)
        total_train_loss += loss.item() * n_examples
        total_train_correct += (logits.detach().argmax(dim=1) == b_labels).sum().item()
        total_train_seen += n_examples

        # Backward pass with gradient-norm clipping at 1.0
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / total_train_seen
    avg_train_acc = total_train_correct / total_train_seen

    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_acc)

    print(f"  Average training loss: {avg_train_loss:.4f}")
    print(f"  Average training accuracy: {avg_train_acc:.4f}")
    print(f"  Training time: {format_time(time.time() - t0)}")

    # ========== Validation ==========
    print("\n[Validation]")
    t0 = time.time()
    model.eval()

    total_val_loss = 0.0
    total_val_correct = 0
    total_val_seen = 0

    # No gradients are needed anywhere in the validation pass
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validating"):
            b_input_ids = batch[0].to(device)
            b_attention_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(
                b_input_ids,
                attention_mask=b_attention_mask,
                labels=b_labels
            )

            loss = outputs.loss
            logits = outputs.logits

            n_examples = b_labels.size(0)
            total_val_loss += loss.item() * n_examples
            total_val_correct += (logits.argmax(dim=1) == b_labels).sum().item()
            total_val_seen += n_examples

    avg_val_loss = total_val_loss / total_val_seen
    avg_val_acc = total_val_correct / total_val_seen

    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_acc)

    print(f"  Average validation loss: {avg_val_loss:.4f}")
    print(f"  Average validation accuracy: {avg_val_acc:.4f}")
    print(f"  Validation time: {format_time(time.time() - t0)}")

    # Save a checkpoint after every epoch
    save_path = f"/content/drive/MyDrive/Encoder-only Transformer/finetuned_custom_epoch{epoch+1}.pt"
    torch.save(model.state_dict(), save_path)
    print(f"  ✅ Model saved: {save_path}")

print("\n" + "="*50)
print("파인튜닝 완료!")
print("="*50)

파인튜닝 시작!

Epoch 1/3

[Training]


Training:   0%|          | 0/899 [00:00<?, ?it/s]

  Average training loss: 0.6404
  Average training accuracy: 0.6283
  Training time: 0:01:04

[Validation]


Validating:   0%|          | 0/225 [00:00<?, ?it/s]

  Average validation loss: 0.5844
  Average validation accuracy: 0.6947
  Validation time: 0:00:05
  ✅ Model saved: /content/drive/MyDrive/Encoder-only Transformer/finetuned_custom_epoch1.pt

Epoch 2/3

[Training]


Training:   0%|          | 0/899 [00:00<?, ?it/s]

  Average training loss: 0.5396
  Average training accuracy: 0.7339
  Training time: 0:01:04

[Validation]


Validating:   0%|          | 0/225 [00:00<?, ?it/s]

  Average validation loss: 0.5407
  Average validation accuracy: 0.7390
  Validation time: 0:00:05
  ✅ Model saved: /content/drive/MyDrive/Encoder-only Transformer/finetuned_custom_epoch2.pt

Epoch 3/3

[Training]


Training:   0%|          | 0/899 [00:00<?, ?it/s]

  Average training loss: 0.4798
  Average training accuracy: 0.7755
  Training time: 0:01:06

[Validation]


Validating:   0%|          | 0/225 [00:00<?, ?it/s]

  Average validation loss: 0.5230
  Average validation accuracy: 0.7509
  Validation time: 0:00:06
  ✅ Model saved: /content/drive/MyDrive/Encoder-only Transformer/finetuned_custom_epoch3.pt

파인튜닝 완료!


In [None]:
def predict_similarity_custom(code1, code2, model, tokenizer, device, max_len=384, pad_id=0):
    """
    Predict whether two code snippets are similar, using the custom BPE model.

    Preprocessing is delegated to ``tokenize_code_pair`` so inference always
    sees inputs built exactly like the training data. The model is switched to
    eval mode as a side effect.

    Args:
        code1, code2: The two code snippets to compare.
        model: Classifier returning an object with a ``logits`` attribute.
        tokenizer: Tokenizer passed through to ``tokenize_code_pair``.
        device: Torch device on which the input tensors are created.
        max_len: Fixed sequence length after truncation/padding.
        pad_id: Token id used for padding.

    Returns:
        prediction (int): 0 (different problem) or 1 (same problem)
        confidence (float): softmax probability of the predicted class (0~1)
    """
    model.eval()

    # Tokenize, truncate and pad with the shared helper
    ids, attention_mask = tokenize_code_pair(
        code1, code2, tokenizer, max_len=max_len, pad_id=pad_id
    )

    # Add a batch dimension of 1 and create the tensors directly on the device
    input_ids = torch.tensor([ids], dtype=torch.long, device=device)
    attention_mask = torch.tensor([attention_mask], dtype=torch.long, device=device)

    # Predict
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
        confidence = probs[0][prediction].item()

    return prediction, confidence

print("✅ 추론 함수 정의 완료!")

✅ 추론 함수 정의 완료!


In [None]:
# Load the epoch-3 checkpoint (the best-performing model of the run)
best_model_path = "/content/drive/MyDrive/Encoder-only Transformer/finetuned_custom_epoch3.pt"

# map_location remaps the saved tensors onto the current device, so a
# checkpoint written from a CUDA model also loads on a CPU-only runtime.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.to(device)
model.eval()

print(f"✅ 베스트 모델 로드 완료: {best_model_path}")

✅ 베스트 모델 로드 완료: /content/drive/MyDrive/Encoder-only Transformer/finetuned_custom_epoch3.pt


In [None]:
# 예시 1: BST 구현 - 같은 문제, 다른 스타일
code1_similar = """
class TreeNode:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

class BinarySearchTree:
    def __init__(self):
        self.root = None

    def insert(self, value):
        if not self.root:
            self.root = TreeNode(value)
        else:
            self._insert_recursive(self.root, value)

    def _insert_recursive(self, node, value):
        if value < node.value:
            if node.left is None:
                node.left = TreeNode(value)
            else:
                self._insert_recursive(node.left, value)
        else:
            if node.right is None:
                node.right = TreeNode(value)
            else:
                self._insert_recursive(node.right, value)

    def search(self, value):
        return self._search_recursive(self.root, value)

    def _search_recursive(self, node, value):
        if node is None:
            return False
        if node.value == value:
            return True
        elif value < node.value:
            return self._search_recursive(node.left, value)
        else:
            return self._search_recursive(node.right, value)

bst = BinarySearchTree()
elements = [50, 30, 70, 20, 40, 60, 80]
for elem in elements:
    bst.insert(elem)
print(bst.search(40))
"""

code2_similar = """
class Node:
    def __init__(self, data):
        self.data = data
        self.left_child = None
        self.right_child = None

class BST:
    def __init__(self):
        self.root_node = None

    def add_node(self, data):
        new_node = Node(data)
        if self.root_node is None:
            self.root_node = new_node
            return

        current = self.root_node
        while True:
            if data < current.data:
                if current.left_child is None:
                    current.left_child = new_node
                    break
                current = current.left_child
            else:
                if current.right_child is None:
                    current.right_child = new_node
                    break
                current = current.right_child

    def find_node(self, target):
        current = self.root_node
        while current is not None:
            if current.data == target:
                return True
            elif target < current.data:
                current = current.left_child
            else:
                current = current.right_child
        return False

tree = BST()
values = [50, 30, 70, 20, 40, 60, 80]
for val in values:
    tree.add_node(val)
print(tree.find_node(40))
"""

# Run inference on the two BST implementations defined above.
# NOTE(review): max_len is hard-coded to 384 here; keep it equal to the
# MAX_LEN used when tokenizing the fine-tuning data.
prediction, confidence = predict_similarity_custom(
    code1_similar, code2_similar, model, tokenizer, device, max_len=384, pad_id=PAD
)

# Report: header, the first 150 characters of each snippet, then the
# predicted label (1 = similar) and its confidence
print("=" * 80)
print("예시 1: 같은 문제 - 이진 탐색 트리(BST) 구현")
print("=" * 80)
print(f"Code 1 (재귀 방식):\n{code1_similar[:150]}...\n")
print(f"Code 2 (반복 방식):\n{code2_similar[:150]}...\n")
print(f"{'='*80}")
print(f"예측 결과: {'✅ 같은 문제 (Similar)' if prediction == 1 else '❌ 다른 문제 (Different)'}")
print(f"확신도: {confidence:.2%}")
print(f"{'='*80}")

예시 1: 같은 문제 - 이진 탐색 트리(BST) 구현
Code 1 (재귀 방식):

class TreeNode:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

class BinarySearchTree:...

Code 2 (반복 방식):

class Node:
    def __init__(self, data):
        self.data = data
        self.left_child = None
        self.right_child = None

class BST:
    def...

예측 결과: ✅ 같은 문제 (Similar)
확신도: 88.73%


In [None]:
# 예시 2: 완전히 다른 도메인
code1_different = """
import requests
from bs4 import BeautifulSoup

class WebScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def fetch_page(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

    def parse_articles(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        articles = []
        for article in soup.find_all('article'):
            title = article.find('h2').text
            content = article.find('div', class_='content').text
            articles.append({'title': title, 'content': content})
        return articles

scraper = WebScraper('https://example.com')
html = scraper.fetch_page('https://example.com/page1')
articles = scraper.parse_articles(html)
"""

code2_different = """
import numpy as np
from PIL import Image

class ImageProcessor:
    def __init__(self, image_path):
        self.image = Image.open(image_path)
        self.array = np.array(self.image)

    def apply_grayscale(self):
        weights = np.array([0.299, 0.587, 0.114])
        gray = np.dot(self.array[...,:3], weights)
        return gray.astype(np.uint8)

    def detect_edges(self, threshold=100):
        gray = self.apply_grayscale()
        sobel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
        gradient = np.zeros_like(gray)
        for i in range(1, gray.shape[0]-1):
            for j in range(1, gray.shape[1]-1):
                region = gray[i-1:i+2, j-1:j+2]
                gradient[i,j] = np.sum(region * sobel_x)
        return gradient

processor = ImageProcessor('input.jpg')
edges = processor.detect_edges()
"""

# Run inference on the two unrelated snippets defined above (web scraper vs.
# image processor).
# NOTE(review): max_len is hard-coded to 384 here; keep it equal to the
# MAX_LEN used when tokenizing the fine-tuning data.
prediction, confidence = predict_similarity_custom(
    code1_different, code2_different, model, tokenizer, device, max_len=384, pad_id=PAD
)

# Report: header, the first 150 characters of each snippet, then the
# predicted label (1 = similar) and its confidence
print("=" * 80)
print("예시 2: 다른 문제 - 웹 크롤링 vs 이미지 처리")
print("=" * 80)
print(f"Code 1 (웹 크롤링):\n{code1_different[:150]}...\n")
print(f"Code 2 (이미지 처리):\n{code2_different[:150]}...\n")
print(f"{'='*80}")
print(f"예측 결과: {'✅ 같은 문제 (Similar)' if prediction == 1 else '❌ 다른 문제 (Different)'}")
print(f"확신도: {confidence:.2%}")
print(f"{'='*80}")

예시 2: 다른 문제 - 웹 크롤링 vs 이미지 처리
Code 1 (웹 크롤링):

import requests
from bs4 import BeautifulSoup

class WebScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.head...

Code 2 (이미지 처리):

import numpy as np
from PIL import Image

class ImageProcessor:
    def __init__(self, image_path):
        self.image = Image.open(image_path)
     ...

예측 결과: ✅ 같은 문제 (Similar)
확신도: 80.12%
