In [1]:
!pip install transformers torch pandas numpy scikit-learn tqdm rank-bm25 -q

In [2]:
# Mount Google Drive so the files under /content/drive/MyDrive used below are reachable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import zipfile
import os

# Unpack the data archive stored on Drive into the local Colab filesystem
zip_path = "/content/drive/MyDrive/RoBERTa-small/open.zip"
extract_path = "/content/model_files"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List the extracted files to check the resulting layout
!ls -la /content/model_files

total 158776
drwxr-xr-x   3 root root      4096 Oct 20 06:10 .
drwxr-xr-x   1 root root      4096 Oct 20 06:10 ..
drwxr-xr-x 302 root root     12288 Oct 20 06:10 code
-rw-r--r--   1 root root   1685911 Oct 20 06:10 sample_submission.csv
-rw-r--r--   1 root root  14808195 Oct 20 06:10 sample_train.csv
-rw-r--r--   1 root root 146065471 Oct 20 06:10 test.csv


In [4]:
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW  # torch에서 직접 임포트
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
import datetime
import random

# Seed every random number generator used in this notebook for reproducibility
def set_seed(seed_value=42):
    """Seed Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices).

    Args:
        seed_value: Seed passed unchanged to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Same order as seeding each library by hand: random, numpy, torch, torch.cuda
    for seed_fn in seeders:
        seed_fn(seed_value)

set_seed(42)
print("라이브러리 임포트 완료!")

라이브러리 임포트 완료!


In [5]:
# Directory on Drive holding the pretrained checkpoint and tokenizer files
model_dir = "/content/drive/MyDrive/RoBERTa-small"

# Load the tokenizer; when truncating, drop tokens from the left end instead of the right.
# NOTE(review): the load warning reports the checkpoint's tokenizer class as
# 'PreTrainedTokenizerFast' — confirm RobertaTokenizerFast tokenizes it as intended.
tokenizer = RobertaTokenizerFast.from_pretrained(model_dir)
tokenizer.truncation_side = "left"

# Load the model with a 2-label sequence-classification head.
# NOTE(review): ignore_mismatched_sizes=True re-initializes any weight whose shape differs
# from the checkpoint instead of raising — confirm only the classifier head is affected.
model = RobertaForSequenceClassification.from_pretrained(
    model_dir,
    num_labels=2,
    ignore_mismatched_sizes=True
)

# Move the model to the GPU when one is available, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"✅ Model loaded successfully!")
print(f"✅ Device: {device}")
print(f"✅ Tokenizer vocab size: {len(tokenizer)}")
print(f"✅ Model parameters: {sum(p.numel() for p in model.parameters()):,}")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'RobertaTokenizerFast'.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/RoBERTa-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded successfully!
✅ Device: cuda
✅ Tokenizer vocab size: 32000
✅ Model parameters: 35,826,690


In [6]:
# Folder the archive was extracted into
data_path = "/content/model_files"

# Load the training and test CSV files
train_df = pd.read_csv(f"{data_path}/sample_train.csv")
test_df = pd.read_csv(f"{data_path}/test.csv")

# Quick sanity check: shapes, first rows, and balance of the 'similar' label column
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print("\n=== Train Data Sample ===")
print(train_df.head())
print(f"\n=== Label Distribution ===")
print(train_df['similar'].value_counts())

Train data shape: (17970, 3)
Test data shape: (179700, 3)

=== Train Data Sample ===
                                               code1  \
0  flag = "go"\ncnt = 0\nwhile flag == "go":\n   ...   
1   b, c = map(int, input().split())\n\nprint(b * c)   
2  import numpy as np\nimport sys\nread = sys.std...   
3   b, c = map(int, input().split())\n\nprint(b * c)   
4  s=input()\nt=input()\nans=0\nfor i in range(le...   

                                               code2  similar  
0  # Python 3+\n#--------------------------------...        1  
1  import numpy as np\n\nn = int(input())\na = np...        0  
2  N, M = map(int, input().split())\nif M%2 != 0:...        0  
3  n,m=map(int,input().split())\nh=list(map(int,i...        0  
4  import math\na,b,h,m=map(int,input().split())\...        0  

=== Label Distribution ===
similar
1    9005
0    8965
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split

# Train/validation split (80:20) with a fixed random_state so the split is repeatable
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['similar']  # keep the label ratio the same in both splits
)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"\nTrain label distribution:\n{train_data['similar'].value_counts()}")
print(f"\nValidation label distribution:\n{val_data['similar'].value_counts()}")

Train size: 14376
Validation size: 3594

Train label distribution:
similar
1    7204
0    7172
Name: count, dtype: int64

Validation label distribution:
similar
1    1801
0    1793
Name: count, dtype: int64


In [8]:
def format_time(elapsed):
    """Format a duration given in seconds as the string form of a ``datetime.timedelta``.

    The value is rounded to the nearest whole second first, e.g. ``342.2`` -> ``"0:05:42"``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return f"{duration}"

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class matches the label.

    Args:
        preds: 2-D array of per-class scores; the class is taken with ``argmax`` over axis 1.
        labels: NumPy array of integer class labels; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

print("유틸리티 함수 정의 완료!")

유틸리티 함수 정의 완료!


In [9]:
# Fixed token length every encoded (code1, code2) pair is padded/truncated to
MAX_LEN = 512

print("Train 데이터 토크나이징 중...")
# Encode each row as a sentence pair; padding='max_length' makes every row exactly MAX_LEN
# tokens long, so the whole split comes back as one (num_rows, MAX_LEN) tensor.
train_encodings = tokenizer(
    train_data['code1'].tolist(),
    train_data['code2'].tolist(),
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Labels from the 'similar' column, in the same row order as the encodings
train_labels = torch.tensor(train_data['similar'].values)

print(f"✅ Train input_ids shape: {train_encodings['input_ids'].shape}")
print(f"✅ Train labels shape: {train_labels.shape}")

Train 데이터 토크나이징 중...
✅ Train input_ids shape: torch.Size([14376, 512])
✅ Train labels shape: torch.Size([14376])


In [10]:
print("Validation 데이터 토크나이징 중...")
# Encode the validation pairs with the same settings as the training pairs
val_encodings = tokenizer(
    val_data['code1'].tolist(),
    val_data['code2'].tolist(),
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Labels from the 'similar' column, in the same row order as the encodings
val_labels = torch.tensor(val_data['similar'].values)

print(f"✅ Validation input_ids shape: {val_encodings['input_ids'].shape}")
print(f"✅ Validation labels shape: {val_labels.shape}")

Validation 데이터 토크나이징 중...
✅ Validation input_ids shape: torch.Size([3594, 512])
✅ Validation labels shape: torch.Size([3594])


In [11]:
# Number of examples per batch for both training and validation
BATCH_SIZE = 16

# Train DataLoader: each item is (input_ids, attention_mask, label); order is reshuffled
# by RandomSampler
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels
)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation DataLoader: same tuple layout, iterated in fixed order
val_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    val_labels
)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=BATCH_SIZE)

print(f"✅ Train batches: {len(train_dataloader)}")
print(f"✅ Validation batches: {len(val_dataloader)}")

✅ Train batches: 899
✅ Validation batches: 225


In [12]:
# Training hyperparameters
EPOCHS = 3
LEARNING_RATE = 2e-5

# Optimizer over all model parameters
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)

# Linear learning-rate schedule with no warmup; the scheduler is stepped once per batch,
# so the total step count is batches-per-epoch times the number of epochs
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

print(f"✅ Total training steps: {total_steps}")
print(f"✅ Learning rate: {LEARNING_RATE}")

✅ Total training steps: 2697
✅ Learning rate: 2e-05


In [13]:
# Loss function object. It is not used by the loop below: the model computes its own
# loss because `labels` is passed to the forward call.
loss_fn = nn.CrossEntropyLoss()

# Per-epoch metric history
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

print("=" * 50)
print("파인튜닝 시작!")
print("=" * 50)

for epoch in range(EPOCHS):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print(f"{'='*50}")

    # ========== Training ==========
    print("\n[Training]")
    t0 = time.time()
    model.train()

    # Accumulate sample-weighted sums rather than per-batch means: the final batch is
    # smaller than BATCH_SIZE, so averaging batch means would over-weight its samples.
    total_train_loss = 0.0
    total_train_correct = 0
    total_train_samples = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        batch_size = b_labels.size(0)

        model.zero_grad()

        # Forward pass; passing labels makes the model return the batch-mean loss
        outputs = model(
            b_input_ids,
            attention_mask=b_attention_mask,
            labels=b_labels
        )

        loss = outputs.loss
        logits = outputs.logits

        # Convert the batch-mean loss back to a batch sum before accumulating
        total_train_loss += loss.item() * batch_size
        total_train_correct += (logits.detach().argmax(dim=1) == b_labels).sum().item()
        total_train_samples += batch_size

        # Backward pass with gradient-norm clipping, then optimizer and LR-schedule steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / total_train_samples
    avg_train_acc = total_train_correct / total_train_samples

    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_acc)

    print(f"  Average training loss: {avg_train_loss:.4f}")
    print(f"  Average training accuracy: {avg_train_acc:.4f}")
    print(f"  Training time: {format_time(time.time() - t0)}")

    # ========== Validation ==========
    print("\n[Validation]")
    t0 = time.time()
    model.eval()

    total_val_loss = 0.0
    total_val_correct = 0
    total_val_samples = 0

    # No gradients are needed anywhere in the validation pass
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validating"):
            b_input_ids = batch[0].to(device)
            b_attention_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            batch_size = b_labels.size(0)

            outputs = model(
                b_input_ids,
                attention_mask=b_attention_mask,
                labels=b_labels
            )

            total_val_loss += outputs.loss.item() * batch_size
            total_val_correct += (outputs.logits.argmax(dim=1) == b_labels).sum().item()
            total_val_samples += batch_size

    avg_val_loss = total_val_loss / total_val_samples
    avg_val_acc = total_val_correct / total_val_samples

    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_acc)

    print(f"  Average validation loss: {avg_val_loss:.4f}")
    print(f"  Average validation accuracy: {avg_val_acc:.4f}")
    print(f"  Validation time: {format_time(time.time() - t0)}")

    # Save a checkpoint after every epoch; create the target folder first so the
    # save cannot fail after a full epoch of training because the directory is missing
    save_path = f"/content/drive/MyDrive/CodeNetData/finetuned_epoch{epoch+1}.pt"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model.state_dict(), save_path)
    print(f"  ✅ Model saved: {save_path}")

print("\n" + "="*50)
print("파인튜닝 완료!")
print("="*50)

파인튜닝 시작!

Epoch 1/3

[Training]


Training:   0%|          | 0/899 [00:00<?, ?it/s]

  Average training loss: 0.5491
  Average training accuracy: 0.6987
  Training time: 0:05:42

[Validation]


Validating:   0%|          | 0/225 [00:00<?, ?it/s]

  Average validation loss: 0.3433
  Average validation accuracy: 0.8598
  Validation time: 0:00:25
  ✅ Model saved: /content/drive/MyDrive/CodeNetData/finetuned_epoch1.pt

Epoch 2/3

[Training]


Training:   0%|          | 0/899 [00:00<?, ?it/s]

  Average training loss: 0.2392
  Average training accuracy: 0.9067
  Training time: 0:05:45

[Validation]


Validating:   0%|          | 0/225 [00:00<?, ?it/s]

  Average validation loss: 0.1766
  Average validation accuracy: 0.9326
  Validation time: 0:00:25
  ✅ Model saved: /content/drive/MyDrive/CodeNetData/finetuned_epoch2.pt

Epoch 3/3

[Training]


Training:   0%|          | 0/899 [00:00<?, ?it/s]

  Average training loss: 0.1318
  Average training accuracy: 0.9525
  Training time: 0:05:45

[Validation]


Validating:   0%|          | 0/225 [00:00<?, ?it/s]

  Average validation loss: 0.2042
  Average validation accuracy: 0.9276
  Validation time: 0:00:25
  ✅ Model saved: /content/drive/MyDrive/CodeNetData/finetuned_epoch3.pt

파인튜닝 완료!


In [14]:
def predict_similarity(code1, code2, model, tokenizer, device, max_length=512):
    """
    Predict whether two code snippets solve the same problem.

    Puts ``model`` into eval mode as a side effect.

    Args:
        code1 (str): First code snippet.
        code2 (str): Second code snippet.
        model: Classifier called as ``model(input_ids, attention_mask=...)``; its result
            must expose ``logits`` for a single example (one row of class scores).
        tokenizer: Callable that encodes the pair and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        device: Torch device the encoded tensors are moved to before the forward pass.
        max_length (int): Token length the pair is padded/truncated to. Defaults to 512.

    Returns:
        prediction (int): 0 (different problems) or 1 (same problem)
        confidence (float): softmax probability of the predicted class
    """
    model.eval()

    # Encode the pair, padding/truncating to exactly `max_length` tokens
    encoding = tokenizer(
        code1,
        code2,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
        # Probability assigned to the class that was picked
        confidence = probs[0][prediction].item()

    return prediction, confidence

print("✅ 추론 함수 정의 완료!")

✅ 추론 함수 정의 완료!


In [15]:
# Load the epoch-2 checkpoint (highest validation accuracy in the recorded run)
best_model_path = "/content/drive/MyDrive/CodeNetData/finetuned_epoch2.pt"

# map_location remaps the saved tensors onto the current device, so a checkpoint
# written on a GPU runtime also loads in a CPU-only session
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.to(device)
model.eval()

print(f"✅ 베스트 모델 로드 완료: {best_model_path}")

✅ 베스트 모델 로드 완료: /content/drive/MyDrive/CodeNetData/finetuned_epoch2.pt


In [16]:
# 예시 1: 두 개의 합을 구하는 문제 (유사함)
code1_similar = """
def solution(a, b):
    return a + b

result = solution(3, 5)
print(result)
"""

code2_similar = """
def add_numbers(x, y):
    total = x + y
    return total

ans = add_numbers(3, 5)
print(ans)
"""

# Classify the pair and report the predicted label with its softmax confidence
prediction, confidence = predict_similarity(code1_similar, code2_similar, model, tokenizer, device)

print("=" * 60)
print("예시 1: 같은 문제를 푸는 코드 (두 수의 합)")
print("=" * 60)
print(f"Code 1:\n{code1_similar}")
print(f"\nCode 2:\n{code2_similar}")
print(f"\n{'='*60}")
print(f"예측 결과: {'✅ 같은 문제' if prediction == 1 else '❌ 다른 문제'}")
print(f"확신도: {confidence:.2%}")
print(f"{'='*60}")

예시 1: 같은 문제를 푸는 코드 (두 수의 합)
Code 1:

def solution(a, b):
    return a + b

result = solution(3, 5)
print(result)


Code 2:

def add_numbers(x, y):
    total = x + y
    return total

ans = add_numbers(3, 5)
print(ans)


예측 결과: ✅ 같은 문제
확신도: 99.45%


In [20]:
# 예시 2: 완전히 다른 문제 (유사하지 않음)
code1_different = """
def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n-1)

print(factorial(5))
"""

code2_different = """
def is_prime(num):
    if num < 2:
        return False
    for i in range(2, int(num**0.5) + 1):
        if num % i == 0:
            return False
    return True

print(is_prime(17))
"""

# Classify the pair and report the predicted label with its softmax confidence.
# NOTE(review): in the recorded run this intentionally different pair was predicted as
# "same problem" with 99.93% confidence — the model does not generalize to this example.
prediction, confidence = predict_similarity(code1_different, code2_different, model, tokenizer, device)

print("=" * 60)
print("예시 2: 다른 문제를 푸는 코드 (팩토리얼 vs 소수판별)")
print("=" * 60)
print(f"Code 1:\n{code1_different}")
print(f"\nCode 2:\n{code2_different}")
print(f"\n{'='*60}")
print(f"예측 결과: {'✅ 같은 문제' if prediction == 1 else '❌ 다른 문제'}")
print(f"확신도: {confidence:.2%}")
print(f"{'='*60}")

예시 2: 다른 문제를 푸는 코드 (팩토리얼 vs 소수판별)
Code 1:

def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n-1)

print(factorial(5))


Code 2:

def is_prime(num):
    if num < 2:
        return False
    for i in range(2, int(num**0.5) + 1):
        if num % i == 0:
            return False
    return True

print(is_prime(17))


예측 결과: ✅ 같은 문제
확신도: 99.93%


In [18]:
# 예시 1: BST (Binary Search Tree) 구현 - 같은 문제, 다른 스타일
code1_similar = """
class TreeNode:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

class BinarySearchTree:
    def __init__(self):
        self.root = None

    def insert(self, value):
        if not self.root:
            self.root = TreeNode(value)
        else:
            self._insert_recursive(self.root, value)

    def _insert_recursive(self, node, value):
        if value < node.value:
            if node.left is None:
                node.left = TreeNode(value)
            else:
                self._insert_recursive(node.left, value)
        else:
            if node.right is None:
                node.right = TreeNode(value)
            else:
                self._insert_recursive(node.right, value)

    def search(self, value):
        return self._search_recursive(self.root, value)

    def _search_recursive(self, node, value):
        if node is None:
            return False
        if node.value == value:
            return True
        elif value < node.value:
            return self._search_recursive(node.left, value)
        else:
            return self._search_recursive(node.right, value)

    def inorder_traversal(self):
        result = []
        self._inorder_helper(self.root, result)
        return result

    def _inorder_helper(self, node, result):
        if node:
            self._inorder_helper(node.left, result)
            result.append(node.value)
            self._inorder_helper(node.right, result)

bst = BinarySearchTree()
elements = [50, 30, 70, 20, 40, 60, 80]
for elem in elements:
    bst.insert(elem)
print(bst.inorder_traversal())
print(bst.search(40))
"""

code2_similar = """
class Node:
    def __init__(self, data):
        self.data = data
        self.left_child = None
        self.right_child = None

class BST:
    def __init__(self):
        self.root_node = None

    def add_node(self, data):
        new_node = Node(data)
        if self.root_node is None:
            self.root_node = new_node
            return

        current = self.root_node
        while True:
            if data < current.data:
                if current.left_child is None:
                    current.left_child = new_node
                    break
                current = current.left_child
            else:
                if current.right_child is None:
                    current.right_child = new_node
                    break
                current = current.right_child

    def find_node(self, target):
        current = self.root_node
        while current is not None:
            if current.data == target:
                return True
            elif target < current.data:
                current = current.left_child
            else:
                current = current.right_child
        return False

    def get_sorted_elements(self):
        elements = []

        def traverse(node):
            if node is not None:
                traverse(node.left_child)
                elements.append(node.data)
                traverse(node.right_child)

        traverse(self.root_node)
        return elements

tree = BST()
values = [50, 30, 70, 20, 40, 60, 80]
for val in values:
    tree.add_node(val)
print(tree.get_sorted_elements())
print(tree.find_node(40))
"""

# Classify the pair; only the first 200 characters of each snippet are echoed below
prediction, confidence = predict_similarity(code1_similar, code2_similar, model, tokenizer, device)

print("=" * 80)
print("예시 1: 같은 문제 - 이진 탐색 트리(BST) 구현")
print("=" * 80)
print(f"Code 1 (재귀 방식):\n{code1_similar[:200]}...\n")
print(f"Code 2 (반복 방식):\n{code2_similar[:200]}...\n")
print(f"{'='*80}")
print(f"예측 결과: {'✅ 같은 문제 (Similar)' if prediction == 1 else '❌ 다른 문제 (Different)'}")
print(f"확신도: {confidence:.2%}")
print(f"{'='*80}")

예시 1: 같은 문제 - 이진 탐색 트리(BST) 구현
Code 1 (재귀 방식):

class TreeNode:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

class BinarySearchTree:
    def __init__(self):
        self.root = None
...

Code 2 (반복 방식):

class Node:
    def __init__(self, data):
        self.data = data
        self.left_child = None
        self.right_child = None

class BST:
    def __init__(self):
        self.root_node = None
   ...

예측 결과: ✅ 같은 문제 (Similar)
확신도: 98.72%


In [19]:
# 예시 2: 완전히 다른 알고리즘 문제
code1_different = """
from collections import defaultdict, deque

class Graph:
    def __init__(self, vertices):
        self.vertices = vertices
        self.adjacency_list = defaultdict(list)
        self.visited = set()

    def add_edge(self, source, destination):
        self.adjacency_list[source].append(destination)
        self.adjacency_list[destination].append(source)

    def depth_first_search(self, start_vertex):
        path = []
        self.visited.clear()
        self._dfs_recursive(start_vertex, path)
        return path

    def _dfs_recursive(self, vertex, path):
        self.visited.add(vertex)
        path.append(vertex)

        for neighbor in sorted(self.adjacency_list[vertex]):
            if neighbor not in self.visited:
                self._dfs_recursive(neighbor, path)

    def breadth_first_search(self, start_vertex):
        visited = set([start_vertex])
        queue = deque([start_vertex])
        path = []

        while queue:
            vertex = queue.popleft()
            path.append(vertex)

            for neighbor in sorted(self.adjacency_list[vertex]):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append(neighbor)

        return path

    def is_connected(self, vertex1, vertex2):
        if vertex1 not in self.adjacency_list:
            return False

        visited = set()
        stack = [vertex1]

        while stack:
            current = stack.pop()
            if current == vertex2:
                return True

            if current not in visited:
                visited.add(current)
                stack.extend(self.adjacency_list[current])

        return False

g = Graph(7)
edges = [(0, 1), (0, 2), (1, 3), (1, 4), (2, 5), (2, 6)]
for src, dst in edges:
    g.add_edge(src, dst)

print(g.depth_first_search(0))
print(g.breadth_first_search(0))
print(g.is_connected(0, 6))
"""

code2_different = """
def longest_common_subsequence(text1, text2):
    m, n = len(text1), len(text2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if text1[i-1] == text2[j-1]:
                dp[i][j] = dp[i-1][j-1] + 1
            else:
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])

    return dp[m][n]

def edit_distance(word1, word2):
    m, n = len(word1), len(word2)
    dp_table = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m + 1):
        dp_table[i][0] = i
    for j in range(n + 1):
        dp_table[0][j] = j

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if word1[i-1] == word2[j-1]:
                dp_table[i][j] = dp_table[i-1][j-1]
            else:
                dp_table[i][j] = 1 + min(
                    dp_table[i-1][j],
                    dp_table[i][j-1],
                    dp_table[i-1][j-1]
                )

    return dp_table[m][n]

def knapsack_problem(weights, values, capacity):
    n = len(weights)
    dp = [[0] * (capacity + 1) for _ in range(n + 1)]

    for i in range(1, n + 1):
        for w in range(1, capacity + 1):
            if weights[i-1] <= w:
                dp[i][w] = max(
                    values[i-1] + dp[i-1][w - weights[i-1]],
                    dp[i-1][w]
                )
            else:
                dp[i][w] = dp[i-1][w]

    return dp[n][capacity]

str1, str2 = "ABCDGH", "AEDFHR"
print(longest_common_subsequence(str1, str2))

word_a, word_b = "horse", "ros"
print(edit_distance(word_a, word_b))

w = [10, 20, 30]
v = [60, 100, 120]
cap = 50
print(knapsack_problem(w, v, cap))
"""

# Classify the pair; only the first 200 characters of each snippet are echoed below.
# NOTE(review): in the recorded run this intentionally different pair was predicted as
# "same problem" with 96.78% confidence — another false positive worth investigating.
prediction, confidence = predict_similarity(code1_different, code2_different, model, tokenizer, device)

print("=" * 80)
print("예시 2: 다른 문제 - 그래프 탐색(DFS/BFS) vs 동적계획법(DP)")
print("=" * 80)
print(f"Code 1 (그래프 알고리즘):\n{code1_different[:200]}...\n")
print(f"Code 2 (동적계획법):\n{code2_different[:200]}...\n")
print(f"{'='*80}")
print(f"예측 결과: {'✅ 같은 문제 (Similar)' if prediction == 1 else '❌ 다른 문제 (Different)'}")
print(f"확신도: {confidence:.2%}")
print(f"{'='*80}")

예시 2: 다른 문제 - 그래프 탐색(DFS/BFS) vs 동적계획법(DP)
Code 1 (그래프 알고리즘):

from collections import defaultdict, deque

class Graph:
    def __init__(self, vertices):
        self.vertices = vertices
        self.adjacency_list = defaultdict(list)
        self.visited = set(...

Code 2 (동적계획법):

def longest_common_subsequence(text1, text2):
    m, n = len(text1), len(text2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    
    for i in range(1, m + 1):
        for j in range(1, n + 1):
   ...

예측 결과: ✅ 같은 문제 (Similar)
확신도: 96.78%


In [21]:
# 예시 3: 완전히 다른 도메인 (웹 크롤링 vs 이미지 처리)
code1_web = """
import requests
from bs4 import BeautifulSoup
import time
import json

class WebScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.headers = {
            'User-Agent': 'Mozilla/5.0'
        }
        self.session = requests.Session()

    def fetch_page(self, url):
        try:
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def parse_articles(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        articles = []

        for article in soup.find_all('article', class_='post'):
            title = article.find('h2', class_='title')
            author = article.find('span', class_='author')
            date = article.find('time', class_='published')
            content = article.find('div', class_='content')

            if title and content:
                articles.append({
                    'title': title.text.strip(),
                    'author': author.text.strip() if author else 'Unknown',
                    'date': date.get('datetime') if date else None,
                    'content': content.text.strip()[:200]
                })

        return articles

    def scrape_multiple_pages(self, num_pages):
        all_articles = []

        for page in range(1, num_pages + 1):
            url = f"{self.base_url}?page={page}"
            html = self.fetch_page(url)

            if html:
                articles = self.parse_articles(html)
                all_articles.extend(articles)
                print(f"Scraped page {page}: {len(articles)} articles")

            time.sleep(1)

        return all_articles

scraper = WebScraper('https://example.com/blog')
results = scraper.scrape_multiple_pages(5)
print(json.dumps(results, indent=2))
"""

code2_image = """
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

class ImageProcessor:
    def __init__(self, image_path):
        self.image = Image.open(image_path)
        self.array = np.array(self.image)

    def apply_grayscale(self):
        if len(self.array.shape) == 3:
            weights = np.array([0.299, 0.587, 0.114])
            gray = np.dot(self.array[...,:3], weights)
            return gray.astype(np.uint8)
        return self.array

    def apply_gaussian_blur(self, kernel_size=5):
        from scipy.ndimage import gaussian_filter
        blurred = gaussian_filter(self.array, sigma=kernel_size/3)
        return blurred.astype(np.uint8)

    def detect_edges(self, threshold1=100, threshold2=200):
        gray = self.apply_grayscale()

        sobel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
        sobel_y = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])

        grad_x = np.zeros_like(gray)
        grad_y = np.zeros_like(gray)

        for i in range(1, gray.shape[0]-1):
            for j in range(1, gray.shape[1]-1):
                region = gray[i-1:i+2, j-1:j+2]
                grad_x[i,j] = np.sum(region * sobel_x)
                grad_y[i,j] = np.sum(region * sobel_y)

        magnitude = np.sqrt(grad_x**2 + grad_y**2)
        edges = (magnitude > threshold1).astype(np.uint8) * 255
        return edges

    def resize_image(self, new_width, new_height):
        resized = self.image.resize((new_width, new_height), Image.LANCZOS)
        return np.array(resized)

processor = ImageProcessor('input.jpg')
gray_img = processor.apply_grayscale()
edges = processor.detect_edges()
resized = processor.resize_image(800, 600)
"""

# Classify the pair and report the predicted label with its softmax confidence.
# NOTE(review): the recorded run predicted "different" here at only 53.59% confidence,
# i.e. barely above the 50% decision boundary.
prediction, confidence = predict_similarity(code1_web, code2_image, model, tokenizer, device)

print("=" * 80)
print("예시 3: 완전히 다른 도메인 - 웹 크롤링 vs 이미지 처리")
print("=" * 80)
print(f"예측 결과: {'✅ 같은 문제 (Similar)' if prediction == 1 else '❌ 다른 문제 (Different)'}")
print(f"확신도: {confidence:.2%}")
print(f"{'='*80}")

예시 3: 완전히 다른 도메인 - 웹 크롤링 vs 이미지 처리
예측 결과: ❌ 다른 문제 (Different)
확신도: 53.59%
