In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file path beneath the Kaggle input directory,
# in the same order os.walk visits them, and echo each one.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Load the train and test CSVs from the attached Kaggle dataset
train = pd.read_csv(filepath_or_buffer='/kaggle/input/open12312312121212/train.csv')
test = pd.read_csv(filepath_or_buffer='/kaggle/input/open12312312121212/test.csv')

# Column names and dtypes, followed by summary statistics over all columns
print("Train columns & dtypes:", train.dtypes, sep="\n")
print("\nTrain 요약 통계:", train.describe(include='all'), sep="\n")

# Peek at the first rows of the training data
display(train.head(n=5))


In [None]:
# How often each model appears in the model_a and model_b columns
print("=== model_a 빈도 ===", train['model_a'].value_counts(), sep="\n")
print("\n=== model_b 빈도 ===", train['model_b'].value_counts(), sep="\n")

In [None]:
# Collapse the three winner columns into a single label
def map_label(row):
    """Map one row's winner flags to a single class code.

    The flags are checked in the order winner_model_a, winner_model_b,
    winner_tie; the first one equal to 1 decides the result.

    Returns:
        0 if model A won, 1 if model B won, 2 for a tie, or None when
        none of the three flags equals 1.
    """
    winner_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    for code, column in enumerate(winner_columns):
        if row[column] == 1:
            return code
    return None

train['label'] = train.apply(map_label, axis='columns')

# Relative frequency of each label (0: A preferred, 1: B preferred, 2: tie)
label_share = train['label'].value_counts(normalize=True)
print(label_share)


In [None]:
# 1) Build a coarse prompt category
def prompt_type(txt, command_keywords=('해줘', '만들어', '작성해', '추천해')):
    """Classify a prompt as 'question', 'command', or 'other'.

    Args:
        txt: Prompt text. Non-string values (for example NaN from an empty
            CSV cell, or None) are classified as 'other' instead of raising.
        command_keywords: Substrings that mark a prompt as a command.
            Extend or replace this to recognise more phrasings.

    Returns:
        'question' if the whitespace-stripped text ends with '?',
        'command' if any keyword occurs anywhere in the text,
        otherwise 'other'.
    """
    # A missing prompt has no .strip(); treat it as uncategorised.
    if not isinstance(txt, str):
        return 'other'
    if txt.strip().endswith('?'):
        return 'question'
    if any(kw in txt for kw in command_keywords):
        return 'command'
    return 'other'

train['prompt_type'] = train['prompt'].apply(prompt_type)

# 2) For each prompt type, the share of rows labelled 0 (A), 1 (B) and 2 (tie)
label_share_by_type = train.groupby('prompt_type')['label'].value_counts(normalize=True)
rate_names = {0:'A_win_rate', 1:'B_win_rate', 2:'tie_rate'}
res = label_share_by_type.unstack(fill_value=0).rename(columns=rate_names)
display(res)


In [None]:
# Per-column count of missing values in each split
print("Train 결측치 개수:", train.isna().sum(), sep="\n")

print("\nTest 결측치 개수:", test.isna().sum(), sep="\n")

In [None]:
import transformers

# Report which Transformers release this kernel has installed
installed_version = transformers.__version__
print("Transformers version:", installed_version)

In [None]:
# 1) 라이브러리 불러오기
import time
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.special import softmax

# 2) Load data and map labels
train_df, test_df = (
    pd.read_csv('/kaggle/input/open12312312121212/train.csv'),
    pd.read_csv('/kaggle/input/open12312312121212/test.csv'),
)

def map_label(row):
    """Map one row's winner flags to a class code, or NaN if none is set.

    The flags are checked in the order winner_model_a, winner_model_b,
    winner_tie; the first one equal to 1 decides the result.

    Returns:
        0 if model A won, 1 if model B won, 2 for a tie, and np.nan when
        none of the three flags equals 1 (such rows are dropped afterwards).
    """
    if row['winner_model_a'] == 1:
        label = 0
    elif row['winner_model_b'] == 1:
        label = 1
    elif row['winner_tie'] == 1:
        label = 2
    else:
        label = np.nan
    return label

train_df['label'] = train_df.apply(map_label, axis='columns')
# Keep only rows that received a label, then store the label as an integer
has_label = train_df['label'].notna()
train_df = train_df.loc[has_label].astype({'label': 'int64'})

# 3) Train/validation split (80/20, stratified on the label, fixed seed)
model_input = train_df.loc[:, ['prompt','response_a','response_b','label']]
train_pd, valid_pd = train_test_split(
    model_input,
    stratify=train_df['label'],
    test_size=0.2,
    random_state=42,
)

# 4) Convert both splits to HF Datasets, resetting the index of each first
hf_train, hf_valid = (
    Dataset.from_pandas(split.reset_index(drop=True))
    for split in (train_pd, valid_pd)
)

# 5) Load the tokenizer and a 3-class sequence-classification model
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
classifier_options = dict(num_labels=3, ignore_mismatched_sizes=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **classifier_options)

# 6) Preprocessing function
def preprocess(batch):
    """Join prompt and both responses into one text per row and tokenize it.

    Args:
        batch: Mapping with 'prompt', 'response_a' and 'response_b' entries,
            iterated in parallel (one element per example).

    Returns:
        The result of calling the module-level ``tokenizer`` on the joined
        texts with truncation and padding to max_length=256.
    """
    # NOTE(review): prompt, A and B share one 256-token budget; if truncation
    # cuts from the end (presumably the tokenizer default), long prompt/A text
    # could leave little or none of response B in the input. Confirm.
    texts = []
    for p, a, b in zip(batch['prompt'], batch['response_a'], batch['response_b']):
        texts.append(f"Prompt: {p}\nA: {a}\nB: {b}")
    return tokenizer(texts, truncation=True, max_length=256, padding='max_length')

# Tokenize each split and rename the target column from 'label' to 'labels'
tokenized_train = hf_train.map(preprocess, batched=True).rename_column('label', 'labels')
tokenized_valid = hf_valid.map(preprocess, batched=True).rename_column('label', 'labels')

# Expose only the model inputs and labels, as torch tensors
model_columns = ['input_ids','attention_mask','labels']
for split in (tokenized_train, tokenized_valid):
    split.set_format(type='torch', columns=model_columns)

# 7) Evaluation metrics
def compute_metrics(eval_pred):
    """Compute multi-class log loss and accuracy from raw logits.

    Args:
        eval_pred: A (logits, labels) pair; logits are softmaxed along
            axis 1 to obtain class probabilities.

    Returns:
        Dict with 'log_loss' (evaluated over the classes 0, 1, 2) and
        'accuracy' (argmax of the logits compared with the labels).
    """
    logits, labels = eval_pred
    class_probs = softmax(logits, axis=1)
    predicted = np.argmax(logits, axis=-1)
    return dict(
        log_loss=log_loss(labels, class_probs, labels=[0,1,2]),
        accuracy=accuracy_score(labels, predicted),
    )

# 8) Minimal TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=3,
    logging_steps=100,
    # Checkpoints are written into output_dir during training; cap how many
    # are retained so a long run cannot exhaust the size-limited Kaggle
    # working directory.
    save_total_limit=2,
    # Turn off external logging integrations (e.g. experiment trackers) that
    # may otherwise be enabled automatically when installed and ask for
    # credentials during a non-interactive run.
    report_to='none',
)

# 9) Build the Trainer from the model, arguments, datasets and metric function
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

# 10) Run training and report the wall-clock duration in minutes
start = time.time()
trainer.train()
elapsed_min = (time.time() - start) / 60
print(f"Training finished in {elapsed_min:.2f} min")

# 11) Manual evaluation on the validation split
metrics = trainer.evaluate()
print("Evaluation results:", metrics)

# 12) Predict on the test set and write the submission file
test_columns = ['id','prompt','response_a','response_b']
hf_test = Dataset.from_pandas(test_df[test_columns].reset_index(drop=True))
tokenized_test = hf_test.map(preprocess, batched=True)
tokenized_test.set_format(type='torch', columns=['input_ids','attention_mask'])

# Turn the predicted logits into per-class probabilities
preds = trainer.predict(tokenized_test)
probs = softmax(preds.predictions, axis=1)

# One row per test id; probability columns follow the label order 0, 1, 2
submission = test_df[['id']].assign(
    winner_model_a=probs[:, 0],
    winner_model_b=probs[:, 1],
    winner_tie=probs[:, 2],
)
submission.to_csv('submission.csv', index=False)
print("submission.csv 생성 완료")
