In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
import os

# Step 1. Baseline Model
- Build a simple baseline using lexical length features (response character-length and word-count differences) and Logistic Regression.
- Submit to Kaggle to verify that the submission pipeline works end to end.

In [2]:
# Load the competition CSVs. A missing file is reported and then re-raised:
# every statement below needs all three frames, so swallowing the error would
# only resurface as an unrelated NameError (or silently reuse stale frames
# left over in the kernel from an earlier run).
try:
    train_df = pd.read_csv("./data/train.csv")
    test_df = pd.read_csv("./data/test.csv")
    sample_submission_df = pd.read_csv("./data/sample_submission.csv")
except FileNotFoundError as e:
    print(f"파일을 찾을 수 없습니다: {e}")
    raise


# Quick sanity check of what was loaded.
print(f"학습 데이터 크기: {train_df.shape}")
print(f"테스트 데이터 크기: {test_df.shape}")
print("제출 샘플 컬럼:", sample_submission_df.columns.tolist())

def create_features(df):
    """Return a new frame with response length / word-count features added.

    The input frame is not modified, so re-running the calling cell always
    starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns ``response_a`` and ``response_b``.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` plus ``response_a_length``,
        ``response_b_length``, ``response_a_word_count``,
        ``response_b_word_count``, ``length_diff`` and ``word_count_diff``
        (both differences are A minus B).
    """
    resp_a = df['response_a']
    resp_b = df['response_b']

    # Missing responses produce NaN from the .str accessors; treat them as
    # zero-length / zero-word responses.
    with_counts = df.assign(
        response_a_length=resp_a.str.len().fillna(0),
        response_b_length=resp_b.str.len().fillna(0),
        response_a_word_count=resp_a.str.split().str.len().fillna(0),
        response_b_word_count=resp_b.str.split().str.len().fillna(0),
    )

    return with_counts.assign(
        length_diff=with_counts['response_a_length'] - with_counts['response_b_length'],
        word_count_diff=with_counts['response_a_word_count'] - with_counts['response_b_word_count'],
    )

# Add the length / word-count features to the training frame first, then the
# test frame, rebinding both names to the returned frames.
train_df, test_df = map(create_features, (train_df, test_df))

def create_target(row):
    """Collapse a row's winner flags into a single class label.

    The flags are inspected in the order model A, model B, tie, and the first
    one equal to 1 decides the label:

    * ``winner_model_a`` -> 0 (A wins)
    * ``winner_model_b`` -> 1 (B wins)
    * ``winner_tie``     -> 2 (tie)

    Returns -1 when none of the three flags equals 1.
    """
    label_by_flag = (
        ('winner_model_a', 0),
        ('winner_model_b', 1),
        ('winner_tie', 2),
    )
    for flag, label in label_by_flag:
        if row[flag] == 1:
            return label
    return -1

# Collapse the three one-hot winner columns into one class label per row
# (0 = A wins, 1 = B wins, 2 = tie) using create_target.
train_df['y_target'] = train_df.apply(create_target, axis=1)

# create_target returns -1 when no winner flag is set. Such rows would be
# trained as an extra class and shift the predict_proba columns that are
# later written to the submission by position, so fail loudly instead.
unlabeled_count = int((train_df['y_target'] == -1).sum())
if unlabeled_count:
    raise ValueError(
        f"타겟을 결정할 수 없는 행이 {unlabeled_count}개 있습니다: "
        "winner_model_a / winner_model_b / winner_tie 중 값이 1인 컬럼이 없습니다."
    )

# Only the two A-minus-B difference features are fed to the model.
features = ['length_diff', 'word_count_diff']
target = 'y_target'

X = train_df[features]
y = train_df[target]
X_test = test_df[features]

print(f"타겟 클래스 분포:\n{y.value_counts(normalize=True)}")

def build_model():
    """Return a fresh, unfitted StandardScaler + LogisticRegression pipeline.

    Used for both the hold-out evaluation and the final fit so the two models
    are guaranteed to share the same configuration.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        # `multi_class` is intentionally not passed: it is deprecated in
        # recent scikit-learn releases, and with the lbfgs solver and more
        # than two classes the default behaviour is already multinomial.
        ('model', LogisticRegression(
            solver='lbfgs',
            random_state=42
        ))
    ])


# Hold out 20% of the training rows, stratified by class, to estimate the
# multiclass log loss of the baseline.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = build_model()
model.fit(X_train, y_train)

val_preds_proba = model.predict_proba(X_val)
val_logloss = log_loss(y_val, val_preds_proba)
print(f"검증 데이터 (Multiclass) Log Loss: {val_logloss:.4f}")


# Refit an identical pipeline on all labelled rows before predicting the
# test set; `model` refers to this final pipeline from here on.
model = build_model()
model.fit(X, y)

test_preds_proba = model.predict_proba(X_test)

print(f"예측 확률 배열 shape: {test_preds_proba.shape}")

# predict_proba columns follow the order of model.classes_. The positional
# mapping below is only valid when the fitted classes are exactly
# 0 (A wins), 1 (B wins), 2 (tie), so verify that before writing anything.
fitted_classes = [int(label) for label in model.classes_]
if fitted_classes != [0, 1, 2]:
    raise ValueError(
        f"예상하지 못한 클래스 구성입니다: {fitted_classes} (기대값: [0, 1, 2])"
    )

submission_df = pd.DataFrame({'id': test_df['id']})

# Column i of the probability array belongs to class i (checked above).
proba_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
for class_index, column_name in enumerate(proba_columns):
    submission_df[column_name] = test_preds_proba[:, class_index]

output_dir = "./submission/step1"

os.makedirs(output_dir, exist_ok=True)

# Save to the updated output path.
submission_df.to_csv(f"{output_dir}/submission.csv", index=False)

print("제출 파일 'submission.csv' 생성이 완료되었습니다.")
print("제출 파일 예시:")
print(submission_df.head())

# Sanity check: the three probabilities of the first row should sum to 1.
print("\n첫 번째 예측 확률 합:", submission_df.iloc[0][proba_columns].sum())

학습 데이터 크기: (57477, 9)
테스트 데이터 크기: (3, 4)
제출 샘플 컬럼: ['id', 'winner_model_a', 'winner_model_b', 'winner_tie']
타겟 클래스 분포:
y_target
0    0.349079
1    0.341911
2    0.309011
Name: proportion, dtype: float64
검증 데이터 (Multiclass) Log Loss: 1.0706
예측 확률 배열 shape: (3, 3)
제출 파일 'submission.csv' 생성이 완료되었습니다.
제출 파일 예시:
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.337449        0.345961    0.316589
1   211333        0.445383        0.246753    0.307864
2  1233961        0.408762        0.276031    0.315206

첫 번째 예측 확률 합: 1.0


