In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sentence_transformers import SentenceTransformer
import os

# Step 2. Embedding-based Model
- Use a pre-trained sentence embedding model (e.g., MiniLM, E5).
- Construct prompt+response embeddings and train a classifier.

In [None]:
# Load the training set, the test set and the sample submission from ./data.
try:
    train_df = pd.read_csv("./data/train.csv")
    test_df = pd.read_csv("./data/test.csv")
    sample_submission_df = pd.read_csv("./data/sample_submission.csv")
except FileNotFoundError as e:
    print(f"File Not Found: {e}")
    # Re-raise instead of continuing: without these frames every later
    # statement would fail with a NameError that hides the real cause.
    raise


def create_target(row):
    """Collapse the one-hot winner flags of ``row`` into a single class label.

    The flags are inspected in a fixed order and the first one equal to 1
    decides the label:

    * ``winner_model_a`` -> 0 (A wins)
    * ``winner_model_b`` -> 1 (B wins)
    * ``winner_tie``     -> 2 (tie)

    Returns -1 when none of the three flags equals 1.
    """
    flag_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    for label, column in enumerate(flag_columns):
        # Later flags are only read when the earlier ones did not match.
        if row[column] == 1:
            return label
    return -1


# Collapse the three one-hot winner columns into one label per row
# (0 = A wins, 1 = B wins, 2 = tie, -1 = no winner flag set).
train_df['y_target'] = train_df.apply(create_target, axis=1)
y = train_df['y_target']

# create_target returns -1 when no winner flag is set. Such rows would be
# learned as an extra class and shift the probability columns written to the
# submission, so stop here rather than train on them silently.
unlabeled_rows = int((y == -1).sum())
if unlabeled_rows:
    raise ValueError(
        f"{unlabeled_rows} training rows have no winner flag set; "
        "remove or relabel them before training."
    )

print(f"target class distribution:\n{y.value_counts(normalize=True)}")

# device=None lets sentence-transformers choose the device itself (GPU when one
# is usable, otherwise CPU) instead of failing on hosts without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2', device=None)

def create_embedding_features(df, model, batch_size=256):
    """Embed prompt+response pairs and concatenate the A and B embeddings.

    For every row two texts are built, ``prompt + " [SEP] " + response_a`` and
    ``prompt + " [SEP] " + response_b``. Each list of texts is encoded with
    ``model.encode`` and the two embedding matrices are joined column-wise.

    Args:
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b`` columns.
        model: Object exposing
            ``encode(texts, show_progress_bar=..., batch_size=...)`` that
            returns a 2-D array with one row per text.
        batch_size: Batch size forwarded to ``model.encode``. Defaults to 256,
            the value that used to be hard-coded.

    Returns:
        A 2-D array with ``len(df)`` rows: the embedding columns for the
        A texts followed by the embedding columns for the B texts.
    """
    def as_text(column):
        # A missing value would make the concatenated text NaN instead of a
        # string, so treat missing entries as empty text.
        return df[column].fillna("").astype(str)

    prompts = as_text('prompt')
    texts_a = (prompts + " [SEP] " + as_text('response_a')).tolist()
    texts_b = (prompts + " [SEP] " + as_text('response_b')).tolist()

    embeddings_a = model.encode(texts_a, show_progress_bar=True, batch_size=batch_size)
    embeddings_b = model.encode(texts_b, show_progress_bar=True, batch_size=batch_size)

    print(f"embedding A Shape: {embeddings_a.shape}")
    print(f"embedding B Shape: {embeddings_b.shape}")

    # Side-by-side layout: [A embedding | B embedding] for each row.
    X = np.concatenate([embeddings_a, embeddings_b], axis=1)

    print(f"Final feature vector Shape: {X.shape}")
    return X

SEED = 42  # shared by the split and both classifiers so runs are repeatable


def build_classifier():
    """Return an unfitted pipeline: standardise the features, then logistic regression.

    ``multi_class`` is intentionally not passed. With the lbfgs solver and more
    than two classes scikit-learn already fits a multinomial model, and the
    explicit argument is deprecated in recent releases.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(
            solver='lbfgs',
            random_state=SEED,
            max_iter=2000
        ))
    ])


# `model` is still the sentence encoder at this point; embed train and test.
X = create_embedding_features(train_df, model)
X_test = create_embedding_features(test_df, model)

print("Validation (Embedding Features)")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

# Hold-out estimate: fit on 80% of the rows and score the remaining 20%.
val_classifier = build_classifier()
val_classifier.fit(X_train, y_train)

val_preds_proba = val_classifier.predict_proba(X_val)
# Pass the fitted class order so the probability columns are matched to labels
# explicitly rather than inferred from y_val.
val_logloss = log_loss(y_val, val_preds_proba, labels=val_classifier.named_steps['model'].classes_)
print(f"Validation DATA (Multiclass) Log Loss: {val_logloss:.4f}")

# Refit on every training row for the submission. As before, `model` is rebound
# from the sentence encoder to the fitted classifier from here on.
model = build_classifier()
model.fit(X, y)

test_preds_proba = model.predict_proba(X_test)

print(f"예측 확률 배열 shape: {test_preds_proba.shape}")

# The probability columns follow the fitted class order. The submission below
# indexes them positionally, so confirm the order is exactly A / B / tie.
fitted_classes = list(model.named_steps['model'].classes_)
if fitted_classes != [0, 1, 2]:
    raise ValueError(f"Expected classes [0, 1, 2] but the classifier learned {fitted_classes}")

submission_df = pd.DataFrame({'id': test_df['id']})

submission_df['winner_model_a'] = test_preds_proba[:, 0]
submission_df['winner_model_b'] = test_preds_proba[:, 1]
submission_df['winner_tie'] = test_preds_proba[:, 2]


submission_df.to_csv("submission.csv", index=False)

print("Create Submission Completed")
print(submission_df.head())

# Sanity check: the three probabilities of a row should add up to 1.
print("\nSum of first prediction:", submission_df.iloc[0][['winner_model_a', 'winner_model_b', 'winner_tie']].sum())

target class distribution:
y_target
0    0.349079
1    0.341911
2    0.309011
Name: proportion, dtype: float64


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/225 [00:00<?, ?it/s]

Batches:   0%|          | 0/225 [00:00<?, ?it/s]

embedding A Shape: (57477, 384)
embedding B Shape: (57477, 384)
Final feature vector Shape: (57477, 768)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

embedding A Shape: (3, 384)
embedding B Shape: (3, 384)
Final feature vector Shape: (3, 768)
Validation (Embedding Features)




Validation DATA (Multiclass) Log Loss: 1.0637




예측 확률 배열 shape: (3, 3)
Create Submission Completed
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.273443        0.193083    0.533474
1   211333        0.265360        0.435535    0.299104
2  1233961        0.242280        0.474682    0.283037

Sum of first prediction: 1.0
