# Final Model

## Module Import

In [1]:
import re
from textblob import TextBlob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler

In [4]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss
import seaborn as sns
import matplotlib.pyplot as plt

In [18]:
from transformers import AutoTokenizer, AutoModel
import torch
from sentence_transformers import SentenceTransformer

## Data Import

In [6]:
# Load the train, test and sample-submission CSV files from ../dataset.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/train.csv',
        '../dataset/test.csv',
        '../dataset/sample_submission.csv',
    )
)

In [5]:
# Names of the A-minus-B difference columns selected as tabular features.
# 'subjectivity_diff' and 'avg_word_len_diff' are currently left out.
significant_features = [
    'len_diff',
    'punc_diff',
    'sent_count_diff',
    'lexical_div_diff',
    'repetition_diff',
    'comma_ratio_diff',
]

# Each word here gets its own binary ``contains_<word>`` indicator column.
target_words = ['company', 'brace', 'knee', 'progression', 'apologize', 'sorry']

In [7]:
def get_features(df):
    """Add surface-level text features comparing ``response_a`` with ``response_b``.

    The frame is modified in place (new columns are written onto ``df``) and the
    same object is returned.

    Columns written:
      * per side (``a`` / ``b``): ``len_``, ``punc_``, ``sent_``, ``lex_``,
        ``rep_``, ``subj_`` and ``comma_``;
      * A-minus-B differences: ``len_diff``, ``punc_diff``, ``sent_count_diff``,
        ``lexical_div_diff``, ``repetition_diff``, ``comma_ratio_diff``;
      * ``contains_<word>`` (0/1) for every entry of the module-level
        ``target_words``, searched in prompt + both responses (case-insensitive,
        whole word).

    Args:
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b`` columns.

    Returns:
        ``df`` itself, with the feature columns added.
    """
    eps = 1e-9  # keeps the ratios finite when a text has no words
    sides = ['a', 'b']

    def count_matches(pattern, text):
        """Number of non-overlapping matches of ``pattern`` in ``str(text)``."""
        return len(re.findall(pattern, str(text)))

    def ling_feat(text):
        """Return (lexical diversity, repetition, TextBlob subjectivity) for one text."""
        text = str(text)
        words = re.findall(r'\b\w+\b', text.lower())
        lex_div = len(set(words)) / (len(words) + eps)
        repetition = 1 - lex_div
        subj = TextBlob(text).sentiment.subjectivity
        return lex_div, repetition, subj

    def comma_ratio(text):
        """Commas/semicolons per whitespace-separated token."""
        # Cast first: a non-string value (e.g. NaN) has no .split(), and every
        # other feature in this function already goes through str().
        text = str(text)
        return len(re.findall(r'[;,]', text)) / (len(text.split()) + eps)

    # Raw length, punctuation count and sentence-terminator count per side.
    for side in sides:
        df[f'len_{side}'] = df[f'response_{side}'].str.len()
    for side in sides:
        df[f'punc_{side}'] = df[f'response_{side}'].apply(
            lambda text: count_matches(r'[!?,;:]', text)
        )
    for side in sides:
        df[f'sent_{side}'] = df[f'response_{side}'].apply(
            lambda text: count_matches(r'[.!?]', text)
        )

    # Linguistic features. Building one DataFrame from a list of tuples avoids
    # constructing a pd.Series object for every row.
    for side in sides:
        ling_cols = [f'lex_{side}', f'rep_{side}', f'subj_{side}']
        ling_values = pd.DataFrame(
            [ling_feat(text) for text in df[f'response_{side}']],
            index=df.index,
            columns=ling_cols,
        )
        df[ling_cols] = ling_values

    for side in sides:
        df[f'comma_{side}'] = df[f'response_{side}'].apply(comma_ratio)

    # diff features (A minus B); subjectivity and average word length diffs are
    # currently not produced.
    diff_sources = [
        ('len_diff', 'len'),
        ('punc_diff', 'punc'),
        ('sent_count_diff', 'sent'),
        ('lexical_div_diff', 'lex'),
        ('repetition_diff', 'rep'),
        ('comma_ratio_diff', 'comma'),
    ]
    for diff_name, base in diff_sources:
        df[diff_name] = df[f'{base}_a'] - df[f'{base}_b']

    # --- keyword presence ---
    text_cols = df[['prompt', 'response_a', 'response_b']].astype(str).agg(' '.join, axis=1)
    for word in target_words:
        # re.escape keeps a keyword containing regex metacharacters literal.
        df[f'contains_{word}'] = text_cols.str.contains(
            fr'\b{re.escape(word)}\b', case=False, na=False
        ).astype(int)

    return df

In [8]:
# Run the feature builder on the training frame first, then on the test frame.
train_df, test_df = (get_features(frame) for frame in (train_df, test_df))

In [9]:
# Collapse the three winner indicator columns into one class id:
#   0 = model A wins, 1 = model B wins, 2 = tie.
# default=-1 marks rows where none of the three flags is set; np.select's
# implicit default of 0 would silently label such rows as "model A wins".
train_df['label'] = np.select(
    [
        train_df['winner_model_a'] == 1,
        train_df['winner_model_b'] == 1,
        train_df['winner_tie'] == 1,
    ],
    [0, 1, 2],
    default=-1,
)

In [10]:
# One indicator-column name per target word, appended after the diff features.
keyword_features = ['contains_' + keyword for keyword in target_words]
all_features = [*significant_features, *keyword_features]

In [11]:
# Feature matrix (selected columns) and target vector taken from the training frame.
X, y = train_df[all_features], train_df['label']

In [12]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer

# Per-column analysis and mixed scaling: each feature is sent to StandardScaler,
# RobustScaler, or passed through unchanged.
#
# NOTE(review): the scalers are fitted on every row of X, i.e. before the
# train/validation split done later in the notebook, so validation rows
# contribute to the scaling statistics.

# Assumes the variables X and all_features already exist.
df = X.copy()

# Basic statistics plus outlier / variance figures for every column.
summary = []
for col in df.columns:
  vals = df[col].dropna().astype(float)
  n_unique = df[col].nunique(dropna=True)
  # NOTE(review): the `n_unique == 2` clause also flags a two-valued column
  # whose values are not 0/1 as binary — confirm that is intended.
  is_binary = set(vals.unique()).issubset({0.0, 1.0}) or n_unique == 2
  var = vals.var()
  skew = vals.skew()
  q1, q3 = vals.quantile(0.25), vals.quantile(0.75)
  iqr = q3 - q1
  # Tukey fences: values beyond 1.5 * IQR from the quartiles count as outliers.
  lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
  outlier_mask = (vals < lower) | (vals > upper)
  outlier_ratio = outlier_mask.sum() / max(len(vals), 1)
  low_variance = var < 1e-4  # threshold can be tuned to the data
  high_skew = abs(skew) > 1.0

  # Scaler recommendation: binary columns stay as they are; columns with many
  # outliers or strong skew get RobustScaler; everything else StandardScaler.
  if is_binary:
    recommended = 'passthrough (binary)'
  elif outlier_ratio > 0.05 or high_skew:
    recommended = 'RobustScaler'
  else:
    recommended = 'StandardScaler'

  summary.append({
    'feature': col,
    'dtype': str(df[col].dtype),
    'n_unique': n_unique,
    'is_binary': is_binary,
    'var': var,
    'skew': skew,
    'q1': q1,
    'q3': q3,
    'iqr': iqr,
    'outlier_ratio': outlier_ratio,
    'low_variance': low_variance,
    'recommended': recommended
  })

summary_df = pd.DataFrame(summary).set_index('feature')
print("Feature summary (first rows):")
print(summary_df[['dtype', 'is_binary', 'n_unique', 'var', 'skew', 'outlier_ratio', 'low_variance', 'recommended']].head(20))

# Split the columns into groups. Index.difference returns a sorted result, so
# the scaled groups come out in alphabetical order.
binary_cols = summary_df[summary_df['is_binary']].index.tolist()
robust_cols = summary_df[summary_df['recommended'] == 'RobustScaler'].index.difference(binary_cols).tolist()
std_cols = summary_df[summary_df['recommended'] == 'StandardScaler'].index.difference(binary_cols).tolist()

print("\nGroups:")
print("  binary_cols:", binary_cols)
print("  robust_cols:", robust_cols)
print("  std_cols:", std_cols)

# Build the ColumnTransformer (empty groups are skipped).
transformers = []
if std_cols:
  transformers.append(('std', StandardScaler(), std_cols))
if robust_cols:
  transformers.append(('robust', RobustScaler(), robust_cols))
# Binary columns are kept unchanged.
if binary_cols:
  transformers.append(('passthrough_binary', 'passthrough', binary_cols))

if not transformers:
  raise RuntimeError("No features to transform. Check X/all_features.")

col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0)

# fit + transform, then restore a DataFrame.
X_scaled_arr = col_transformer.fit_transform(df)
# The output columns follow the order of the transformer list, so the names are
# rebuilt by concatenating each group's column list. (The previous branch that
# compared the column list with the string 'passthrough' could never be true
# and both branches did the same thing.)
out_cols = [column for _, _, group_cols in transformers for column in group_cols]
X_scaled_df = pd.DataFrame(X_scaled_arr, columns=out_cols, index=df.index)

Feature summary (first rows):
                        dtype  is_binary  n_unique           var        skew  \
feature                                                                        
len_diff                int64      False      6041  1.205707e+06   -1.124580   
punc_diff               int64      False       313  3.658851e+02    6.378323   
sent_count_diff         int64      False       255  1.972592e+02   -2.359540   
lexical_div_diff      float64      False     55950  3.080705e-02    0.025245   
repetition_diff       float64      False     55950  3.080705e-02   -0.025245   
comma_ratio_diff      float64      False     51816  1.734652e+00 -165.425305   
contains_company        int64       True         2  4.487974e-02    4.275875   
contains_brace          int64       True         2  2.879821e-03   18.527488   
contains_knee           int64       True         2  3.778502e-03   16.145383   
contains_progression    int64       True         2  7.699548e-03   11.219919   
contains_a

In [13]:
# Display the scaled feature frame produced by the scaling cell.
X_scaled_df

Unnamed: 0,comma_ratio_diff,len_diff,lexical_div_diff,punc_diff,repetition_diff,sent_count_diff,contains_company,contains_brace,contains_knee,contains_progression,contains_apologize,contains_sorry
0,-0.509879,3.952550,-0.550103,3.333333,0.550103,3.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.247947,-0.634638,-0.713681,-0.750000,0.713681,-1.777778,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.727202,-1.084223,0.816672,-1.250000,-0.816672,-0.666667,0.0,0.0,0.0,0.0,0.0,0.0
3,0.382777,1.921708,-0.593601,1.666667,0.593601,1.222222,0.0,0.0,0.0,0.0,0.0,0.0
4,0.069289,0.626335,-0.709755,0.833333,0.709755,0.222222,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
57472,1.690206,-0.190985,1.050461,0.250000,-1.050461,1.000000,0.0,0.0,0.0,0.0,0.0,0.0
57473,0.436278,-0.007117,0.239260,0.000000,-0.239260,-1.222222,0.0,0.0,0.0,0.0,0.0,0.0
57474,-0.402496,8.451957,-0.398586,4.416667,0.398586,10.111111,0.0,0.0,0.0,0.0,0.0,0.0
57475,0.232915,-0.633452,0.009375,-0.333333,-0.009375,-0.666667,0.0,0.0,0.0,0.0,0.0,0.0


## SEMANTIC 결합 (Combining Semantic Features)

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by the encode() calls in this cell.
model = SentenceTransformer('all-MiniLM-L6-v2')

def clean_prompt(s):
    """Return ``str(s)`` with all quote (" and ') and square-bracket characters removed."""
    unwanted = re.compile(r'["\'\[\]]')
    text = str(s)
    return unwanted.sub('', text)

# Clean the prompt and join it with each response, for both frames.
for frame in (train_df, test_df):
    frame['prompt_clean'] = frame['prompt'].apply(clean_prompt)
    for side in ('a', 'b'):
        frame[f'text_{side}'] = frame['prompt_clean'] + ' ' + frame[f'response_{side}']


def _encode_column(frame, column):
    """Encode one text column of ``frame`` with the sentence-embedding model."""
    return model.encode(frame[column].tolist(), show_progress_bar=True)


# Embeddings of the "prompt + response" texts.
emb_a_train = _encode_column(train_df, 'text_a')
emb_b_train = _encode_column(train_df, 'text_b')
emb_a_test = _encode_column(test_df, 'text_a')
emb_b_test = _encode_column(test_df, 'text_b')

# Difference vector between the two responses (A minus B).
emb_diff_train = emb_a_train - emb_b_train
emb_diff_test = emb_a_test - emb_b_test

# Embedding of the cleaned prompt on its own.
prompt_emb_train = _encode_column(train_df, 'prompt_clean')
prompt_emb_test = _encode_column(test_df, 'prompt_clean')

In [None]:
# Tabular (scaled lexical) features for the training rows.
# NOTE(review): `X_tab` is not defined in any visible cell; it is assumed to be
# the scaled training frame `X_scaled_df` — confirm.
X_tab = X_scaled_df

# Test rows must pass through the transformer fitted on the training data.
# The previous code took the first len(test_df) rows of the *training* features
# and used them as the test features. The transformer's output column order is
# the same one X_scaled_df was built with.
X_tab_test = col_transformer.transform(test_df[all_features])

# Hybrid matrix: [embedding difference | prompt embedding | tabular features].
X_train_hybrid = np.concatenate([emb_diff_train, prompt_emb_train, X_tab.to_numpy()], axis=1)
X_test_hybrid = np.concatenate([emb_diff_test, prompt_emb_test, X_tab_test], axis=1)

# Build the target: 0 = model A wins, 1 = model B wins, 2 = tie,
# -1 = none of the three winner flags is set.
conditions = [
    train_df['winner_model_a'] == 1,
    train_df['winner_model_b'] == 1,
    train_df['winner_tie'] == 1
]
y_train = np.select(conditions, [0, 1, 2], default=-1)

또는 train_hybrid_data.npz 데이터 불러오기 (Alternatively, load the data from train_hybrid_data.npz.)

In [22]:
# Hold out 20% of the hybrid matrix for validation, keeping the class
# proportions of y_train in both parts.
X_train, X_val, y_train_split, y_val = train_test_split(
    X_train_hybrid, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

# `multi_class='multinomial'` was dropped: the parameter is deprecated in recent
# scikit-learn releases, and with the lbfgs solver on a target with more than
# two classes the default already fits a multinomial model.
lr = LogisticRegression(max_iter=1000, solver='lbfgs', C=0.5)
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingClassifier(random_state=42)
lgb = LGBMClassifier(
    n_estimators=400,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# (name, estimator) pairs in the form VotingClassifier expects.
lexical_models = [("lr", lr), ("rf", rf), ("gb", gb), ("lgb", lgb)]

In [None]:
# Soft-voting ensemble over the estimators listed in `lexical_models`.
hybrid_voting = VotingClassifier(
    estimators=lexical_models,
    voting='soft'
)

# The emoji in these messages were stored as mis-decoded bytes; restored here.
print("🚀 Training lexical ensemble...\n")
hybrid_voting.fit(X_train, y_train_split)
print("✅ Lexical ensemble training complete.\n")

# Validation check
y_val_pred = hybrid_voting.predict(X_val)
y_val_prob = hybrid_voting.predict_proba(X_val)

acc = accuracy_score(y_val, y_val_pred)
# Pass the fitted class order explicitly so the probability columns are matched
# to the right labels even if a class is missing from y_val.
loss = log_loss(y_val, y_val_prob, labels=hybrid_voting.classes_)
print(f"Validation Accuracy: {acc:.4f}")
print(f"Validation Log Loss: {loss:.4f}")

In [None]:
# Class ids and display names, pinned so the report rows and confusion-matrix
# axes always line up with 0 = A win, 1 = B win, 2 = Tie.
class_ids = [0, 1, 2]
class_names = ['A win', 'B win', 'Tie']

y_pred = hybrid_voting.predict(X_val)
y_prob = hybrid_voting.predict_proba(X_val)

acc = accuracy_score(y_val, y_pred)
loss = log_loss(y_val, y_prob, labels=hybrid_voting.classes_)

# The emoji in these messages were stored as mis-decoded bytes; restored here.
print(f"\n📊 Validation Accuracy: {acc:.4f}")
print(f"📉 Validation LogLoss: {loss:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred, labels=class_ids, target_names=class_names))

# Confusion matrix with named ticks so the figure is readable on its own.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_val, y_pred, labels=class_ids),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix (Validation)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()