In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score


In [3]:
def target_encode_oof(
        train:pd.DataFrame,
        test:pd.DataFrame,
        cat_cols,
        y_col: str,
        n_splits: int = 5,
        random_state: int = 42,
        smooth: float | None = None,
        prefix: str = "_te",
):
    
    if isinstance(cat_cols,(str,bytes)):
        cat_cols = [cat_cols]
    
    train_out = train.copy()
    test_out = test.copy()
    y = train_out[y_col].values
    global_mean = y.mean()
    

    kf = KFold(n_splits=n_splits,shuffle = True, random_state = random_state)
    splits = list(kf.split(train_out))

    for col in cat_cols:
        oof_encoded = np.zeros(len(train_out))

        for tr_idx, val_idx in splits:
            tr = train_out.iloc[tr_idx]
            val = train_out.iloc[val_idx]
            stats = tr.groupby(col)[y_col].agg(["mean","count"])
            if smooth is not None:
                n = stats["count"]
                m = stats["mean"]
                stats["te"] = (n * m + smooth * global_mean) / (n + smooth)
            else:
                stats["te"] = stats["mean"]
            
            mapping = stats["te"]
            val_encoded = val[col].map(mapping)
            oof_encoded[val_idx] = val_encoded.values
        new_col = f"{col}{prefix}"
        train_out[new_col] = oof_encoded
   

        stats_full = train_out.groupby(col)[y_col].agg(["mean","count"])
        if smooth is not None:
            n = stats_full["count"]
            m = stats_full["mean"]
            stats_full["te"] = (n * m + smooth * global_mean) / (n + smooth)
        else:
            stats_full["te"] = stats_full["mean"]
        mapping_full = stats_full["te"]
        test_out[new_col] = test_out[col].map(mapping_full).fillna(global_mean)
    return train_out,test_out

In [4]:
# Load the raw train / test splits.
df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")

# Name the target once, before any column selection, so the feature lists below
# exclude it by reference rather than through a repeated string literal.
y_col = "y"

# Categorical features: object / category columns, never the target itself
# (an object-typed target would otherwise be encoded against itself).
cat_cols = [
    c
    for c in df_train.select_dtypes(include=["object", "category"]).columns
    if c != y_col
]

# Numeric features: every numeric dtype, not only 64-bit ones. errors="ignore"
# keeps this from raising when the target is not among the numeric columns.
num_cols = (
    df_train.select_dtypes(include="number")
    .drop(columns=y_col, errors="ignore")
    .columns.tolist()
)

# Add "<col>_te" out-of-fold target-encoded columns to copies of both frames.
train_te,test_te = target_encode_oof(
    train=df_train,
    test=df_test,
    cat_cols=cat_cols,
    y_col=y_col,
    n_splits=5,
    random_state=42,
    smooth=20,
    prefix="_te"
)

In [None]:
# Feature matrix: the target-encoded categorical columns followed by the
# numeric columns.
te_cols = [name + "_te" for name in cat_cols]
feature_cols = te_cols + num_cols

X = train_te[feature_cols]
y = train_te["y"]
X_test = test_te[feature_cols]

# Standardise using statistics learned from the training rows only, then apply
# the same transform to the test rows.
scaler = StandardScaler()
scaler.fit(X)
X_train_scaled = scaler.transform(X)
X_test_scaled  = scaler.transform(X_test)

# Fit the classifier and score the test rows.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y)

# Column 1 of predict_proba is the probability of the second entry in
# model.classes_.
test_proba = model.predict_proba(X_test_scaled)
y_pred_proba = test_proba[:, 1]
y_pred = model.predict(X_test_scaled)



In [12]:
# Two-column submission: a 0-based row number and the predicted probability.
n_rows = len(y_pred_proba)
submission_columns = {
    "index": range(n_rows),
    "proba": y_pred_proba,
}
output = pd.DataFrame(submission_columns)

# Written without a header row and without the DataFrame's own index.
output.to_csv("out.csv", header=False, index=False)