Before running, set the file paths of the original train and test datasets as the values of the 'origin_tr' and 'origin_te' keys in the `paths` dictionary.  
  
Predictions for the test dataset are saved as `prediction.csv` in the current working directory.

## 1. Dataset PreProcessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Locations of the original train/test CSV files. Edit these two values so
# they point at your own copies of the data.
paths = dict(origin_tr='train.csv', origin_te='test.csv')

# Load both splits; `train` and `test` are the raw frames used by every
# preprocessing step below.
train, test = (pd.read_csv(paths[split]) for split in ('origin_tr', 'origin_te'))

top15

In [2]:
# Keep the identifiers and the target aside before building feature matrices.
train_id, test_id = train["id"], test["id"]
y = train["y"]

# Feature frames: train loses id / target / shares, test only loses id.
X = train.drop(columns=["id", "y", "shares"])
test_15 = test.drop(columns=["id"])

# Impute missing numeric values in BOTH splits with the train medians.
train_medians = X.median(numeric_only=True)
X, test_15 = (frame.fillna(train_medians) for frame in (X, test_15))

# One-hot encode the categorical columns, then force the test frame onto the
# train column layout (dummy columns absent from test are filled with 0).
X = pd.get_dummies(X)
test_15 = pd.get_dummies(test_15).reindex(columns=X.columns, fill_value=0)

# Up to this point X / test_15 are the "basic" train/test data.

# Top-15 feature list (ranked by XGBoost gain)
top15_features = [
    'kw_avg_avg',
    'data_channel_Entertainment',
    'kw_max_avg',
    'data_channel_Tech',
    'self_reference_min_shares',
    'weekday_Saturday',
    'self_reference_avg_sharess',
    'weekday_Sunday',
    'data_channel_Social Media',
    'kw_avg_max',
    'LDA_00',
    'LDA_02',
    'kw_min_avg',
    'kw_avg_min',
    'num_hrefs',
]

# Derived (engineered) feature definitions
def add_derived_features(df):
    """Return a copy of ``df`` with three engineered columns appended.

    Added columns:
      * ``keyword_strength_ratio``     = kw_max_avg / kw_avg_avg
      * ``img_token_ratio``            = num_imgs / n_unique_tokens
      * ``subjectivity_sentiment_mix`` = global_subjectivity * global_rate_positive_words

    Zero denominators are replaced by 1e-9 before dividing. The input frame
    is not modified.
    """
    # Guard the two denominators against exact zeros.
    safe_kw_avg = df["kw_avg_avg"].replace(0, 1e-9)
    safe_unique_tokens = df["n_unique_tokens"].replace(0, 1e-9)

    # DataFrame.assign works on a copy, so the caller's frame stays untouched.
    return df.assign(
        keyword_strength_ratio=df["kw_max_avg"] / safe_kw_avg,
        img_token_ratio=df["num_imgs"] / safe_unique_tokens,
        subjectivity_sentiment_mix=df["global_subjectivity"] * df["global_rate_positive_words"],
    )

# Columns appended by add_derived_features(); together with the Top-15 list
# they form the feature set of this view.
derived_features = ["keyword_strength_ratio", "img_token_ratio", "subjectivity_sentiment_mix"]
selected_features = top15_features + derived_features

# Add the engineered columns to both splits.
X_full = add_derived_features(X)
test_full = add_derived_features(test_15)

# Train table: id first, then the selected features, target last.
train_top15 = X_full.loc[:, selected_features].copy()
train_top15.insert(0, "id", train_id)
train_top15["y"] = y

# Test table: id first, then the selected features (no target).
test_top15 = test_full.loc[:, selected_features].copy()
test_top15.insert(0, "id", test_id)

#train_top15 / test_top15

vif

In [3]:
# ---------------------------------------------------------------------------
# VIF feature view, train side.
# Builds `train_vif` and leaves behind the fitted state the test side reuses:
# `channel_cols`, `le`, `iqr_bounds`, `scaler` (and `df_2`, the train frame
# before the log/clip step).
# ---------------------------------------------------------------------------
# `df` is an alias of `train`, not a copy; `drop` below returns a new frame.
df = train

# Columns excluded from this view (presumably chosen by a VIF analysis that is
# not shown here), plus the target/leak/id columns.
df_1 = df.drop(columns=[
    'LDA_00','LDA_01','LDA_02','LDA_03',
    'rate_negative_words','n_non_stop_words',
    'self_reference_avg_sharess','n_unique_tokens',
    'kw_max_min','kw_avg_avg','y','shares','id'
])

# Handle missing values in the two categorical columns
df_1['data_channel'] = df_1['data_channel'].fillna('Missing')
df_1['weekday']      = df_1['weekday'].fillna('Missing')

# one-hot encoding (data_channel); drop_first removes the first level
df_one_hot = pd.get_dummies(
    df_1,
    columns=['data_channel'],
    prefix='channel',
    drop_first=True
)
# Dummy column names produced on train; the test side aligns to this list.
channel_cols = [c for c in df_one_hot.columns if c.startswith('channel_')]


# LabelEncoder (weekday)
# NOTE(review): LabelEncoder orders its classes by sorted label value, so the
# order of this list does not define the numeric codes — confirm that the
# alphabetical coding is intended.
weekday_order = [
    'Monday','Tuesday','Wednesday',
    'Thursday','Friday','Saturday','Sunday','Missing'
]
le = LabelEncoder().fit(weekday_order)
# +1 makes the codes start at 1 instead of 0.
df_one_hot['weekday_encoded'] = le.transform(df_one_hot['weekday']) + 1

# Drop the original weekday column and cast everything to float
df_2 = df_one_hot.drop(columns=['weekday']).astype(float)


# Columns that get the shift + log1p + IQR-clip treatment. 'kw_avg_avg' and
# 'kw_max_min' were dropped above; the `if col in df_proc.columns` guard in the
# loop skips them.
log1p_clip_cols = [
    'n_tokens_content', 'num_hrefs', 'num_imgs', 'num_videos',
    'kw_min_min', 'kw_min_avg', 'kw_min_max',
    'kw_avg_min', 'kw_avg_avg', 'kw_avg_max',
    'kw_max_min', 'kw_max_avg', 'kw_max_max',
    'self_reference_min_shares', 'self_reference_max_shares'
]
# Columns for which an extra "<col>_binary" indicator (value > 0) is added.
binary_split_cols = [
    'n_non_stop_unique_tokens',
    'kw_min_min', 'kw_avg_min', 'kw_min_avg', 'num_videos'
]
# Processed by the same loop as log1p_clip_cols (the two lists are concatenated).
log1p_clip_standard_cols = [
    'title_subjectivity', 'title_sentiment_polarity', 'abs_title_sentiment_polarity'
]
# NOTE(review): this list is not referenced anywhere else in the code shown;
# scaling below is applied to every non-categorical column regardless.
standard_only_cols = [
    'n_tokens_title', 'average_token_length', 'num_keywords', 'LDA_04',
    'global_subjectivity', 'global_sentiment_polarity', 'rate_positive_words',
    'avg_positive_polarity', 'min_positive_polarity', 'max_positive_polarity',
    'avg_negative_polarity', 'min_negative_polarity', 'max_negative_polarity',
    'abs_title_subjectivity'
]

# 2) Work on a copy so df_2 keeps the untransformed train values
df_proc = df_2.copy()

# 3) log1p transform + IQR clipping (bounds are computed and stored for reuse on test)
iqr_bounds = {}
for col in log1p_clip_cols + log1p_clip_standard_cols:
    if col in df_proc.columns:
        # (a) shift if needed, then log1p: when the column minimum is <= -1 the
        #     whole column is moved up so the log argument is positive
        min_val = df_proc[col].min()
        if min_val <= -1:
            df_proc[col] += abs(min_val) + 1.001
        df_proc[col] = np.log1p(df_proc[col])
        # (b) compute the IQR bounds on the log-transformed values
        q1 = df_proc[col].quantile(0.25)
        q3 = df_proc[col].quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        iqr_bounds[col] = (lower, upper)
        # (c) clip
        df_proc[col] = df_proc[col].clip(lower, upper)

# 4) binary split
# NOTE(review): this runs after step 3, so the "> 0" test is made on the
# shifted / log-transformed / clipped values, not on the raw ones — confirm
# this is intended for the columns that were shifted.
for col in binary_split_cols:
    if col in df_proc.columns:
        df_proc[col + '_binary'] = (df_proc[col] > 0).astype(int)

# 5) standard scaling (fit & transform) of every column except the weekday
#    code and the channel dummies; this includes the *_binary columns above
scaler = StandardScaler()
scale_targets = [
    c for c in df_proc.columns
    if c not in ['weekday_encoded'] + channel_cols
]
df_proc[scale_targets] = scaler.fit_transform(df_proc[scale_targets])

# 6) restore the id (first column) and y (last column)
id_col = df['id'].reset_index(drop=True)
y_col  = df['y'].reset_index(drop=True)
df_proc.insert(0, 'id', id_col)
df_proc['y'] = y_col

train_vif = df_proc


# ---------------------------------------------------------------------------
# VIF feature view, test side.
# Applies the preprocessing fitted on train. Everything learned from data
# (channel dummy columns, weekday encoder, log shift, IQR bounds, scaler)
# comes from the train step; nothing is re-estimated on the test rows.
# Inputs from the train step: channel_cols, le, df_2 (train frame before the
# log/clip step), iqr_bounds, scaler, and the three column lists.
# ---------------------------------------------------------------------------
df_raw = test
id_col = df_raw['id']

# 2) Drop the same columns as train and fill missing categoricals.
#    errors='ignore' because columns such as 'y' may be absent from the test file.
drop_cols = [
    'LDA_00','LDA_01','LDA_02','LDA_03',
    'rate_negative_words','n_non_stop_words',
    'self_reference_avg_sharess','n_unique_tokens',
    'kw_max_min','kw_avg_avg','y','shares','id'
]
df1 = df_raw.drop(columns=drop_cols, errors='ignore')
df1['data_channel'] = df1['data_channel'].fillna('Missing')
df1['weekday']      = df1['weekday'].fillna('Missing')

# 3) One-hot encode data_channel, then align to the train dummy columns.
#    drop_first is deliberately NOT used here: it drops the first level present
#    in *this* frame, which is only the train baseline if the test split
#    happens to contain the same levels. Instead every level is encoded,
#    columns train does not have (its baseline, or unseen levels) are removed,
#    and train columns missing from test are added as all-zero.
df1 = pd.get_dummies(df1, columns=['data_channel'], prefix='channel')
extra_channel_cols = [
    c for c in df1.columns
    if c.startswith('channel_') and c not in channel_cols
]
df1 = df1.drop(columns=extra_channel_cols)
for c in channel_cols:   # list defined on train
    if c not in df1.columns:
        df1[c] = 0

# 4) Apply the weekday LabelEncoder fitted on train (codes start at 1).
df1['weekday_encoded'] = le.transform(df1['weekday']) + 1
df1 = df1.drop(columns=['weekday'])

# 5) Cast to float
df2 = df1.astype(float)

# 6) log1p transform + IQR clipping with train-derived parameters
df_test = df2.copy()
for col in log1p_clip_cols + log1p_clip_standard_cols:
    if col in df_test.columns:
        # (a) Use the TRAIN minimum to decide on (and size) the shift, so a
        #     given raw value is transformed identically in train and test.
        #     df_2 holds the train values before any log/clip was applied.
        train_min = df_2[col].min()
        # Test values below the train minimum are clamped to it so the log
        # argument stays in the range the train transform was defined on.
        values = df_test[col].clip(lower=train_min)
        if train_min <= -1:
            # Parenthesised to match the train arithmetic exactly.
            values = values + (abs(train_min) + 1.001)
        # (b) log transform
        df_test[col] = np.log1p(values)
        # (c) clip with the lower/upper bounds computed on train
        low, high = iqr_bounds[col]
        df_test[col] = df_test[col].clip(low, high)

# 7) binary split (evaluated on the transformed values, as on train)
for col in binary_split_cols:
    if col in df_test.columns:
        df_test[col + '_binary'] = (df_test[col] > 0).astype(int)

# 8) standard scaling with the scaler fitted on train
scale_targets = [c for c in df_test.columns
                 if c not in ['weekday_encoded'] + channel_cols]
df_test[scale_targets] = scaler.transform(df_test[scale_targets])

# 9) put the id column back as the first column
df_test.insert(0, 'id', id_col)

test_vif = df_test

ori+top15+vif

In [4]:
# =============================================================================
# Combined feature set: VIF view + Top-15 columns + features derived from the
# raw ("origin") data. Produces `train_final` and `test_final`.
#
# The same helpers are used for train and test; every data-dependent
# parameter (fill values, 99th-percentile caps, scalers) is fitted on train
# and only applied to test. The raw `train` / `test` frames are never
# modified, so the earlier cells can be re-run safely after this one.
# =============================================================================

# Top-15 columns that are not part of the VIF view and are added back.
cols_to_add = [
    'kw_avg_avg',
    'self_reference_avg_sharess',
    'LDA_00',
    'LDA_02',
]

lda_cols = ['LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04']

# Features computed from the raw frame, in the order they are merged in.
derived_cols = [
    'token_density',
    'lda_entropy',
    'sentiment_ratio',
    'emotion_contrast',
    'kw_maxmax_per_keyword',
    'unique_token_ratio',
    'ref_share_ratio',
    'share_per_token',
    'kw_variety_ratio',
    'positive_minus_negative',
    'sentiment_weighted_subjectivity'
]

# Columns removed again after merging (identical for train and test).
cols_to_drop = [
    'kw_max_max',
    'positive_minus_negative',
    'global_sentiment_polarity',
    'emotion_contrast'
]

# (source column, name of its clipped-log column). The clipped-column suffix
# is not uniform ("_log_clipped" vs "_log_clip"); the names are kept exactly
# because they end up as model feature names.
log_clip_scale_specs = [
    ('kw_avg_avg',                 'kw_avg_avg_log_clipped'),
    ('self_reference_avg_sharess', 'self_reference_avg_sharess_log_clipped'),
    ('token_density',              'token_density_log_clip'),
    ('kw_maxmax_per_keyword',      'kw_maxmax_per_keyword_log_clipped'),
    ('unique_token_ratio',         'unique_token_ratio_log_clip'),
    ('ref_share_ratio',            'ref_share_ratio_log_clip'),
]


def add_origin_derived_features(origin):
    """Return a copy of the raw frame with the `derived_cols` features added.

    The input frame is left untouched. Small constants in the denominators
    avoid division by exactly zero.
    """
    out = origin.copy()

    out['token_density'] = out['n_tokens_content'] / (out['num_keywords'] + 1)

    # Entropy over the five LDA columns; values are floored at 1e-6 so that
    # log2 is finite.
    lda_probs = out[lda_cols].clip(lower=1e-6)
    out['lda_entropy'] = -np.sum(lda_probs * np.log2(lda_probs), axis=1)

    out['sentiment_ratio'] = out['global_rate_positive_words'] / (out['global_rate_negative_words'] + 1e-5)
    out['emotion_contrast'] = np.abs(out['avg_positive_polarity'] - out['avg_negative_polarity'])

    out['kw_maxmax_per_keyword'] = out['kw_max_max'] / (out['num_keywords'] + 1e-5)
    out['unique_token_ratio'] = out['n_non_stop_unique_tokens'] / (out['n_tokens_content'] + 1e-5)
    out['ref_share_ratio'] = out['self_reference_max_shares'] / (out['self_reference_avg_sharess'] + 1e-5)
    out['share_per_token'] = out['self_reference_avg_sharess'] / (out['n_tokens_content'] + 1e-5)
    out['kw_variety_ratio'] = out['kw_max_max'] / (out['kw_min_min'] + 1e-5)
    out['positive_minus_negative'] = out['avg_positive_polarity'] - out['avg_negative_polarity']
    out['sentiment_weighted_subjectivity'] = out['global_sentiment_polarity'] * out['global_subjectivity']
    return out


def merge_feature_views(vif_frame, top15_frame, origin_frame):
    """Left-join the Top-15 extras and the origin-derived features onto the VIF view.

    Both joins are on 'id'. Infinite values are turned into NaN so the caller
    can impute them together with the other missing values.
    """
    merged = vif_frame.merge(top15_frame[['id'] + cols_to_add], on='id', how='left')
    origin_derived = add_origin_derived_features(origin_frame)
    merged = merged.merge(origin_derived[['id'] + derived_cols], on='id', how='left')
    return merged.replace([np.inf, -np.inf], np.nan)


def fit_log_clip_scale(frame, source_col, clipped_col):
    """Add '<source>_log', `clipped_col` and '<source>_scaled' to `frame` in place.

    The log column is log1p(source), the clipped column caps it at its 99th
    percentile, and the scaled column standardises the clipped values.
    Returns the (cap, fitted StandardScaler) pair so the identical transform
    can be applied to another frame.
    """
    log_col = source_col + '_log'
    frame[log_col] = np.log1p(frame[source_col])
    upper = frame[log_col].quantile(0.99)
    frame[clipped_col] = frame[log_col].clip(upper=upper)
    fitted_scaler = StandardScaler().fit(frame[[clipped_col]])
    frame[source_col + '_scaled'] = fitted_scaler.transform(frame[[clipped_col]])
    return upper, fitted_scaler


def apply_log_clip_scale(frame, source_col, clipped_col, upper, fitted_scaler):
    """Apply a transform fitted by fit_log_clip_scale() to `frame` in place."""
    log_col = source_col + '_log'
    frame[log_col] = np.log1p(frame[source_col])
    frame[clipped_col] = frame[log_col].clip(upper=upper)
    frame[source_col + '_scaled'] = fitted_scaler.transform(frame[[clipped_col]])


# ----------------------------- train ---------------------------------------
train_final = merge_feature_views(train_vif, train_top15, train)
# Impute remaining NaN (including former infinities) with the column means,
# then remove the columns that are not wanted in the final set.
train_final = train_final.fillna(train_final.mean()).drop(columns=cols_to_drop)

# Fit the log -> clip -> scale transform per column and keep its parameters.
log_clip_params = {}
for source_col, clipped_col in log_clip_scale_specs:
    log_clip_params[source_col] = fit_log_clip_scale(train_final, source_col, clipped_col)

# ----------------------------- test ----------------------------------------
test_final = merge_feature_views(test_vif, test_top15, test)
# Fill with TRAIN means (computed on the finished train frame); columns that
# are dropped right afterwards do not need a fill value.
test_final = test_final.fillna(train_final.mean()).drop(columns=cols_to_drop)

# Same log -> clip -> scale transform, using the caps and scalers from train.
for source_col, clipped_col in log_clip_scale_specs:
    upper, fitted_scaler = log_clip_params[source_col]
    apply_log_clip_scale(test_final, source_col, clipped_col, upper, fitted_scaler)

# Re-assign the id column from the raw test frame (index-aligned).
test_final["id"] = test["id"]

## 2. Model

In [None]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import warnings
import os

# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# --- Hyper-parameters selected by an earlier grid search (L2-regularised model) ---
# Keys follow the "<estimator>__<parameter>" naming of the stacking pipeline.
tuned_stacking_params = dict(
    rf__n_estimators=100,
    rf__max_depth=10,
    xgb__n_estimators=100,
    xgb__learning_rate=0.05,
    lgbm__n_estimators=100,
    lgbm__num_leaves=31,
    catb__iterations=100,
    catb__depth=4,
    final_estimator__C=0.1,
    passthrough=False,  # setting used for the "fin1" variant
)

# --- L2 regularisation strengths of the boosted base learners ---
L2_XGB_LAMBDA, L2_LGBM_LAMBDA, L2_CATB_LEAF_REG = 1.0, 0.1, 3.0

# --- Build the model inputs from the preprocessed frames ---
# `train_final` (features + 'id' + 'y') and `test_final` (features + 'id') are
# produced by the preprocessing section above. They are only read here, never
# reassigned, so this cell can be re-run without re-running the preprocessing.

# --- Prepare X_train, y_train from train_final DataFrame ---
if 'y' not in train_final.columns:
    raise ValueError("Target column 'y' not found in train_final DataFrame.")
y_train = train_final['y']
# 'id' is an identifier, not a feature. errors='ignore' keeps this working
# even if 'id' has already been removed from train_final.
X_train = train_final.drop(columns=['y']).drop(columns=['id'], errors='ignore')

# --- Prepare X_test and test_ids from test_final DataFrame ---
if 'id' not in test_final.columns:
    raise ValueError("'id' column not found in test_final DataFrame.")
test_ids = test_final['id']
# Every column of test_final other than 'id' is treated as a feature.
X_test = test_final.drop(columns=['id'])


# --- Ensure feature consistency (columns and order) ---
# Train and test must expose the same features in the same order. Both frames
# are restricted to (and ordered by) their common columns below.
common_features = X_train.columns.intersection(X_test.columns)
if len(common_features) != X_train.shape[1] or len(common_features) != X_test.shape[1]:
    print("Warning: Feature mismatch between train and test. Using common features only.")
    # Name the offending columns so the mismatch can be traced to its source.
    print(f"  Only in train: {list(X_train.columns.difference(X_test.columns))}")
    print(f"  Only in test:  {list(X_test.columns.difference(X_train.columns))}")
    # A different number of features is treated as a hard error.
    if X_train.shape[1] != X_test.shape[1]:
        raise ValueError(f"Feature count mismatch: X_train has {X_train.shape[1]} features, X_test has {X_test.shape[1]} features after dropping 'id'. Ensure they match.")


X_train = X_train[common_features]
X_test = X_test[common_features]


# --- Assemble the L2-regularised stacking ensemble ---
print("Defining the final L2 regularized Stacking model...")

_tuned = tuned_stacking_params  # short alias for the tuned hyper-parameters

# Base learner 1: random forest
rf_final_l2 = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=_tuned['rf__n_estimators'],
    max_depth=_tuned['rf__max_depth'],
)

# Base learner 2: XGBoost with L2 penalty (reg_lambda)
xgb_final_l2 = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    reg_lambda=L2_XGB_LAMBDA,
    n_estimators=_tuned['xgb__n_estimators'],
    learning_rate=_tuned['xgb__learning_rate'],
)

# Base learner 3: LightGBM with L2 penalty (reg_lambda)
lgbm_final_l2 = LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    verbosity=-1,
    reg_lambda=L2_LGBM_LAMBDA,
    n_estimators=_tuned['lgbm__n_estimators'],
    num_leaves=_tuned['lgbm__num_leaves'],
)

# Base learner 4: CatBoost with L2 leaf regularisation
catb_final_l2 = CatBoostClassifier(
    random_seed=42,
    verbose=0,
    l2_leaf_reg=L2_CATB_LEAF_REG,
    iterations=_tuned['catb__iterations'],
    depth=_tuned['catb__depth'],
)

# Meta-learner: L2-penalised logistic regression on the base learners' outputs
lr_final_l2 = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    C=_tuned['final_estimator__C'],
)

# Folds used inside the stacker to produce the meta-learner's training inputs.
stacking_cv_splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

base_learners = [
    ('rf', rf_final_l2),
    ('xgb', xgb_final_l2),
    ('lgbm', lgbm_final_l2),
    ('catb', catb_final_l2),
]
final_stacking_model_l2 = StackingClassifier(
    estimators=base_learners,
    final_estimator=lr_final_l2,
    cv=stacking_cv_splitter,
    passthrough=_tuned['passthrough'],
)

# Probability cut-off used to turn the positive-class probability into a label.
custom_threshold = 0.47

# --- Fit the stacking model on the full training set ---
print(f"Training the final L2 regularized Stacking model on train_final (X shape: {X_train.shape}, y shape: {y_train.shape})...")
final_stacking_model_l2.fit(X_train, y_train)
print("Training complete.")

# --- Score the test set ---
print(f"Predicting on test_final (X shape: {X_test.shape})...")
# Column 1 of predict_proba is the probability of the positive class.
test_pred_proba = final_stacking_model_l2.predict_proba(X_test)[:, 1]
# Boolean mask -> 0/1 labels.
test_pred_y = (test_pred_proba >= custom_threshold).astype(int)
print("Prediction complete.")

# --- Collect ids, labels and probabilities in one table ---
results_df = pd.DataFrame(dict(
    id=test_ids,
    y_predict=test_pred_y,
    y_prob=test_pred_proba,
))

# --- Write the table to disk (no index column) ---
output_filename = 'prediction.csv'
results_df.to_csv(output_filename, index=False)
print(f"Results saved to {output_filename}")

print("\nScript finished.")

Defining the final L2 regularized Stacking model...
Training the final L2 regularized Stacking model on train_final (X shape: (22200, 75), y shape: (22200,))...
Training complete.
Predicting on test_final (X shape: (9515, 75))...
Prediction complete.
Results saved to prediction.csv

Script finished.
