In [12]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
import os
import gc

In [13]:
print("--- Loading Data ---")

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# External reference dataset. Per the original author's note, its numeric
# columns were already mean-imputed and its categorical columns mode-imputed.
datasert_df = pd.read_csv('personality_dataset.csv')

print("--- Engineering 'match_p' feature using merge ---")

# The seven behavioural columns used as the join key between the tables.
merge_cols = [
    'Time_spent_Alone',
    'Stage_fear',
    'Social_event_attendance',
    'Going_outside',
    'Drained_after_socializing',
    'Friends_circle_size',
    'Post_frequency',
]

# Keep one reference row per distinct key so the left joins below cannot
# multiply rows; the reference label travels along under the name 'match_p'.
datasert_df_prep = (
    datasert_df
    .rename(columns={'Personality': 'match_p'})
    .drop_duplicates(subset=merge_cols)
)

# Rows without an exact match in the reference data get match_p = NaN.
train_df, test_df = (
    frame.merge(datasert_df_prep, how='left', on=merge_cols)
    for frame in (train_df, test_df)
)

--- Loading Data ---
--- Engineering 'match_p' feature using merge ---


In [14]:
print("--- Performing Imputation and Initial Feature Prep ---")
# Set the identifiers and the target aside before the feature tables are combined.
train_id = train_df['id']
test_id = test_df['id']
y_train_series = train_df['Personality']

# Stack the train features on top of the test features (train rows first) so
# every later transformation is applied to both in a single pass.
train_features = train_df.drop(columns=['id', 'Personality'])
test_features = test_df.drop(columns=['id'])
all_data = pd.concat([train_features, test_features], ignore_index=True)

--- Performing Imputation and Initial Feature Prep ---


In [15]:
def fill_missing_by_quantile_group(df, group_source_col, target_col):
    """
    Fill missing values of ``target_col`` with per-quartile-group medians.

    Rows are binned by the quartiles of ``group_source_col`` (0-25%, 25-50%,
    50-75%, 75-100%). The median of ``target_col`` is computed inside each
    bin, and every missing ``target_col`` value is replaced by the median of
    the bin its row falls into. This is finer-grained than a single global
    median when the two columns are correlated.

    Values that cannot be filled are left as NaN:
      * rows whose ``group_source_col`` value is itself missing,
      * rows in a bin where every ``target_col`` value is missing.

    If ``group_source_col`` has fewer than two distinct non-missing values,
    no quantile grouping is possible and ``df`` is returned unchanged.

    Parameters:
        df (pd.DataFrame): Frame to impute. Its ``target_col`` column is
            overwritten in place.
        group_source_col (str): Numeric column whose quartiles define the groups.
        target_col (str): Column containing the missing values to fill.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``target_col`` imputed.
    """
    source = df[group_source_col]
    if source.nunique(dropna=True) < 2:
        # All-missing or constant source: there are no quantile edges to bin on.
        return df

    # labels=False makes qcut return integer bin codes. A fixed list of four
    # labels would raise ValueError whenever duplicates='drop' merges tied
    # quantile edges and fewer than four bins remain.
    bins = pd.qcut(source,
                   q=[0, 0.25, 0.5, 0.75, 1.0],
                   labels=False,
                   duplicates='drop')

    group_medians = df[target_col].groupby(bins).transform('median')
    # Rows with a missing source value belong to no bin; mask them explicitly
    # so they are never filled, independent of how the installed pandas
    # version treats NaN group keys in transform().
    group_medians = group_medians.where(bins.notna())

    df[target_col] = df[target_col].fillna(group_medians)
    return df

In [16]:
# Staged imputation of the numeric columns.
# For each target column, first fill from the quartile-group medians of one or
# more columns assumed to be related to it (tried in the listed order), then
# fill whatever is still missing with the target's global median.
# Targets are processed in this order because later steps bin on columns
# that earlier steps have already completed.
imputation_plan = [
    # (target column, [source columns used for quartile grouping, in order])
    ('Time_spent_Alone', ['Social_event_attendance', 'Going_outside']),
    ('Social_event_attendance', ['Going_outside', 'Friends_circle_size']),
    ('Going_outside', ['Social_event_attendance']),
    ('Friends_circle_size', ['Post_frequency', 'Going_outside']),
    ('Post_frequency', ['Friends_circle_size']),
]

for target_col, source_cols in imputation_plan:
    for source_col in source_cols:
        all_data = fill_missing_by_quantile_group(all_data, source_col, target_col)
    # Assign the filled column back explicitly. Calling
    # all_data[col].fillna(..., inplace=True) operates on a column selection
    # and does not update all_data under pandas copy-on-write semantics.
    all_data[target_col] = all_data[target_col].fillna(all_data[target_col].median())

In [17]:
# Names of the columns that hold numeric data at this point, captured before
# any derived columns are added to all_data.
numerical_features = list(all_data.select_dtypes(include=np.number).columns)

In [18]:
print("--- Creating Final Advanced Features ---")
# Row-wise summary statistics over the numeric columns, added as
# social_mean, social_std and social_sum (in that order).
numeric_block = all_data[numerical_features]
for stat_name in ('mean', 'std', 'sum'):
    all_data[f'social_{stat_name}'] = getattr(numeric_block, stat_name)(axis=1)

--- Creating Final Advanced Features ---


In [19]:
# Cluster-membership feature: fit KMeans (5 clusters, fixed seed) on the
# numeric columns of the combined train+test frame and store each row's
# assigned cluster id.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(all_data[numerical_features])
all_data['cluster'] = kmeans.labels_

In [20]:
# Degree-2 polynomial expansion of the numeric columns.
# interaction_only=False generates both the squares (a^2, b^2) and the
# pairwise products (a*b); interaction_only=True would keep only the products.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly_features = poly.fit_transform(all_data[numerical_features])
poly_df = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(numerical_features),
)
# The expansion also repeats the degree-1 inputs; drop them so the original
# columns are not duplicated when the new terms are appended.
poly_df = poly_df.drop(columns=numerical_features)
all_data = pd.concat([all_data.reset_index(drop=True), poly_df], axis=1)

In [21]:
# Missing categorical values become their own 'Unknown' level.
categorical_fill = {
    'Stage_fear': 'Unknown',
    'Drained_after_socializing': 'Unknown',
    'match_p': 'Unknown',
}
all_data = all_data.fillna(categorical_fill)

# One-hot encode the categorical columns and the cluster id; each key is the
# column to encode and each value is the prefix of its dummy columns.
dummy_prefixes = {
    'Stage_fear': 'Stage',
    'Drained_after_socializing': 'Drained',
    'match_p': 'match',
    'cluster': 'cluster',
}
all_data = pd.get_dummies(
    all_data,
    columns=list(dummy_prefixes.keys()),
    prefix=list(dummy_prefixes.values()),
)

# all_data was built with the train rows first, so a positional split at
# len(train_df) recovers the two feature matrices.
n_train = len(train_df)
X = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]

# Integer-encode the target labels.
y_encoded = LabelEncoder().fit_transform(y_train_series)

In [22]:
print("\n--- Training Final Ensemble with New Advanced Features ---")
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build and is deprecated
# in XGBoost >= 2.0 in favour of tree_method='hist' with device='cuda' --
# confirm against the installed version before changing.
xgb_params = { 'objective': 'binary:logistic',
                'tree_method': 'gpu_hist',
                'random_state': 42,
                'n_estimators': 1000,
                'learning_rate': 0.01,
                'max_depth': 7,
                'subsample': 0.7,
                'colsample_bytree': 0.7 }

# 'accuracy' is not a metric name LightGBM recognises; 'binary_error'
# (misclassification rate, i.e. 1 - accuracy) is the built-in equivalent.
# The metric is only used for evaluation reporting, and no eval set is passed
# to fit() below.
lgb_params = { 'objective': 'binary',
                'metric': 'binary_error',
                'n_estimators': 1000,
                'random_state': 42,
                'verbose': -1,
                'learning_rate': 0.025,
                'num_leaves': 150 }

cat_params = { 'objective': 'Logloss',
                'iterations': 1000,
                'random_seed': 42,
                'verbose': 0,
                'learning_rate': 0.0212,
                'depth': 10 }

models = {'xgb': xgb.XGBClassifier(**xgb_params),
            'lgb': LGBMClassifier(**lgb_params),
            'cat': CatBoostClassifier(**cat_params)}

# Out-of-fold (OOF) predictions: each training row is scored only by the model
# of the fold in which that row was held out.
oof_preds = {name: np.zeros(len(X)) for name in models.keys()}
# Test predictions are accumulated as the mean over the per-fold models.
test_preds = {name: np.zeros(len(X_test)) for name in models.keys()}

N_SPLITS = 10

kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_encoded)):
    print(f"--- FOLD {fold+1}/{N_SPLITS} ---")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    for name, model in models.items():
        print(f"  Training {name}...")
        # The same estimator object is re-fitted on every fold.
        model.fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the class encoded as 1.
        oof_preds[name][val_idx] = model.predict_proba(X_val)[:, 1]
        test_preds[name] += model.predict_proba(X_test)[:, 1] / N_SPLITS
    gc.collect()


--- Training Final Ensemble with New Advanced Features ---
--- FOLD 1/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 2/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 3/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 4/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 5/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 6/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 7/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 8/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 9/10 ---
  Training xgb...
  Training lgb...
  Training cat...
--- FOLD 10/10 ---
  Training xgb...
  Training lgb...
  Training cat...


In [24]:
print("\n--- Stacking and Creating Final Submission ---")
# Level-1 inputs: one column per base model, holding its out-of-fold
# probabilities (train) and its fold-averaged probabilities (test).
oof_df = pd.DataFrame(oof_preds)
test_preds_df = pd.DataFrame(test_preds)

# Logistic-regression meta-learner stacked on the base-model probabilities.
meta_model = LogisticRegression(random_state=42)
meta_model.fit(oof_df, y_encoded)
final_predictions = meta_model.predict(test_preds_df)

# Decode with an encoder fitted on the actual training labels rather than a
# hard-coded class list, so the integer -> label mapping is guaranteed to be
# the one that produced y_encoded.
le_final = LabelEncoder().fit(y_train_series)
final_labels = le_final.inverse_transform(final_predictions)

submission_df = pd.DataFrame({'id': test_id, 'Personality': final_labels})
submission_df.to_csv('submission_final_advanced_feats_v2.csv', index=False)
print("\n✅ Final submission file 'submission_final_advanced_feats_v2.csv' created!")


--- Stacking and Creating Final Submission ---

✅ Final submission file 'submission_final_advanced_feats_v2.csv' created!
