In [1]:
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


In [2]:
%%time
# 1. 数据加载
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./testA_data.csv')
submit = test_df[['did']]

print(f"训练集大小: {train_df.shape}")
print(f"测试集大小: {test_df.shape}")

# 检查训练集和测试集中的did是否有重叠
train_dids = set(train_df['did'])
test_dids = set(test_df['did'])
common_dids = train_dids & test_dids
print(f"训练集和测试集中重叠的did数量: {len(common_dids)}")
print(f"重叠比例: {len(common_dids)/len(test_dids):.4f}")

# 2. 时间特征工程
for df in [train_df, test_df]:
    # 转换为时间戳
    df['ts'] = pd.to_datetime(df['common_ts'], unit='ms')
    
    # 提取时间特征
    df['day'] = df['ts'].dt.day
    df['dayofweek'] = df['ts'].dt.dayofweek
    df['hour'] = df['ts'].dt.hour


训练集大小: (3429925, 15)
测试集大小: (1143309, 14)
训练集和测试集中重叠的did数量: 192393
重叠比例: 0.9324
CPU times: total: 7.75 s
Wall time: 7.75 s


In [3]:
%%time
# 3. RFM 特征工程 - 修复数据泄露问题
print("修复数据泄露：仅使用训练集计算RFM特征...")

# 3.1 仅在训练集上计算RFM特征
max_ts_train = train_df['ts'].max()
rfm_agg_train = train_df.groupby('did').agg({
    'ts': lambda x: (max_ts_train - x.max()).days,  # Recency
    'eid': 'count',  # Frequency
    'mid': 'nunique',  # 行为深度
    'common_ts': ['min', 'max']  # 首次和末次行为时间
})

# 3.2 列名扁平化处理
rfm_agg_train.columns = ['_'.join(col).strip() for col in rfm_agg_train.columns.values]
rfm_agg_train = rfm_agg_train.reset_index()

# 3.3 特征重命名
rfm_agg_train.rename(columns={
    'ts_<lambda>': 'recency',
    'eid_count': 'frequency',
    'mid_nunique': 'mid_nunique',
    'common_ts_min': 'first_action_ts',
    'common_ts_max': 'last_action_ts'
}, inplace=True)

# 3.4 派生新特征
rfm_agg_train['action_timespan_seconds'] = (rfm_agg_train['last_action_ts'] - rfm_agg_train['first_action_ts']) / 1000

# 3.5 合并RFM特征
# 训练集：直接合并
train_df = pd.merge(train_df, rfm_agg_train, on='did', how='left')

# 测试集：只对在训练集中出现过的did合并特征，新did的RFM特征设为默认值
test_df = pd.merge(test_df, rfm_agg_train, on='did', how='left')

# 对于测试集中新的did，填充RFM特征的默认值
rfm_features = ['recency', 'frequency', 'mid_nunique', 'first_action_ts', 'last_action_ts', 'action_timespan_seconds']
for feature in rfm_features:
    if feature in ['recency']:
        test_df[feature].fillna(train_df[feature].median(), inplace=True)
    elif feature in ['frequency', 'mid_nunique']:
        test_df[feature].fillna(train_df[feature].median(), inplace=True)
    else:
        test_df[feature].fillna(train_df[feature].median(), inplace=True)

print(f"训练集中RFM特征缺失值情况:")
print(train_df[rfm_features].isnull().sum())
print(f"\n测试集中RFM特征缺失值情况:")
print(test_df[rfm_features].isnull().sum())

# 3.6 清理不再需要的 ts 列
for df in [train_df, test_df]:
    df.drop(['ts'], axis=1, inplace=True)


修复数据泄露：仅使用训练集计算RFM特征...
训练集中RFM特征缺失值情况:
recency                    0
frequency                  0
mid_nunique                0
first_action_ts            0
last_action_ts             0
action_timespan_seconds    0
dtype: int64

测试集中RFM特征缺失值情况:
recency                    0
frequency                  0
mid_nunique                0
first_action_ts            0
last_action_ts             0
action_timespan_seconds    0
dtype: int64
CPU times: total: 15.2 s
Wall time: 15.3 s


In [11]:
print("每一列的数据类型:")
print(train_df.dtypes)

每一列的数据类型:
mid                          int64
eid                          int64
did                         object
device_brand                 int32
ntt                          int32
operator                     int32
common_country               int32
common_province              int32
common_city                  int32
appver                       int32
channel                      int32
common_ts                    int64
os_type                      int32
udmap                        int32
is_new_did                   int64
day                          int32
dayofweek                    int32
hour                         int32
recency                      int64
frequency                    int64
mid_nunique                  int64
first_action_ts              int64
last_action_ts               int64
action_timespan_seconds    float64
dtype: object


In [4]:
%%time
# 4. 类别特征编码
cat_features = [
    'device_brand', 'ntt', 'operator', 'common_country',
    'common_province', 'common_city', 'appver', 'channel',
    'os_type', 'udmap'
]

label_encoders = {}

for feature in cat_features:
    le = LabelEncoder()
    
    # 合并训练集和测试集的所有类别
    all_values = pd.concat([train_df[feature], test_df[feature]]).astype(str)
    
    le.fit(all_values)
    label_encoders[feature] = le
    
    # 应用编码
    train_df[feature] = le.transform(train_df[feature].astype(str))
    test_df[feature] = le.transform(test_df[feature].astype(str))


CPU times: total: 17.6 s
Wall time: 17.7 s


In [5]:
%%time
# 5. 特征准备 - 注意：did不包含在特征中！
features = [
    # 原始特征
    'mid', 'eid', 'device_brand', 'ntt', 'operator', 
    'common_country', 'common_province', 'common_city',
    'appver', 'channel', 'os_type', 'udmap',
    # 时间特征
    'hour', 'dayofweek', 'day', 'common_ts',
    # RFM特征（基于训练集计算，避免数据泄露）
    'recency', 'frequency', 'mid_nunique', 'first_action_ts', 'last_action_ts', 'action_timespan_seconds'
]

print(f"使用的特征数量: {len(features)}")
print(f"特征列表: {features}")
print(f"\n重要提示: did未被用作特征，RFM特征仅基于训练集计算")

# 准备训练和测试数据
X_train = train_df[features]
y_train = train_df['is_new_did']
X_test = test_df[features]


使用的特征数量: 22
特征列表: ['mid', 'eid', 'device_brand', 'ntt', 'operator', 'common_country', 'common_province', 'common_city', 'appver', 'channel', 'os_type', 'udmap', 'hour', 'dayofweek', 'day', 'common_ts', 'recency', 'frequency', 'mid_nunique', 'first_action_ts', 'last_action_ts', 'action_timespan_seconds']

重要提示: did未被用作特征，RFM特征仅基于训练集计算
CPU times: total: 188 ms
Wall time: 178 ms


In [6]:
# F1阈值优化函数
def find_optimal_threshold(y_true, y_pred_proba):
    """寻找最大化F1分数的阈值"""
    best_threshold = 0.5
    best_f1 = 0
    
    for threshold in [0.1,0.15,0.2,0.25,0.3,0.35,0.4]:
        y_pred = (y_pred_proba >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    return best_threshold, best_f1

# 模型参数
optimal_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbose': -1,
    'n_jobs': 8,
    'seed': 42,
    'max_depth': 15,
    'num_leaves': 255,
    'learning_rate': 0.1,
    'min_child_samples': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 3
}


In [7]:
%%time
# 模型训练（修复数据泄露后）
print("开始训练修复数据泄露后的模型...")

n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))
fold_thresholds = []
fold_f1_scores = []
models = []
oof_preds = np.zeros(len(X_train))
oof_probas = np.zeros(len(X_train))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n======= Fold {fold+1}/{n_folds} =======")
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # 创建数据集
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val)
    
    # 模型训练
    model = lgb.train(
        optimal_params, train_set,
        num_boost_round=1000,
        valid_sets=[train_set, val_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=100)
        ]
    )
    models.append(model)
    
    # 验证集预测
    val_pred_proba = model.predict(X_val)
    oof_probas[val_idx] = val_pred_proba
    
    # 阈值优化
    best_threshold, best_f1 = find_optimal_threshold(y_val, val_pred_proba)
    fold_thresholds.append(best_threshold)
    
    # 使用优化阈值计算F1
    val_pred_labels = (val_pred_proba >= best_threshold).astype(int)
    fold_f1 = f1_score(y_val, val_pred_labels)
    fold_f1_scores.append(fold_f1)
    oof_preds[val_idx] = val_pred_labels
    
    print(f"Fold {fold+1} Optimal Threshold: {best_threshold:.4f}")
    print(f"Fold {fold+1} F1 Score: {fold_f1:.5f}")
    
    # 测试集预测
    test_preds += model.predict(X_test) / n_folds


开始训练修复数据泄露后的模型...

[100]	training's binary_logloss: 0.133373	valid_1's binary_logloss: 0.13612
[200]	training's binary_logloss: 0.111356	valid_1's binary_logloss: 0.115919
[300]	training's binary_logloss: 0.0978509	valid_1's binary_logloss: 0.103994
[400]	training's binary_logloss: 0.0870139	valid_1's binary_logloss: 0.0944971
[500]	training's binary_logloss: 0.0777062	valid_1's binary_logloss: 0.086248
[600]	training's binary_logloss: 0.0706348	valid_1's binary_logloss: 0.0802052
[700]	training's binary_logloss: 0.0641976	valid_1's binary_logloss: 0.0746866
[800]	training's binary_logloss: 0.0585294	valid_1's binary_logloss: 0.0699452
[900]	training's binary_logloss: 0.0536773	valid_1's binary_logloss: 0.0659131
[1000]	training's binary_logloss: 0.0488009	valid_1's binary_logloss: 0.0616269
Fold 1 Optimal Threshold: 0.4000
Fold 1 F1 Score: 0.93618

[100]	training's binary_logloss: 0.132954	valid_1's binary_logloss: 0.135626
[200]	training's binary_logloss: 0.111458	valid_1's binary_lo

In [8]:
# 整体结果评估
avg_threshold = np.mean(fold_thresholds)
final_oof_preds = (oof_probas >= avg_threshold).astype(int)
final_f1 = f1_score(y_train, final_oof_preds)

print("\n===== 修复数据泄露后的最终结果 =====")
print(f"Average Optimal Threshold: {avg_threshold:.4f}")
print(f"Fold F1 Scores: {[f'{s:.5f}' for s in fold_f1_scores]}")
print(f"Average Fold F1: {np.mean(fold_f1_scores):.5f}")
print(f"OOF F1 Score: {final_f1:.5f}")

# 测试集预测与提交文件生成
test_pred_labels = (test_preds >= avg_threshold).astype(int)
submit['is_new_did'] = test_pred_labels

# 保存提交文件
submit[['is_new_did']].to_csv('submit_fixed.csv', index=False)
print("\n修复数据泄露后的提交文件已保存: submit_fixed.csv")
print(f"预测新用户比例: {test_pred_labels.mean():.4f}")
print(f"测试集大小: {len(test_pred_labels)}")

print("\n===== 数据泄露修复总结 =====")
print("1. ✅ did未被用作特征")
print("2. ✅ RFM特征仅基于训练集计算")
print("3. ✅ 测试集中新did的RFM特征用训练集统计值填充")
print("4. ✅ 避免了future information leakage")



===== 修复数据泄露后的最终结果 =====
Average Optimal Threshold: 0.4000
Fold F1 Scores: ['0.93618', '0.93472', '0.93644', '0.93466', '0.93542']
Average Fold F1: 0.93549
OOF F1 Score: 0.93549

修复数据泄露后的提交文件已保存: submit_fixed.csv
预测新用户比例: 0.1565
测试集大小: 1143309

===== 数据泄露修复总结 =====
1. ✅ did未被用作特征
2. ✅ RFM特征仅基于训练集计算
3. ✅ 测试集中新did的RFM特征用训练集统计值填充
4. ✅ 避免了future information leakage
