In [6]:
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

print("增强特征工程版本 - 深度挖掘所有字段！")


增强特征工程版本 - 深度挖掘所有字段！


In [7]:
%%time
# 1. 数据加载和基础分析
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./testA_data.csv')
submit = test_df[['did']]

print(f"训练集大小: {train_df.shape}")
print(f"测试集大小: {test_df.shape}")

# 检查udmap字段（验证是否为空）
print(f"\n=== udmap 字段分析 ===")
def check_udmap(df, name):
    empty_count = 0
    valid_count = 0
    for udmap in df['udmap'].dropna():
        try:
            parsed = json.loads(udmap)
            if len(parsed) == 0:
                empty_count += 1
            else:
                valid_count += 1
        except:
            pass
    print(f"{name} - 空JSON: {empty_count}, 非空JSON: {valid_count}")

check_udmap(train_df, "训练集")
check_udmap(test_df, "测试集")

# 检查did重叠情况
train_dids = set(train_df['did'])
test_dids = set(test_df['did'])
common_dids = train_dids & test_dids
print(f"\nDID重叠情况:")
print(f"训练集和测试集中重叠的did数量: {len(common_dids)}")
print(f"重叠比例: {len(common_dids)/len(test_dids):.4f}")

# 目标变量分布
print(f"\n=== 目标变量分布 ===")
print(f"新用户比例: {train_df['is_new_did'].mean():.4f}")
print(f"新用户数量: {train_df['is_new_did'].sum()}")
print(f"老用户数量: {(1-train_df['is_new_did']).sum()}")


训练集大小: (3429925, 15)
测试集大小: (1143309, 14)

=== udmap 字段分析 ===
训练集 - 空JSON: 3162776, 非空JSON: 0
测试集 - 空JSON: 1054255, 非空JSON: 0

DID重叠情况:
训练集和测试集中重叠的did数量: 192393
重叠比例: 0.9324

=== 目标变量分布 ===
新用户比例: 0.1560
新用户数量: 535185
老用户数量: 2894740
CPU times: total: 11.3 s
Wall time: 11.4 s


In [8]:
%%time
# 2. 增强时间特征工程
print("=== 增强时间特征工程 ===")

for df in [train_df, test_df]:
    # 基础时间特征
    df['ts'] = pd.to_datetime(df['common_ts'], unit='ms')
    df['day'] = df['ts'].dt.day
    df['dayofweek'] = df['ts'].dt.dayofweek
    df['hour'] = df['ts'].dt.hour
    df['minute'] = df['ts'].dt.minute
    
    # 更细粒度的时间特征
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_workday'] = (df['dayofweek'] < 5).astype(int)
    
    # 时间段特征
    df['time_period'] = pd.cut(df['hour'], 
                              bins=[0, 6, 12, 18, 24], 
                              labels=['凌晨', '上午', '下午', '晚上'],
                              include_lowest=True)
    
    # 小时分组（更粗粒度）
    df['hour_group'] = pd.cut(df['hour'], 
                             bins=[0, 8, 16, 24], 
                             labels=['夜间', '白天', '傍晚'],
                             include_lowest=True)
    
    # 分钟分组
    df['minute_group'] = pd.cut(df['minute'], 
                               bins=[0, 15, 30, 45, 60], 
                               labels=['0-15', '15-30', '30-45', '45-60'],
                               include_lowest=True)

print("时间特征创建完成！")


=== 增强时间特征工程 ===
时间特征创建完成！
CPU times: total: 859 ms
Wall time: 888 ms


In [9]:
%%time
# 3. 增强RFM特征工程（仅基于训练集，避免数据泄露）
print("=== 增强RFM特征工程 ===")

# 基于训练集计算RFM特征
max_ts_train = train_df['ts'].max()
min_ts_train = train_df['ts'].min()

# 基础RFM特征
rfm_basic = train_df.groupby('did').agg({
    'ts': [
        lambda x: (max_ts_train - x.max()).days,  # Recency
        lambda x: (x.max() - min_ts_train).days,  # 用户存在时间
    ],
    'eid': ['count', 'nunique'],  # Frequency: 总行为次数和不同行为数
    'mid': ['nunique'],  # Monetary: 不同商品数
    'common_ts': ['min', 'max'],  # 首次和末次行为时间
    'hour': ['mean', 'std', 'min', 'max'],  # 活跃时间模式
    'dayofweek': ['mean', 'std'],  # 活跃日期模式
})

# 扁平化列名
rfm_basic.columns = ['_'.join(col).strip() for col in rfm_basic.columns.values]
rfm_basic = rfm_basic.reset_index()

# 重命名核心特征
rfm_basic.rename(columns={
    'ts_<lambda_0>': 'recency_days',
    'ts_<lambda_1>': 'user_age_days',
    'eid_count': 'frequency_total',
    'eid_nunique': 'frequency_unique',
    'mid_nunique': 'monetary_unique',
    'common_ts_min': 'first_action_ts',
    'common_ts_max': 'last_action_ts',
    'hour_mean': 'avg_hour',
    'hour_std': 'hour_consistency',
    'hour_min': 'earliest_hour',
    'hour_max': 'latest_hour',
    'dayofweek_mean': 'avg_dayofweek',
    'dayofweek_std': 'dayofweek_consistency'
}, inplace=True)

# 计算衍生特征
rfm_basic['action_timespan_seconds'] = (rfm_basic['last_action_ts'] - rfm_basic['first_action_ts']) / 1000
rfm_basic['action_timespan_days'] = rfm_basic['action_timespan_seconds'] / (24 * 3600)
rfm_basic['actions_per_day'] = rfm_basic['frequency_total'] / (rfm_basic['action_timespan_days'] + 1)
rfm_basic['unique_action_ratio'] = rfm_basic['frequency_unique'] / rfm_basic['frequency_total']
rfm_basic['hour_range'] = rfm_basic['latest_hour'] - rfm_basic['earliest_hour']

# 用户活跃度分类
rfm_basic['user_activity_level'] = pd.cut(rfm_basic['frequency_total'], 
                                         bins=[0, 1, 5, 20, float('inf')], 
                                         labels=['低活跃', '中活跃', '高活跃', '超高活跃'])

rfm_basic['user_recency_level'] = pd.cut(rfm_basic['recency_days'], 
                                        bins=[0, 1, 7, 30, float('inf')], 
                                        labels=['最近', '一周内', '一月内', '较久前'])

print(f"RFM特征数量: {rfm_basic.shape[1]-1}")  # 减去did列
print("RFM特征创建完成！")


=== 增强RFM特征工程 ===
RFM特征数量: 20
RFM特征创建完成！
CPU times: total: 20.6 s
Wall time: 20.7 s


In [10]:
%%time
# 4. 交互特征和聚合统计特征（仅基于训练集）
print("=== 交互特征和聚合统计特征 ===")

# 设备-地域交互特征
device_location_stats = train_df.groupby(['device_brand', 'common_province']).size().reset_index(name='device_province_count')
train_df = pd.merge(train_df, device_location_stats, on=['device_brand', 'common_province'], how='left')

# 渠道-设备交互特征
channel_device_stats = train_df.groupby(['channel', 'device_brand']).size().reset_index(name='channel_device_count')
train_df = pd.merge(train_df, channel_device_stats, on=['channel', 'device_brand'], how='left')

# 运营商-地域交互特征
operator_location_stats = train_df.groupby(['operator', 'common_city']).size().reset_index(name='operator_city_count')
train_df = pd.merge(train_df, operator_location_stats, on=['operator', 'common_city'], how='left')

# 基于用户ID的聚合特征（更多维度）
user_advanced_stats = train_df.groupby('did').agg({
    'device_brand': lambda x: x.nunique(),
    'common_province': lambda x: x.nunique(),
    'common_city': lambda x: x.nunique(),
    'channel': lambda x: x.nunique(),
    'operator': lambda x: x.nunique(),
    'os_type': lambda x: x.nunique(),
    'appver': lambda x: x.nunique(),
    'ntt': ['mean', 'std', 'min', 'max'],
    'is_weekend': 'mean',
    'time_period': lambda x: x.nunique()
}).reset_index()

# 扁平化列名
user_advanced_stats.columns = ['did'] + ['_'.join(col).strip() if isinstance(col, tuple) else col 
                                       for col in user_advanced_stats.columns[1:]]

# 重命名特征
rename_dict = {
    'device_brand_<lambda>': 'user_device_diversity',
    'common_province_<lambda>': 'user_province_diversity', 
    'common_city_<lambda>': 'user_city_diversity',
    'channel_<lambda>': 'user_channel_diversity',
    'operator_<lambda>': 'user_operator_diversity',
    'os_type_<lambda>': 'user_os_diversity',
    'appver_<lambda>': 'user_appver_diversity',
    'time_period_<lambda>': 'user_time_diversity',
    'is_weekend_mean': 'weekend_activity_ratio'
}

for old_name, new_name in rename_dict.items():
    if old_name in user_advanced_stats.columns:
        user_advanced_stats.rename(columns={old_name: new_name}, inplace=True)

# 合并所有RFM和聚合特征
all_user_features = pd.merge(rfm_basic, user_advanced_stats, on='did', how='left')

print(f"总用户聚合特征数量: {all_user_features.shape[1]-1}")
print("交互特征和聚合统计特征创建完成！")


=== 交互特征和聚合统计特征 ===
总用户聚合特征数量: 33
交互特征和聚合统计特征创建完成！
CPU times: total: 1min 34s
Wall time: 1min 36s


In [11]:
%%time
# 5. 将特征应用到训练集和测试集
print("=== 特征合并和处理 ===")

# 合并RFM特征到训练集和测试集
train_df = pd.merge(train_df, all_user_features, on='did', how='left')

# 对测试集进行相同的交互特征处理
test_df = pd.merge(test_df, device_location_stats, on=['device_brand', 'common_province'], how='left')
test_df = pd.merge(test_df, channel_device_stats, on=['channel', 'device_brand'], how='left') 
test_df = pd.merge(test_df, operator_location_stats, on=['operator', 'common_city'], how='left')

# 对测试集应用用户聚合特征（只有在训练集中出现过的用户才有特征）
test_df = pd.merge(test_df, all_user_features, on='did', how='left')

# 为新用户填充缺失的聚合特征
print("为测试集中的新用户填充特征...")

# 获取所有RFM特征列名
rfm_feature_cols = [col for col in all_user_features.columns if col != 'did']

# 计算训练集特征的统计值用于填充
fill_values = {}
for col in rfm_feature_cols:
    if train_df[col].dtype in ['float64', 'int64']:
        fill_values[col] = train_df[col].median()
    else:
        fill_values[col] = train_df[col].mode()[0] if len(train_df[col].mode()) > 0 else train_df[col].iloc[0]

# 填充测试集的缺失值
for col in rfm_feature_cols:
    if col in test_df.columns:
        test_df[col].fillna(fill_values[col], inplace=True)

# 填充交互特征的缺失值
interaction_features = ['device_province_count', 'channel_device_count', 'operator_city_count']
for feature in interaction_features:
    if feature in test_df.columns:
        test_df[feature].fillna(1, inplace=True)  # 用1填充表示这是新的组合

print("特征合并完成！")
print(f"训练集shape: {train_df.shape}")
print(f"测试集shape: {test_df.shape}")


=== 特征合并和处理 ===
为测试集中的新用户填充特征...
特征合并完成！
训练集shape: (3429925, 61)
测试集shape: (1143309, 60)
CPU times: total: 7.14 s
Wall time: 7.3 s


In [12]:
%%time
# 6. 类别特征编码
print("=== 类别特征编码 ===")

# 需要编码的类别特征（包括新的时间分组特征）
cat_features = [
    'device_brand', 'ntt', 'operator', 'common_country',
    'common_province', 'common_city', 'appver', 'channel',
    'os_type', 'time_period', 'hour_group', 'minute_group',
    'user_activity_level', 'user_recency_level'
]

label_encoders = {}

for feature in cat_features:
    if feature in train_df.columns and feature in test_df.columns:
        le = LabelEncoder()
        
        # 合并训练集和测试集的所有类别
        all_values = pd.concat([train_df[feature], test_df[feature]]).astype(str)
        
        le.fit(all_values)
        label_encoders[feature] = le
        
        # 应用编码
        train_df[feature] = le.transform(train_df[feature].astype(str))
        test_df[feature] = le.transform(test_df[feature].astype(str))
        
        print(f"特征 {feature} 编码完成，类别数: {len(le.classes_)}")

# udmap处理：由于都是空JSON，我们创建一个简单的特征
train_df['udmap_is_empty'] = 1  # 所有都是空的，这个特征值对所有样本都是1
test_df['udmap_is_empty'] = 1

print("类别特征编码完成！")


=== 类别特征编码 ===
特征 device_brand 编码完成，类别数: 213
特征 ntt 编码完成，类别数: 6
特征 operator 编码完成，类别数: 4
特征 common_country 编码完成，类别数: 112
特征 common_province 编码完成，类别数: 275
特征 common_city 编码完成，类别数: 455
特征 appver 编码完成，类别数: 108
特征 channel 编码完成，类别数: 18
特征 os_type 编码完成，类别数: 2
特征 time_period 编码完成，类别数: 4
特征 hour_group 编码完成，类别数: 3
特征 minute_group 编码完成，类别数: 4
特征 user_activity_level 编码完成，类别数: 4
特征 user_recency_level 编码完成，类别数: 4
类别特征编码完成！
CPU times: total: 35.6 s
Wall time: 36.4 s


In [13]:
%%time
# 7. 特征选择和准备
print("=== 特征选择和准备 ===")

# 定义完整的特征列表
features = [
    # 原始特征
    'mid', 'eid', 'device_brand', 'ntt', 'operator', 
    'common_country', 'common_province', 'common_city',
    'appver', 'channel', 'os_type',
    
    # 基础时间特征
    'hour', 'dayofweek', 'day', 'minute', 'common_ts',
    
    # 增强时间特征
    'is_weekend', 'is_workday', 'time_period', 'hour_group', 'minute_group',
    
    # RFM基础特征
    'recency_days', 'user_age_days', 'frequency_total', 'frequency_unique',
    'monetary_unique', 'first_action_ts', 'last_action_ts',
    
    # RFM衍生特征
    'action_timespan_seconds', 'action_timespan_days', 'actions_per_day',
    'unique_action_ratio', 'hour_range',
    
    # 时间模式特征
    'avg_hour', 'hour_consistency', 'earliest_hour', 'latest_hour',
    'avg_dayofweek', 'dayofweek_consistency', 'weekend_activity_ratio',
    
    # 用户多样性特征
    'user_device_diversity', 'user_province_diversity', 'user_city_diversity',
    'user_channel_diversity', 'user_operator_diversity', 'user_os_diversity',
    'user_appver_diversity', 'user_time_diversity',
    
    # 用户分类特征
    'user_activity_level', 'user_recency_level',
    
    # NTT统计特征
    'ntt_mean', 'ntt_std', 'ntt_min', 'ntt_max',
    
    # 交互特征
    'device_province_count', 'channel_device_count', 'operator_city_count',
    
    # udmap特征
    'udmap_is_empty'
]

# 检查哪些特征实际存在
available_features = []
missing_features = []

for feature in features:
    if feature in train_df.columns and feature in test_df.columns:
        available_features.append(feature)
    else:
        missing_features.append(feature)

print(f"总共定义特征数: {len(features)}")
print(f"可用特征数: {len(available_features)}")
print(f"缺失特征: {missing_features}")

# 准备训练和测试数据
X_train = train_df[available_features]
y_train = train_df['is_new_did']
X_test = test_df[available_features]

print(f"\n训练集特征shape: {X_train.shape}")
print(f"测试集特征shape: {X_test.shape}")

# 检查缺失值
print(f"\n训练集缺失值:")
missing_train = X_train.isnull().sum()
if missing_train.sum() > 0:
    print(missing_train[missing_train > 0])
else:
    print("无缺失值")

print(f"\n测试集缺失值:")
missing_test = X_test.isnull().sum()
if missing_test.sum() > 0:
    print(missing_test[missing_test > 0])
else:
    print("无缺失值")

# 清理不需要的列
for df in [train_df, test_df]:
    if 'ts' in df.columns:
        df.drop(['ts'], axis=1, inplace=True)

print("特征准备完成！")


=== 特征选择和准备 ===
总共定义特征数: 58
可用特征数: 58
缺失特征: []

训练集特征shape: (3429925, 58)
测试集特征shape: (1143309, 58)

训练集缺失值:
hour_consistency         51053
dayofweek_consistency    51053
ntt_std                  51053
dtype: int64

测试集缺失值:
无缺失值
特征准备完成！
CPU times: total: 2.2 s
Wall time: 2.16 s


In [14]:
# F1阈值优化函数
def find_optimal_threshold(y_true, y_pred_proba):
    """寻找最大化F1分数的阈值"""
    best_threshold = 0.5
    best_f1 = 0
    
    for threshold in [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]:
        y_pred = (y_pred_proba >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    return best_threshold, best_f1

# 增强的模型参数
enhanced_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbose': -1,
    'n_jobs': 8,
    'seed': 42,
    'max_depth': 15,
    'num_leaves': 255,
    'learning_rate': 0.08,  # 稍微降低学习率
    'min_child_samples': 20,  # 增加最小样本数
    'feature_fraction': 0.85,  # 降低特征采样比例
    'bagging_fraction': 0.8,
    'bagging_freq': 3,
    'lambda_l1': 0.1,  # 添加L1正则化
    'lambda_l2': 0.1,  # 添加L2正则化
    'min_gain_to_split': 0.02  # 添加分割增益阈值
}

print("模型参数配置完成！")


模型参数配置完成！


In [15]:
%%time
# 8. 增强特征模型训练
print("=== 开始训练增强特征模型 ===")

n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))
fold_thresholds = []
fold_f1_scores = []
models = []
oof_preds = np.zeros(len(X_train))
oof_probas = np.zeros(len(X_train))

# 特征重要性存储
feature_importance_df = pd.DataFrame()

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n======= Fold {fold+1}/{n_folds} =======")
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # 创建数据集
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val)
    
    # 模型训练
    model = lgb.train(
        enhanced_params, train_set,
        num_boost_round=1500,  # 增加训练轮数
        valid_sets=[train_set, val_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),  # 增加早停轮数
            lgb.log_evaluation(period=200)
        ]
    )
    models.append(model)
    
    # 记录特征重要性
    fold_importance = pd.DataFrame()
    fold_importance['feature'] = X_train.columns
    fold_importance['importance'] = model.feature_importance(importance_type='gain')
    fold_importance['fold'] = fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance], axis=0)
    
    # 验证集预测
    val_pred_proba = model.predict(X_val)
    oof_probas[val_idx] = val_pred_proba
    
    # 阈值优化
    best_threshold, best_f1 = find_optimal_threshold(y_val, val_pred_proba)
    fold_thresholds.append(best_threshold)
    
    # 使用优化阈值计算F1
    val_pred_labels = (val_pred_proba >= best_threshold).astype(int)
    fold_f1 = f1_score(y_val, val_pred_labels)
    fold_f1_scores.append(fold_f1)
    oof_preds[val_idx] = val_pred_labels
    
    print(f"Fold {fold+1} Optimal Threshold: {best_threshold:.4f}")
    print(f"Fold {fold+1} F1 Score: {fold_f1:.5f}")
    print(f"Fold {fold+1} Best Iteration: {model.best_iteration}")
    
    # 测试集预测
    test_preds += model.predict(X_test) / n_folds

print("\n=== 模型训练完成 ===")


=== 开始训练增强特征模型 ===

[200]	training's binary_logloss: 0.106022	valid_1's binary_logloss: 0.1103
[400]	training's binary_logloss: 0.0808648	valid_1's binary_logloss: 0.0878177
[600]	training's binary_logloss: 0.0645734	valid_1's binary_logloss: 0.0734504
[800]	training's binary_logloss: 0.0530016	valid_1's binary_logloss: 0.0636413
[1000]	training's binary_logloss: 0.0438037	valid_1's binary_logloss: 0.055562
[1200]	training's binary_logloss: 0.0364335	valid_1's binary_logloss: 0.0492146
[1400]	training's binary_logloss: 0.0306712	valid_1's binary_logloss: 0.0441847
Fold 1 Optimal Threshold: 0.4500
Fold 1 F1 Score: 0.96194
Fold 1 Best Iteration: 1500

[200]	training's binary_logloss: 0.106543	valid_1's binary_logloss: 0.111079
[400]	training's binary_logloss: 0.0808434	valid_1's binary_logloss: 0.0879143
[600]	training's binary_logloss: 0.0642725	valid_1's binary_logloss: 0.0732441
[800]	training's binary_logloss: 0.0523375	valid_1's binary_logloss: 0.0629595
[1000]	training's binary_log

In [16]:
# 整体结果评估
avg_threshold = np.mean(fold_thresholds)
final_oof_preds = (oof_probas >= avg_threshold).astype(int)
final_f1 = f1_score(y_train, final_oof_preds)

print("\n===== 修复数据泄露后的最终结果 =====")
print(f"Average Optimal Threshold: {avg_threshold:.4f}")
print(f"Fold F1 Scores: {[f'{s:.5f}' for s in fold_f1_scores]}")
print(f"Average Fold F1: {np.mean(fold_f1_scores):.5f}")
print(f"OOF F1 Score: {final_f1:.5f}")

# 测试集预测与提交文件生成
test_pred_labels = (test_preds >= avg_threshold).astype(int)
submit['is_new_did'] = test_pred_labels

# 保存提交文件
submit[['is_new_did']].to_csv('submit_fixed.csv', index=False)
print("\n修复数据泄露后的提交文件已保存: submit_fixed.csv")
print(f"预测新用户比例: {test_pred_labels.mean():.4f}")
print(f"测试集大小: {len(test_pred_labels)}")




===== 修复数据泄露后的最终结果 =====
Average Optimal Threshold: 0.4600
Fold F1 Scores: ['0.96194', '0.96117', '0.96185', '0.96127', '0.96183']
Average Fold F1: 0.96161
OOF F1 Score: 0.96177

修复数据泄露后的提交文件已保存: submit_fixed.csv
预测新用户比例: 0.1486
测试集大小: 1143309
