In [1]:
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


In [2]:
%%time
# 1. 数据加载
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./testA_data.csv')
submit = test_df[['did']]

full_df = pd.concat([train_df, test_df], axis=0)

# 2. 时间特征工程
for df in [train_df, test_df]:
    # 转换为时间戳
    df['ts'] = pd.to_datetime(df['common_ts'], unit='ms')
    
    # 提取时间特征
    df['day'] = df['ts'].dt.day
    df['dayofweek'] = df['ts'].dt.dayofweek
    df['hour'] = df['ts'].dt.hour

# 对 full_df 进行同样的处理，用于计算全局聚合特征
full_df['ts'] = pd.to_datetime(full_df['common_ts'], unit='ms')
full_df['day'] = full_df['ts'].dt.day
full_df['dayofweek'] = full_df['ts'].dt.dayofweek
full_df['hour'] = full_df['ts'].dt.hour



CPU times: user 6.84 s, sys: 1.69 s, total: 8.53 s
Wall time: 8.56 s


In [3]:
%%time
# 3. RFM 特征工程

# 3.1 RFM 特征构建
# 我们在 full_df 上计算这些聚合特征，因为它包含了所有用户的所有行为
max_ts = full_df['ts'].max()
rfm_agg = full_df.groupby('did').agg({
    'ts': lambda x: (max_ts - x.max()).days, # Recency
    'eid': 'count', # Frequency
    'mid': 'nunique', # 行为深度
    'common_ts': ['min', 'max'] # 首次和末次行为时间
})

# 3.2 列名扁平化处理
# 将多级索引 ('common_ts', 'min') 合并为单级 'common_ts_min'
rfm_agg.columns = ['_'.join(col).strip() for col in rfm_agg.columns.values]
rfm_agg = rfm_agg.reset_index()

# 3.3 特征重命名
rfm_agg.rename(columns={
    'ts_<lambda>': 'recency',
    'eid_count': 'frequency',
    'mid_nunique': 'mid_nunique',
    'common_ts_min': 'first_action_ts',
    'common_ts_max': 'last_action_ts'
}, inplace=True)

# 3.4 派生新特征
# 计算首次和末次行为的时间跨度（单位：秒）
rfm_agg['action_timespan_seconds'] = (rfm_agg['last_action_ts'] - rfm_agg['first_action_ts']) / 1000

# 3.5 合并RFM特征到 train_df 和 test_df
# 为了确保后续流程的变量一致性，我们直接在 train_df 和 test_df 上合并
train_df = pd.merge(train_df, rfm_agg, on='did', how='left')
test_df = pd.merge(test_df, rfm_agg, on='did', how='left')

# 3.6 清理不再需要的 ts 列
for df in [train_df, test_df]:
    df.drop(['ts'], axis=1, inplace=True)

CPU times: user 20.2 s, sys: 786 ms, total: 21 s
Wall time: 21 s


In [4]:
%%time
# 需要编码的特征列表
cat_features = [
    'device_brand', 'ntt', 'operator', 'common_country',
    'common_province', 'common_city', 'appver', 'channel',
    'os_type', 'udmap'
]
# 初始化编码器字典
label_encoders = {}

for feature in cat_features:
    # 创建编码器，将类别特征转为0-N的自然数
    le = LabelEncoder()
    
    # 合并训练集和测试集的所有类别
    all_values = pd.concat([train_df[feature], test_df[feature]]).astype(str)
    
    # 训练编码器（使用所有可能值）
    le.fit(all_values)
    
    # 保存编码器
    label_encoders[feature] = le
    
    # 应用编码
    train_df[feature] = le.transform(train_df[feature].astype(str))
    test_df[feature] = le.transform(test_df[feature].astype(str))

CPU times: user 17.7 s, sys: 2.12 s, total: 19.8 s
Wall time: 19.8 s


In [5]:
%%time
# 基础特征 + 目标编码特征 + 聚合特征
features = [
    # 原始特征
    'mid', 'eid', 'device_brand', 'ntt', 'operator', 
    'common_country', 'common_province', 'common_city',
    'appver', 'channel', 'os_type', 'udmap',
    # 时间特征
    'hour', 'dayofweek', 'day', 'common_ts',
    # RFM特征
   'recency', 'frequency', 'mid_nunique', 'first_action_ts', 'last_action_ts', 'action_timespan_seconds'
]

# 准备训练和测试数据
X_train = train_df[features]
y_train = train_df['is_new_did']
X_test = test_df[features]

CPU times: user 151 ms, sys: 234 ms, total: 385 ms
Wall time: 384 ms


In [6]:
%%time
# 6. F1阈值优化函数
def find_optimal_threshold(y_true, y_pred_proba):
    """寻找最大化F1分数的阈值"""
    best_threshold = 0.5
    best_f1 = 0
    
    for threshold in [0.1,0.15,0.2,0.25,0.3,0.35,0.4]:
        y_pred = (y_pred_proba >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    return best_threshold, best_f1

# 7. 网格搜索参数优化
import time
from itertools import product

def evaluate_params(params_dict, X_train, y_train, n_folds=3):
    """评估单组参数的性能"""
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_f1_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # 创建数据集
        train_set = lgb.Dataset(X_tr, label=y_tr)
        val_set = lgb.Dataset(X_val, label=y_val)
        
        # 模型训练
        model = lgb.train(
            params_dict, train_set,
            num_boost_round=500,  # 减少轮数以加快搜索
            valid_sets=[val_set],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30, verbose=False),
                lgb.log_evaluation(period=0)  # 不输出训练日志
            ]
        )
        
        # 预测并计算F1
        val_pred_proba = model.predict(X_val)
        best_threshold, best_f1 = find_optimal_threshold(y_val, val_pred_proba)
        fold_f1_scores.append(best_f1)
    
    return np.mean(fold_f1_scores)

def grid_search_lgb(X_train, y_train):
    """LightGBM网格搜索"""
    print("开始网格搜索参数优化...")
    
    # 基础参数
    base_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1,
        'n_jobs': 8,
        'seed': 42
    }
    
    # 第一阶段：搜索核心结构参数
    print("\n=== 第一阶段：搜索核心结构参数 ===")
    structure_params = {
        'max_depth': [8, 10, 12, 15],
        'num_leaves': [31, 63, 127, 255],
        'learning_rate': [0.05, 0.1],
        'min_child_samples': [10, 20]
    }
    
    best_score = 0
    best_structure_params = {}
    
    total_combinations = len(list(product(*structure_params.values())))
    current_combination = 0
    
    for max_depth, num_leaves, lr, min_child in product(*structure_params.values()):
        current_combination += 1
        
        # 确保 num_leaves < 2^max_depth
        if num_leaves >= 2**max_depth:
            continue
            
        params = base_params.copy()
        params.update({
            'max_depth': max_depth,
            'num_leaves': num_leaves,
            'learning_rate': lr,
            'min_child_samples': min_child,
            'feature_fraction': 0.7,
            'bagging_fraction': 0.8,
            'bagging_freq': 5
        })
        
        try:
            score = evaluate_params(params, X_train, y_train, n_folds=3)
            print(f"进度: {current_combination}/{total_combinations} | "
                  f"max_depth={max_depth}, num_leaves={num_leaves}, lr={lr}, min_child={min_child} | "
                  f"F1={score:.5f}")
            
            if score > best_score:
                best_score = score
                best_structure_params = {
                    'max_depth': max_depth,
                    'num_leaves': num_leaves,
                    'learning_rate': lr,
                    'min_child_samples': min_child
                }
                print(f"*** 新的最佳结果! F1={best_score:.5f} ***")
                
        except Exception as e:
            print(f"参数组合出错: {e}")
            continue
    
    print(f"\n第一阶段最佳参数: {best_structure_params}")
    print(f"第一阶段最佳F1: {best_score:.5f}")
    
    # 第二阶段：基于最佳结构参数，搜索正则化参数
    print("\n=== 第二阶段：搜索正则化参数 ===")
    regularization_params = {
        'feature_fraction': [0.6, 0.7, 0.8, 0.9],
        'bagging_fraction': [0.7, 0.8, 0.9],
        'bagging_freq': [3, 5, 7]
    }
    
    best_final_score = best_score
    best_final_params = best_structure_params.copy()
    best_final_params.update({'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5})
    
    for feature_frac, bagging_frac, bagging_freq in product(*regularization_params.values()):
        params = base_params.copy()
        params.update(best_structure_params)
        params.update({
            'feature_fraction': feature_frac,
            'bagging_fraction': bagging_frac,
            'bagging_freq': bagging_freq
        })
        
        try:
            score = evaluate_params(params, X_train, y_train, n_folds=3)
            print(f"feature_frac={feature_frac}, bagging_frac={bagging_frac}, bagging_freq={bagging_freq} | F1={score:.5f}")
            
            if score > best_final_score:
                best_final_score = score
                best_final_params = params.copy()
                print(f"*** 新的最佳结果! F1={best_final_score:.5f} ***")
                
        except Exception as e:
            print(f"参数组合出错: {e}")
            continue
    
    print(f"\n最终最佳参数: {best_final_params}")
    print(f"最终最佳F1: {best_final_score:.5f}")
    
    return best_final_params

# 执行网格搜索
optimal_params = grid_search_lgb(X_train, y_train)

开始网格搜索参数优化...

=== 第一阶段：搜索核心结构参数 ===
进度: 1/64 | max_depth=8, num_leaves=31, lr=0.05, min_child=10 | F1=0.77430
*** 新的最佳结果! F1=0.77430 ***
进度: 2/64 | max_depth=8, num_leaves=31, lr=0.05, min_child=20 | F1=0.77424
进度: 3/64 | max_depth=8, num_leaves=31, lr=0.1, min_child=10 | F1=0.79413
*** 新的最佳结果! F1=0.79413 ***
进度: 4/64 | max_depth=8, num_leaves=31, lr=0.1, min_child=20 | F1=0.79383
进度: 5/64 | max_depth=8, num_leaves=63, lr=0.05, min_child=10 | F1=0.79375
进度: 6/64 | max_depth=8, num_leaves=63, lr=0.05, min_child=20 | F1=0.79341
进度: 7/64 | max_depth=8, num_leaves=63, lr=0.1, min_child=10 | F1=0.81757
*** 新的最佳结果! F1=0.81757 ***
进度: 8/64 | max_depth=8, num_leaves=63, lr=0.1, min_child=20 | F1=0.81755
进度: 9/64 | max_depth=8, num_leaves=127, lr=0.05, min_child=10 | F1=0.81161
进度: 10/64 | max_depth=8, num_leaves=127, lr=0.05, min_child=20 | F1=0.81116
进度: 11/64 | max_depth=8, num_leaves=127, lr=0.1, min_child=10 | F1=0.84097
*** 新的最佳结果! F1=0.84097 ***
进度: 12/64 | max_depth=8, num_leaves=127, 

In [7]:
%%time
# 8. 使用最优参数进行最终模型训练
print("\n开始使用最优参数进行最终模型训练...")

# 五折交叉验证，使用五折构建特征时的切分规则，保证切分一致
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))
fold_thresholds = []
fold_f1_scores = []
models = []
oof_preds = np.zeros(len(X_train))
oof_probas = np.zeros(len(X_train))

print(f"\n使用最优参数: {optimal_params}")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n======= Fold {fold+1}/{n_folds} =======")
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # 创建数据集
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val)
    
    # 模型训练（使用最优参数）
    model = lgb.train(
        optimal_params, train_set,
        num_boost_round=1000,
        valid_sets=[train_set, val_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=100)
        ]
    )
    models.append(model)
    
    # 验证集预测
    val_pred_proba = model.predict(X_val)
    oof_probas[val_idx] = val_pred_proba
    
    # 阈值优化
    best_threshold, best_f1 = find_optimal_threshold(y_val, val_pred_proba)
    fold_thresholds.append(best_threshold)
    
    # 使用优化阈值计算F1
    val_pred_labels = (val_pred_proba >= best_threshold).astype(int)
    fold_f1 = f1_score(y_val, val_pred_labels)
    fold_f1_scores.append(fold_f1)
    oof_preds[val_idx] = val_pred_labels
    
    print(f"Fold {fold+1} Optimal Threshold: {best_threshold:.4f}")
    print(f"Fold {fold+1} F1 Score: {fold_f1:.5f}")
    
    # 测试集预测
    test_preds += model.predict(X_test) / n_folds



开始使用最优参数进行最终模型训练...

使用最优参数: {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1, 'n_jobs': 8, 'seed': 42, 'max_depth': 15, 'num_leaves': 255, 'learning_rate': 0.1, 'min_child_samples': 10, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 3}

[100]	training's binary_logloss: 0.128682	valid_1's binary_logloss: 0.131402
[200]	training's binary_logloss: 0.10665	valid_1's binary_logloss: 0.111234
[300]	training's binary_logloss: 0.0927849	valid_1's binary_logloss: 0.0989797
[400]	training's binary_logloss: 0.0821904	valid_1's binary_logloss: 0.0897463
[500]	training's binary_logloss: 0.0734899	valid_1's binary_logloss: 0.0821717
[600]	training's binary_logloss: 0.0660219	valid_1's binary_logloss: 0.0756615
[700]	training's binary_logloss: 0.0596661	valid_1's binary_logloss: 0.0701872
[800]	training's binary_logloss: 0.0541556	valid_1's binary_logloss: 0.0654413
[900]	training's binary_logloss: 0.0495781	valid_1's binary_logloss: 0.0616011
[1000]	training's b

In [8]:
# 9. 整体结果评估
# 使用交叉验证平均阈值
avg_threshold = np.mean(fold_thresholds)
final_oof_preds = (oof_probas >= avg_threshold).astype(int)
final_f1 = f1_score(y_train, final_oof_preds)

print("\n===== 优化后的最终结果 =====")
print(f"使用的最优参数: {optimal_params}")
print(f"Average Optimal Threshold: {avg_threshold:.4f}")
print(f"Fold F1 Scores: {[f'{s:.5f}' for s in fold_f1_scores]}")
print(f"Average Fold F1: {np.mean(fold_f1_scores):.5f}")
print(f"OOF F1 Score: {final_f1:.5f}")

# 10. 测试集预测与提交文件生成
# 使用平均阈值进行预测
test_pred_labels = (test_preds >= avg_threshold).astype(int)
submit['is_new_did'] = test_pred_labels

# 保存提交文件
submit[['is_new_did']].to_csv('submit_optimized.csv', index=False)
print("\nOptimized submission file saved: submit_optimized.csv")
print(f"Predicted new user ratio: {test_pred_labels.mean():.4f}")
print(f"Test set size: {len(test_pred_labels)}")

# 11. 特征重要性分析（使用优化后的模型）
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': models[0].feature_importance(importance_type='gain')
}).sort_values('Importance', ascending=False)

print("\n===== 优化后模型的特征重要性 =====")
print(feature_importance.head(15))



===== 优化后的最终结果 =====
使用的最优参数: {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1, 'n_jobs': 8, 'seed': 42, 'max_depth': 15, 'num_leaves': 255, 'learning_rate': 0.1, 'min_child_samples': 10, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 3}
Average Optimal Threshold: 0.4000
Fold F1 Scores: ['0.93985', '0.93913', '0.94024', '0.93881', '0.94081']
Average Fold F1: 0.93977
OOF F1 Score: 0.93977

Optimized submission file saved: submit_optimized.csv
Predicted new user ratio: 0.1597
Test set size: 1143309

===== 优化后模型的特征重要性 =====
                    Feature    Importance
21  action_timespan_seconds  2.931054e+06
19          first_action_ts  1.426011e+06
15                common_ts  1.253119e+06
17                frequency  5.197655e+05
8                    appver  3.937382e+05
18              mid_nunique  3.733504e+05
7               common_city  3.031883e+05
20           last_action_ts  2.608818e+05
0                       mid  2.323566e+05
14              

In [9]:
# 12. 网格搜索结果总结与建议
print("\n===== 网格搜索优化总结 =====")
print(f"原始参数组合 vs 优化后参数组合对比:")
print("原始参数:")
print("- max_depth: 12")
print("- num_leaves: 63") 
print("- learning_rate: 0.1")
print("- feature_fraction: 0.7")
print("- bagging_fraction: 0.8")
print("- min_child_samples: 10")

print(f"\n优化后参数:")
for key, value in optimal_params.items():
    if key not in ['objective', 'metric', 'verbose', 'n_jobs', 'seed']:
        print(f"- {key}: {value}")

print(f"\n网格搜索说明:")
print("1. 第一阶段: 搜索核心结构参数 (max_depth, num_leaves, learning_rate, min_child_samples)")
print("2. 第二阶段: 基于最佳结构参数，搜索正则化参数 (feature_fraction, bagging_fraction, bagging_freq)")
print("3. 使用3折交叉验证评估每个参数组合，减少计算时间")
print("4. 最终使用最优参数组合进行5折交叉验证得到最终结果")

print(f"\n性能提升分析:")
print("通过网格搜索找到的最优参数组合可能会带来:")
print("- 更好的模型泛化能力")
print("- 减少过拟合风险") 
print("- 提高F1分数")
print("- 更稳定的预测结果")


===== 网格搜索优化总结 =====
原始参数组合 vs 优化后参数组合对比:
原始参数:
- max_depth: 12
- num_leaves: 63
- learning_rate: 0.1
- feature_fraction: 0.7
- bagging_fraction: 0.8
- min_child_samples: 10

优化后参数:
- max_depth: 15
- num_leaves: 255
- learning_rate: 0.1
- min_child_samples: 10
- feature_fraction: 0.9
- bagging_fraction: 0.8
- bagging_freq: 3

网格搜索说明:
1. 第一阶段: 搜索核心结构参数 (max_depth, num_leaves, learning_rate, min_child_samples)
2. 第二阶段: 基于最佳结构参数，搜索正则化参数 (feature_fraction, bagging_fraction, bagging_freq)
3. 使用3折交叉验证评估每个参数组合，减少计算时间
4. 最终使用最优参数组合进行5折交叉验证得到最终结果

性能提升分析:
通过网格搜索找到的最优参数组合可能会带来:
- 更好的模型泛化能力
- 减少过拟合风险
- 提高F1分数
- 更稳定的预测结果


In [13]:
# 可选：贝叶斯优化方法（需要安装 scikit-optimize）
# 取消注释下面的代码块来使用贝叶斯优化替代网格搜索

"""
# 安装依赖包
!pip install scikit-optimize

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

def bayesian_optimization_lgb(X_train, y_train, n_calls=50):
    '''贝叶斯优化LightGBM参数'''
    print("开始贝叶斯优化参数搜索...")
    
    # 定义搜索空间
    search_space = [
        Integer(6, 16, name='max_depth'),
        Integer(15, 511, name='num_leaves'),
        Real(0.01, 0.3, name='learning_rate'),
        Real(0.5, 1.0, name='feature_fraction'),
        Real(0.5, 1.0, name='bagging_fraction'),
        Integer(3, 10, name='bagging_freq'),
        Integer(5, 50, name='min_child_samples')
    ]
    
    # 基础参数
    base_params = {
        'objective': 'binary',
        'metric': 'binary_logloss', 
        'verbose': -1,
        'n_jobs': 8,
        'seed': 42
    }
    
    @use_named_args(search_space)
    def objective(**params):
        # 确保 num_leaves < 2^max_depth
        if params['num_leaves'] >= 2**params['max_depth']:
            return 1.0  # 返回一个差的分数
        
        # 合并参数
        lgb_params = base_params.copy()
        lgb_params.update(params)
        
        try:
            # 评估参数
            score = evaluate_params(lgb_params, X_train, y_train, n_folds=3)
            # 贝叶斯优化是最小化，所以返回负的F1分数
            return -score
        except:
            return 1.0  # 如果出错，返回差的分数
    
    # 执行贝叶斯优化
    result = gp_minimize(func=objective,
                        dimensions=search_space,
                        n_calls=n_calls,
                        random_state=42,
                        verbose=True)
    
    # 提取最优参数
    best_params = base_params.copy()
    param_names = ['max_depth', 'num_leaves', 'learning_rate', 
                   'feature_fraction', 'bagging_fraction', 'bagging_freq', 'min_child_samples']
    
    for i, param_name in enumerate(param_names):
        best_params[param_name] = result.x[i]
    
    print(f"贝叶斯优化最佳F1分数: {-result.fun:.5f}")
    print(f"贝叶斯优化最佳参数: {best_params}")
    
    return best_params

# 使用贝叶斯优化（取消注释下面这行来启用）
# optimal_params_bayes = bayesian_optimization_lgb(X_train, y_train, n_calls=50)


print("贝叶斯优化代码已准备好，取消注释即可使用")
print("贝叶斯优化相比网格搜索的优势:")
print("1. 更高效：基于先验知识智能选择下一个搜索点")
print("2. 更适合连续参数：可以搜索实数空间")
print("3. 收敛更快：通常用更少的评估次数找到更好的参数")
print("4. 适合高维搜索：当参数维度较高时表现更好")"""


Looking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
贝叶斯优化代码已准备好，取消注释即可使用
贝叶斯优化相比网格搜索的优势:
1. 更高效：基于先验知识智能选择下一个搜索点
2. 更适合连续参数：可以搜索实数空间
3. 收敛更快：通常用更少的评估次数找到更好的参数
4. 适合高维搜索：当参数维度较高时表现更好
