# In [None]:
# 基础数据探索
import numpy as np
import pandas as pd

# Load the dataset
taobao_ad_data = pd.read_csv('round1_ijcai_18_train_20180301F.csv')

# Show the first 10 rows
taobao_ad_data.head(10)

# Show the dtype of every column
taobao_ad_data.dtypes

# Show the (rows, columns) shape
taobao_ad_data.shape

# Flag the rows holding at least one -1. The mask is built once and reused for
# both the statistics and the filtering below, instead of scanning the whole
# frame twice.
# NOTE(review): the comparison is against the integer -1, so text cells such
# as "-1" would not be flagged - confirm that is intended.
minus_one_mask = (taobao_ad_data == -1).any(axis=1)

# Count the rows that contain at least one -1
rows_with_minus_one = minus_one_mask.sum()
print(f"\n数据中至少包含一个-1的总行数: {rows_with_minus_one}")
print(f"占总行数的比例: {rows_with_minus_one/len(taobao_ad_data):.2%}")

# Drop the flagged rows; .copy() detaches the result from the raw frame
clean_data = taobao_ad_data[~minus_one_mask].copy()

# Rebuild a contiguous 0..n-1 index
clean_data.reset_index(drop=True, inplace=True)

# Verify the result
print(f"清理后行数: {len(clean_data)}")
clean_data.shape

# Columns kept for the analysis
selected_columns = [
    'item_id',
    'item_category_list',
    'item_brand_id',
    'item_city_id',
    'item_price_level',
    'item_sales_level',
    'item_collected_level',
    'item_pv_level',
    'user_gender_id',
    'user_age_level',
    'user_star_level',
    'context_timestamp',
    'context_page_id',
    'predict_category_property',
    'shop_review_positive_rate',
    'shop_score_service',
    'shop_score_delivery',
    'shop_score_description',
    'is_trade'
]

# Select the columns. The explicit .copy() makes filtered_data an independent
# frame, so the many column assignments performed on it afterwards do not
# raise pandas' SettingWithCopyWarning.
filtered_data = clean_data[selected_columns].copy()

# Inspect the data
filtered_data.head(10)

# Split item_category_list on ';' into three category-level columns.
# reindex() pads with NaN columns when no row has three levels (and trims
# anything beyond the third), so the assignment below can never fail on a
# column-count mismatch.
category_columns = ['category_0', 'category_1', 'category_2']
category_parts = (
    filtered_data['item_category_list']
    .str.split(';', expand=True)
    .reindex(columns=range(len(category_columns)))
)
category_parts.columns = category_columns
filtered_data[category_columns] = category_parts

# Time features derived from the epoch-seconds column.
# NOTE(review): to_datetime(unit='s') yields timezone-naive UTC datetimes, so
# hour / is_weekend are UTC-based - confirm whether a local-time shift is wanted.
filtered_data['timestamp'] = pd.to_datetime(filtered_data['context_timestamp'], unit='s')
filtered_data['hour'] = filtered_data['timestamp'].dt.hour
filtered_data['is_weekend'] = filtered_data['timestamp'].dt.weekday >= 5

# Flag rows whose category_1 text occurs inside predict_category_property.
# Same substring test as a row-wise DataFrame.apply, but iterating the two
# columns directly avoids building one Series object per row.
filtered_data['category_match'] = [
    str(category) in str(predicted)
    for category, predicted in zip(
        filtered_data['category_1'], filtered_data['predict_category_property']
    )
]

filtered_data['item_id'] = filtered_data['item_id'].astype('category')
# Inspect the data
filtered_data.head(10)

# Share of rows that have no third category level
null_ratio = filtered_data['category_2'].isna().mean()
print(f"category_2 空值比例: {null_ratio:.2%}")

# Remove the category_2 column (in place)
filtered_data.drop(columns='category_2', inplace=True)

# 1. Remove the multi-valued field that has already been split
filtered_data.drop(columns='item_category_list', inplace=True)

# 2. Remove the raw timestamp, but only when the parsed 'timestamp' column exists
if 'timestamp' in filtered_data:
    filtered_data.drop(columns='context_timestamp', inplace=True)
# Inspect the data
filtered_data.head(10)

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of each numeric level feature
num_cols = [
    'item_price_level',
    'item_sales_level',
    'item_collected_level',
    'item_pv_level',
]
filtered_data.loc[:, num_cols].hist(figsize=(12, 6), bins=20)
plt.tight_layout()
plt.show()

# Distribution of the second-level category (category_1); only the 20 most
# frequent values are drawn because there may be many categories
category_counts = filtered_data['category_1'].value_counts()
top_categories = category_counts.nlargest(20)

plt.figure(figsize=(12, 6))
sns.barplot(y=top_categories.values, x=top_categories.index, palette="viridis")
plt.xticks(ha='right', rotation=45)  # slant the tick labels
plt.title('Top_20_Secondary_category_distribution（category_1）')
plt.ylabel('sample_size')  # number of samples
plt.tight_layout()  # keep the labels from overlapping
plt.show()

# Print the number of categories and the share of the five largest ones
print(f"二级类目总数: {filtered_data['category_1'].nunique()}")
print("Top 5 二级类目占比:")
top_5_share = top_categories.head(5).div(len(filtered_data)).round(4)
print(top_5_share)

# 1. Identify the five most frequent categories
top_5_categories = filtered_data['category_1'].value_counts().nlargest(5).index

# 2. Collapse every category outside the top five into "other"
is_top_5 = filtered_data['category_1'].isin(top_5_categories)
filtered_data['category_1'] = filtered_data['category_1'].where(is_top_5, 'other')

# 3. Check the result
print("合并后的类目分布:")
print(filtered_data['category_1'].value_counts(normalize=True))

# 4. Conversion rate of each remaining category, highest first
print("\nTOP5类目转化率:")
conversion_by_category = filtered_data.groupby('category_1')['is_trade'].mean()
print(conversion_by_category.sort_values(ascending=False))

# 1. Correlation of each numeric feature with the target
num_cols = [
    'item_price_level',
    'item_sales_level',
    'item_collected_level',
    'item_pv_level',
]
corr_matrix = filtered_data[num_cols + ['is_trade']].corr()
corr_series = corr_matrix['is_trade'].drop('is_trade')

# 2. Order by absolute correlation, strongest first
corr_series = corr_series.sort_values(key=abs, ascending=False)

# 3. Horizontal bar chart
plt.figure(figsize=(10, 4))
bars = plt.barh(corr_series.index, corr_series.values, color='steelblue')

# 4. Write each bar's value at its tip: to the right of positive bars,
#    to the left of the others
for bar in bars:
    width = bar.get_width()
    y_center = bar.get_y() + bar.get_height()/2  # vertical centre of the bar
    alignment = 'left' if width > 0 else 'right'
    plt.text(width, y_center, f'{width:.3f}',  # three decimals
             va='center', ha=alignment, fontsize=10)

# 5. Chart decoration
plt.title('Numerical Features Correlation with is_trade', fontweight='bold')
plt.xlabel('Correlation Coefficient')
plt.xlim(-0.15, 0.15)  # adjust to the actual data range
plt.axvline(0, color='gray', linestyle='--', linewidth=0.8)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Conversion rate for each price level, highest rate first
price_conversion = filtered_data.groupby('item_price_level')['is_trade'].mean().sort_values(ascending=False)

# Create the figure
plt.figure(figsize=(10, 6))

# Bar chart in a single fixed colour.
# seaborn draws numeric categories in ascending order unless told otherwise,
# which would not match the rate-sorted series. Passing order= makes bar i
# correspond to the i-th entry of price_conversion, so the positional value
# labels added below sit on the bars they describe.
ax = sns.barplot(x=price_conversion.index,
                 y=price_conversion.values,
                 order=list(price_conversion.index),
                 color='steelblue')

# Value label above every bar (bar i <-> i-th entry of price_conversion)
for i, v in enumerate(price_conversion.values):
    ax.text(i, v + 0.005, f"{v:.3f}",  # three decimals
            ha='center',
            va='bottom',
            fontsize=10)

# Title and axis labels
plt.title('Conversion Rate by Price Level', fontsize=14, pad=20)
plt.xlabel('Price Level', fontsize=12)
plt.ylabel('Conversion Rate', fontsize=12)

# Axis tuning
plt.ylim(0, price_conversion.max() * 1.2)  # headroom for the value labels
plt.xticks(rotation=0)  # keep the tick labels horizontal

# Horizontal grid lines
plt.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# 1.1 User gender x item category, joined as "<gender>_<category>"
filtered_data['gender_category'] = (
    filtered_data['user_gender_id'].astype(str)
    .str.cat(filtered_data['category_1'].astype(str), sep='_')
)

# 1.2 User age level relative to price level (+1 keeps the divisor non-zero
#     for a price level of 0)
filtered_data['age_price_ratio'] = filtered_data['user_age_level'].div(
    filtered_data['item_price_level'].add(1)
)

# 2.1 Item sales level combined with the shop service score
filtered_data['sales_service_score'] = filtered_data['item_sales_level'].mul(
    filtered_data['shop_score_service']
)

# 2.2 Item collected level combined with the shop delivery score
filtered_data['collect_delivery_score'] = filtered_data['item_collected_level'].mul(
    filtered_data['shop_score_delivery']
)

# 3.1 Hour x category, joined as "<hour>_<category>"
filtered_data['hour_category'] = (
    filtered_data['hour'].astype(str)
    .str.cat(filtered_data['category_1'].astype(str), sep='_')
)

# 3.2 Price level on weekends, 0 on weekdays
filtered_data['weekend_price'] = filtered_data['is_weekend'].astype(int).mul(
    filtered_data['item_price_level']
)

# 1.1 Mean price level per category (approximated from the current data)
category_avg_price = (
    filtered_data.groupby('category_1')['item_price_level'].mean().to_dict()
)
filtered_data['category_avg_price'] = filtered_data['category_1'].map(category_avg_price)

# 1.2 Category competition: number of rows sharing the same category
category_competition = filtered_data['category_1'].value_counts().to_dict()
filtered_data['category_competition'] = filtered_data['category_1'].map(category_competition)

# 2.1 User star level scaled by its maximum, used as an activeness proxy
max_star_level = filtered_data['user_star_level'].max()
filtered_data['user_activeness'] = filtered_data['user_star_level'].div(max_star_level)

# 2.2 Item page-view level scaled by its maximum, used as a click-rate proxy
max_pv_level = filtered_data['item_pv_level'].max()
filtered_data['simulated_ctr'] = filtered_data['item_pv_level'].div(max_pv_level)


# Frequency-encode the high-cardinality text columns (better suited to tree
# models than label encoding): every value is replaced by its relative
# frequency and the original column is removed
text_cols = ['predict_category_property', 'category_0', 'category_1', 'gender_category', 'hour_category']
for col in text_cols:
    if col not in filtered_data.columns:
        continue
    freq_encoder = filtered_data[col].value_counts(normalize=True)
    filtered_data[col + '_freq'] = filtered_data[col].map(freq_encoder)
    filtered_data.drop(columns=col, inplace=True)

# Extra time features (added on top of the existing 'hour' column), after
# which the datetime column itself is removed
if 'timestamp' in filtered_data.columns:
    filtered_data['day_of_week'] = filtered_data['timestamp'].dt.weekday  # Monday=0, Sunday=6
    hour_of_day = filtered_data['hour']
    filtered_data['is_morning'] = ((hour_of_day >= 6) & (hour_of_day <= 12)).astype(int)
    filtered_data.drop(columns='timestamp', inplace=True)

# Cast the boolean columns to int explicitly
bool_cols = ['is_weekend', 'category_match']
for col in bool_cols:
    if col not in filtered_data.columns:
        continue
    filtered_data[col] = filtered_data[col].astype(int)

# Drop columns that may be redundant (e.g. raw ID columns already encoded),
# limited to the ones still present
cols_to_drop = [
    col
    for col in ('item_id', 'item_brand_id', 'category_1')
    if col in filtered_data.columns
]
filtered_data.drop(columns=cols_to_drop, inplace=True)

from lightgbm import LGBMClassifier

# Make sure the target column has an integer dtype
filtered_data['is_trade'] = filtered_data['is_trade'].astype(int)

# Separate the target from the features
y = filtered_data['is_trade']
X = filtered_data.drop(columns='is_trade')

# Show the dtypes of the final feature set
print("最终特征类型：\n", X.dtypes)

# Fit a default LightGBM classifier on all rows
model = LGBMClassifier()
model.fit(X, y)

# Plot the 20 largest feature importances
plt.figure(figsize=(10,6))
importances = pd.Series(model.feature_importances_, index=X.columns)
importances.nlargest(20).plot.barh(title='Top 20 Feature Importance')
plt.show()

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

# Split into train / test sets, stratified so both keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,  # preserve the class proportions
    random_state=42
)

# 1. Feature clean-up: drop the first column (by position) and category_0_freq.
# NOTE(review): X_train.columns[0] depends on the column order - confirm which
# feature this is meant to remove.
X_train_clean = X_train.drop(columns=[X_train.columns[0], 'category_0_freq'], errors='ignore')

# 2. Class-imbalance weight (negatives / positives), derived from the training
# labels rather than hard-coded counts so it stays correct whenever the data
# or the split changes. Falls back to 1.0 if there are no positives.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
pos_weight = n_negative / n_positive if n_positive else 1.0

model = LGBMClassifier(
    # Imbalance handling: scale_pos_weight only. Combining it with
    # class_weight='balanced' would up-weight the positive class twice
    # (once through per-sample weights, once through scale_pos_weight).
    scale_pos_weight=pos_weight,

    # Tree structure
    num_leaves=15,           # fewer leaves (limits over-fitting to the minority class)
    max_depth=3,             # cap the tree depth
    min_child_samples=100,   # larger minimum leaf size (wrapper name for min_data_in_leaf)

    # Training control
    learning_rate=0.05,      # lower learning rate
    n_estimators=500,        # more trees
    reg_alpha=0.1,           # L1 regularisation
    reg_lambda=0.1,          # L2 regularisation
    verbosity=-1             # silence warnings
)

# 3. Train the model
model.fit(X_train_clean, y_train)

# 4. Feature importance as a sanity check, drawn on its own figure
plt.figure(figsize=(10, 6))
pd.Series(
    model.feature_importances_,
    index=X_train_clean.columns
).nlargest(20).plot.barh(title='Feature Importance')
plt.show()

# Training-set AUC, to check that the model learned a usable pattern
y_pred_proba = model.predict_proba(X_train_clean)[:, 1]
print("训练集AUC:", roc_auc_score(y_train, y_pred_proba))

# 如果AUC>0.7说明模型有效

# 1. Remove from the test set the same columns that were removed for training
X_test_clean = X_test.drop(columns=[X_test.columns[0], 'category_0_freq'], errors='ignore')

# 2. Positive-class probabilities for the test set (computed on X_test_clean)
test_probabilities = model.predict_proba(X_test_clean)
y_pred_proba_test = test_probabilities[:, 1]

# 3. Keep probabilities strictly inside (0, 1) so log(0) cannot occur
eps = 1e-15
y_pred_proba_test = np.clip(y_pred_proba_test, eps, 1 - eps)

# 4. Compute and report the LogLoss
logloss = log_loss(y_test, y_pred_proba_test)
print(f"""
=== 测试集评估 ===
LogLoss: {logloss:.4f}
评估标准:
  <0.3 : 优秀
  0.3-0.5 : 合格
  >0.5 : 需改进
""")

# Bin the service score into ten 0.1-wide intervals over [0, 1].
# include_lowest=True closes the first interval on the left, so a score of
# exactly 0 lands in bin 0 instead of becoming NaN and being silently left
# out of the groupby below. labels=False returns the integer bin number.
filtered_data['service_bin'] = pd.cut(
    filtered_data['shop_score_service'],
    bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    labels=False,
    include_lowest=True
)

# Row count, number of conversions and conversion rate per bin
conversion_by_service = filtered_data.groupby('service_bin')['is_trade'].agg(
    count='count',
    conversions='sum',
    conversion_rate='mean'
).reset_index()

print(conversion_by_service)

plt.figure(figsize=(10, 6))
plt.plot(
    conversion_by_service['service_bin'] * 0.1 + 0.05,  # midpoint of each bin
    conversion_by_service['conversion_rate'],
    marker='o'
)
plt.xlabel('Shop Service Score (0.1 intervals)')
plt.ylabel('Conversion Rate')
plt.title('Conversion Rate by Service Score')
plt.grid(True)
plt.show()

from sklearn.linear_model import LogisticRegression

# Logistic regression on a single feature
y = filtered_data['is_trade']
X = filtered_data[['sales_service_score']]

model = LogisticRegression(class_weight='balanced').fit(X, y)

# Odds ratio for a one-unit increase of the feature: exp(coefficient)
import numpy as np
coefficient = model.coef_[0, 0]
odds_ratio = np.exp(coefficient)
print(f"销售服务评分每增加1分，转化几率变为原来的 {odds_ratio:.2f} 倍")

# Row count, conversions and conversion rate for each hour of the day
hourly_conversion = filtered_data.groupby('hour')['is_trade'].agg(
    count='count',
    conversions='sum',
    conversion_rate='mean'
).reset_index()

# Logistic regression to quantify the effect of the hour.
# NOTE(review): the model treats 'hour' as a linear term although the clock
# hour is cyclic - read the resulting odds ratio with that in mind.
X_hour = filtered_data[['hour']]
y = filtered_data['is_trade']
hour_model = LogisticRegression(class_weight='balanced')
hour_model.fit(X_hour, y)
hour_effect = np.exp(hour_model.coef_[0][0])  # odds ratio per +1 hour

# 'hour' is the clock hour extracted from the timestamp, not a browsing
# duration, so the message and axis labels describe it as the hour of day
print(f"一天中的时刻每推后1小时，转化几率变为原来的 {hour_effect:.2f} 倍")

plt.figure(figsize=(10, 4))

# Observed conversion-rate curve
plt.subplot(1, 2, 1)
plt.plot(hourly_conversion['hour'], hourly_conversion['conversion_rate'],
         marker='o', color='#4E79A7')
plt.xlabel('Hour of day')
plt.ylabel('Conversion rate')
plt.title('Hourly conversion rate change')
plt.grid(True, alpha=0.3)

# Logistic-regression prediction curve over the valid clock hours 0..23.
# The grid is wrapped in a DataFrame carrying the training column name so
# scikit-learn does not warn about missing feature names.
plt.subplot(1, 2, 2)
hour_range = np.arange(0, 24).reshape(-1, 1)
hour_grid = pd.DataFrame(hour_range, columns=X_hour.columns)
plt.plot(hour_range, hour_model.predict_proba(hour_grid)[:, 1],
         color='#E15759', lw=2)
plt.xlabel('Hour of day')
plt.ylabel('Predicting transformation probability')
plt.title(f'Every +1 hour: {hour_effect:.2f}xchance')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


# Bin collect_delivery_score into 0.5-wide intervals.
# The upper edge follows the data (never below the previous fixed limit of 5)
# so values above 5 are binned instead of silently becoming NaN, and
# include_lowest=True keeps rows whose score is exactly 0.
# NOTE(review): this column is item_collected_level * shop_score_delivery,
# while the messages/labels below call it a courier score - confirm wording.
delivery_upper = max(5.0, np.ceil(filtered_data['collect_delivery_score'].max() / 0.5) * 0.5)
filtered_data['delivery_bin'] = pd.cut(
    filtered_data['collect_delivery_score'],
    bins=np.arange(0, delivery_upper + 0.5, 0.5),
    include_lowest=True
)

# Row count and conversion rate per bin; observed=False keeps empty bins and
# states the categorical-groupby behaviour explicitly
delivery_conversion = filtered_data.groupby('delivery_bin', observed=False)['is_trade'].agg(
    count='count',
    conversion_rate='mean'
).reset_index()

# Logistic regression on the single feature; the target is read straight from
# the frame rather than from a variable left over from an earlier step
X_delivery = filtered_data[['collect_delivery_score']]
delivery_model = LogisticRegression(class_weight='balanced')
delivery_model.fit(X_delivery, filtered_data['is_trade'])
delivery_effect = np.exp(delivery_model.coef_[0][0])  # odds ratio per +1

print(f"快递评分每+1分，转化几率变为 {delivery_effect:.2f} 倍")

plt.figure(figsize=(12, 4))

# Bar chart of the conversion rate per bin, positioned at the bin midpoints
plt.subplot(1, 2, 1)
delivery_conversion['mid'] = [interval.mid for interval in delivery_conversion['delivery_bin']]
plt.bar(delivery_conversion['mid'], delivery_conversion['conversion_rate'],
        width=0.4, color='#59A14F', alpha=0.7)
plt.xlabel('Courier service score')
plt.ylabel('Conversion rate')
plt.title('Actual conversion rate of different scoring intervals')

# Predicted-probability curve over the same score range as the bins; the grid
# carries the training column name to avoid scikit-learn's feature-name warning
plt.subplot(1, 2, 2)
score_range = np.linspace(0, delivery_upper, 100).reshape(-1, 1)
score_grid = pd.DataFrame(score_range, columns=X_delivery.columns)
plt.plot(score_range, delivery_model.predict_proba(score_grid)[:, 1],
         color='#F28E2B', lw=2)
plt.xlabel('Courier service score')
plt.ylabel('Predicting transformation probability')
plt.title(f'Every +1 point: {delivery_effect:.2f}xchance')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

