## import

In [None]:
import os
import sys
import pickle
import json
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.metrics import roc_auc_score, roc_curve, auc, log_loss, f1_score, accuracy_score, recall_score, precision_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE
import pyarrow.parquet as pq
import oss2
import io


### 数据读取（已存储到相应路径）

In [None]:
# Read the pre-merged training and test frames from their pickle files.
data_path = 'z_data_new2/data_merged101'
train_pickle = os.path.join(data_path, '22_05_merged_101.pkl')
test_pickle = os.path.join(data_path, 'data_raw_20250106_py_merged_91.pkl')
train_data = pd.read_pickle(train_pickle)
test_data = pd.read_pickle(test_pickle)

# Optional ablation switches: remove the LLM and/or ICI column groups.
# LLM columns
# group_1 = ['category_first', 'category_second', 'audience', 'scenario']

# ICI columns
# group_2 = ['click_sum', 'buy_sum', 'convert_rate', 'ici_push_total', 'gmv_sum', 'commission_sum']

# Drop the LLM columns
# train_data = train_data.drop(columns=group_1, errors='ignore')
# test_data = test_data.drop(columns=group_1, errors='ignore')

# Drop the ICI columns
# train_data = train_data.drop(columns=group_2, errors='ignore')
# test_data = test_data.drop(columns=group_2, errors='ignore')


# Directories used for the pickled encoders and the pickled model.
encoder_path = 'z_model/encoder_data'
model_path = 'z_model/model_data'
print(encoder_path, model_path, sep='\n')


# ICI 是否选取要调整此处

In [None]:
# drop_column = ['click_sum', 'buy_sum', 'ici_push_total', 'gmv_sum', 'commission_sum']

# train_data.drop(columns=drop_column,inplace=True)
# test_data.drop(columns=drop_column,inplace=True)

# 正负例是否变化要调整此处

In [None]:
# Keep a row when its label is non-zero OR its weekly_order_cnt is below 4;
# i.e. label-0 rows are only kept when weekly_order_cnt < 4.
keep_test_rows = (test_data['lable_x'] != 0) | (test_data['weekly_order_cnt'] < 4)
test_data = test_data[keep_test_rows]

keep_train_rows = (train_data['lable_x'] != 0) | (train_data['weekly_order_cnt'] < 4)
train_data = train_data[keep_train_rows]

print(f"调整后的测试数据集大小: {test_data.shape}")
print(f"调整后的训练数据集大小: {train_data.shape}")

In [None]:
def replace_negative_one_with_unknown(data, columns):
    """Replace the value -1 with the string 'Unknown' in the given columns.

    Names in *columns* that are not columns of *data* are skipped.
    The frame is modified in place and the same object is returned.
    """
    present = [name for name in columns if name in data.columns]
    for name in present:
        data[name] = data[name].replace(-1, 'Unknown')
    return data


# 要处理的列
# Columns in which -1 is rewritten to 'Unknown'.
columns_to_process = [
    'shop_title',
    'brand_name',
    'audience_1',
    'audience_2',
    'brand_real_1',
    'brand_real_2',
    'scenario_1',
    'scenario_2',
    *[f'feature{i}' for i in range(1, 7)],
]

train_data = replace_negative_one_with_unknown(data=train_data, columns=columns_to_process)
test_data = replace_negative_one_with_unknown(data=test_data, columns=columns_to_process)

In [None]:
# Report columns that exist in only one of the two frames.
list1 = train_data.columns
list2 = test_data.columns
train_column_set, test_column_set = set(list1), set(list2)
unique_to_list1 = list(train_column_set - test_column_set)
unique_to_list2 = list(test_column_set - train_column_set)

print("在 train_data 中但不在 test_data 中的列名:", unique_to_list1)
print("在 test_data 中但不在 train_data 中的列名:", unique_to_list2)

### 创建pkl

### 对原始数据整数编码

In [None]:
# fit=True : train a new OrdinalEncoder per categorical column on train_data
#            and pickle each one under encoder_path.
# fit=False: load the previously pickled encoders from encoder_path instead.
fit = True
# fit = False

# Categorical columns that each get their own OrdinalEncoder.
encoded_columns = [
    "level_one_category_name", "category_name", "category_first", "category_second",
    "brand_name", "shop_title",
    "feature1", "feature2", "feature3", "feature4", "feature5", "feature6",
    "scenario_1", "scenario_2",
    "brand_real_1", "brand_real_2",
    "audience_1", "audience_2",
]


def _encoder_file(column):
    """Return the pickle path of the encoder that belongs to *column*."""
    # NOTE(review): the original code wrote the shop_title encoder with a doubled
    # "l" ("llabel"). The name is kept so existing files and any external loader
    # still match; rename both sides together if this is ever cleaned up.
    prefix = "22_28_llabel_encoder_" if column == "shop_title" else "22_28_label_encoder_"
    return os.path.join(encoder_path, f"{prefix}{column}.pkl")


def _new_encoder():
    """Create an encoder that maps categories unseen during fit to -1."""
    return OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)


if fit:
    encoders = {}
    for column in encoded_columns:
        # Fit on the distinct non-null values of the column, shaped (n, 1).
        # dropna()/drop_duplicates() is used instead of value_counts().reset_index()
        # because the latter names its columns differently before pandas 2.0, where
        # selecting [column] would return the counts rather than the categories.
        distinct_values = train_data[column].dropna().drop_duplicates().to_numpy().reshape(-1, 1)
        encoders[column] = _new_encoder().fit(distinct_values)

    # NOTE(review): the "audience" encoder is never fitted (its fit call was
    # disabled), yet its pickle was always written. That behaviour is kept so the
    # file keeps being produced; the stored encoder is unfitted.
    label_encoder_category_audience = _new_encoder()

    # Persist only after every fit succeeded; `with` closes each file handle.
    for column, encoder in encoders.items():
        with open(_encoder_file(column), "wb") as handle:
            pickle.dump(encoder, handle)
    with open(_encoder_file("audience"), "wb") as handle:
        pickle.dump(label_encoder_category_audience, handle)
    print("1")
else:
    # Load every encoder the later cells use, not only the first few.
    encoders = {}
    for column in encoded_columns:
        with open(_encoder_file(column), "rb") as handle:
            encoders[column] = pickle.load(handle)
    with open(_encoder_file("audience"), "rb") as handle:
        label_encoder_category_audience = pickle.load(handle)

    # The fit branch does not write a "scenario" encoder, so only load it when a
    # file from an earlier run is actually present.
    scenario_file = _encoder_file("scenario")
    if os.path.exists(scenario_file):
        with open(scenario_file, "rb") as handle:
            label_encoder_category_scenario = pickle.load(handle)
    print("2")

# Expose the encoders under the variable names the following cells refer to.
label_encoder_level_one_category_name = encoders["level_one_category_name"]
label_encoder_category_name = encoders["category_name"]
label_encoder_category_category_first = encoders["category_first"]
label_encoder_category_category_second = encoders["category_second"]
label_encoder_brand_name = encoders["brand_name"]
label_encoder_shop_title = encoders["shop_title"]
label_encoder_feature1 = encoders["feature1"]
label_encoder_feature2 = encoders["feature2"]
label_encoder_feature3 = encoders["feature3"]
label_encoder_feature4 = encoders["feature4"]
label_encoder_feature5 = encoders["feature5"]
label_encoder_feature6 = encoders["feature6"]
label_encoder_scenario_1 = encoders["scenario_1"]
label_encoder_scenario_2 = encoders["scenario_2"]
label_encoder_brand_real_1 = encoders["brand_real_1"]
label_encoder_brand_real_2 = encoders["brand_real_2"]
label_encoder_audience_1 = encoders["audience_1"]
label_encoder_audience_2 = encoders["audience_2"]

In [None]:
#打印出每一列的列名以及该列中不同值的数量
# For every training column, print its name and its number of distinct values.
columns = train_data.columns
for col in columns:
    distinct_count = train_data[col].nunique()
    print(f"列名: {col}, 不同值的数量: {distinct_count}")


In [None]:
# 在进行转换前，先处理缺失值
# Before encoding, fill missing values in the category columns with 'Unknown'.
# The "audience" and "scenario" columns are intentionally not filled here.
category_columns_to_fill = [
    "level_one_category_name",
    "category_name",
    "category_first",
    "category_second",
]

for frame in (train_data, test_data):
    for col in category_columns_to_fill:
        frame[col] = frame[col].fillna('Unknown')

In [None]:
# Number of training rows whose category value is 'Unknown'.
for checked_column in ("level_one_category_name", "category_name"):
    unknown_rows = train_data[checked_column] == 'Unknown'
    print(unknown_rows.sum())

In [None]:
# 对 train_data 进行编码处理
# Ordinal-encode the categorical columns of train_data and test_data with the
# encoders prepared earlier, then split each frame into features and the
# 'lable_x' target.

# Column -> encoder that was fitted on that same column.
column_encoders = {
    "level_one_category_name": label_encoder_level_one_category_name,
    "category_name": label_encoder_category_name,
    # Fix: category_first / category_second used to be transformed with the
    # level_one_category_name / category_name encoders, so every value those
    # encoders had not seen was mapped to -1. Each column now uses its own encoder.
    "category_first": label_encoder_category_category_first,
    "category_second": label_encoder_category_category_second,
    "brand_name": label_encoder_brand_name,
    "shop_title": label_encoder_shop_title,
    "feature1": label_encoder_feature1,
    "feature2": label_encoder_feature2,
    "feature3": label_encoder_feature3,
    "feature4": label_encoder_feature4,
    "feature5": label_encoder_feature5,
    "feature6": label_encoder_feature6,
    "scenario_1": label_encoder_scenario_1,
    "scenario_2": label_encoder_scenario_2,
    "brand_real_1": label_encoder_brand_real_1,
    "brand_real_2": label_encoder_brand_real_2,
    "audience_1": label_encoder_audience_1,
    "audience_2": label_encoder_audience_2,
}

# Encode train_data. transform() expects a 2-D (n, 1) input and returns a 2-D
# array; ravel() flattens it back to one value per row.
for column, encoder in column_encoders.items():
    train_data[column] = encoder.transform(train_data[column].to_numpy().reshape(-1, 1)).ravel()

# Training target.
target = train_data['lable_x']

# Label distribution of the full training data.
print(f"全部的训练数据正负例分布: {train_data['lable_x'].value_counts()}")

# No further split: the whole of train_data is used for training.
X_train = train_data.drop(columns=['lable_x'])
y_train = target

# Encode test_data with the same column -> encoder mapping.
for column, encoder in column_encoders.items():
    test_data[column] = encoder.transform(test_data[column].to_numpy().reshape(-1, 1)).ravel()

# Label distribution of the test data.
print(f"全部的额外测试数据正负例分布: {test_data['lable_x'].value_counts()}")

# No further split: the whole of test_data is the test set.
X_test = test_data.drop(columns=['lable_x'])
y_test = test_data['lable_x']

# Label distribution of each set.
print(f"训练数据正负例分布: {y_train.value_counts()}")
print(f"测试数据正负例分布: {y_test.value_counts()}")

# Sizes of the training and test feature frames.
print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")



In [None]:
# Sanity check: dimensions of the training feature frame.
train_shape = X_train.shape
print(train_shape)

In [None]:
# Columns removed from the feature frames before they are given to the model.
tmp_drop_columns = [
    'title_x',
    'pricejson_update_time',
    'lable_x',
    'click_num_x',
    'buy_num_x',
    'feature',
    'obj_scenario',
    'time_scenario',
    'dep_scenario',
    'subj_scenario',
    'brand_quality',
    'brand_real',
    'category',
    'title',
    'item_id',
    'ranked_categories.p_diffjson',
    'ranked_categories.discountjson',
    'ranked_categories.pricejson',
    'ranked_categories.annual_voljson',
    'item_title',
    'ici_card',
    'ici',
]
tmp_drop_columns

In [None]:

# Preview which feature columns remain once the drop list is applied.
# Only names actually present in X_train are passed to drop().
present_drop_columns = [name for name in tmp_drop_columns if name in X_train.columns]
x_temp = X_train.drop(columns=present_drop_columns)

print(x_temp.columns)

In [None]:
# Sanity check: number of test labels.
test_label_shape = y_test.shape
print(test_label_shape)

In [None]:
import lightgbm as lgb
from datetime import datetime
import pickle
import os

# LightGBM parameters for a binary classifier evaluated on AUC.
params = {
    "task": "train",
    "boosting_type": "gbdt",
    "num_class": 1,
    "objective": "binary",
    "metric": "auc",
    "num_leaves": 31,
    "max_depth": 10,
    "boost_from_average": True,
    "num_threads": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "min_data_in_leaf": 100,
    "learning_rate": 0.02,
    "max_bin": 5000,
    "is_unbalance": True,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
}

# No resampling is applied; the *_resampled names simply alias the original data.
X_train_resampled, y_train_resampled = X_train, y_train
X_test_resampled, y_test_resampled = X_test, y_test

# Remove the non-feature columns (names missing from a frame are ignored).
train_features = X_train_resampled.drop(columns=tmp_drop_columns, errors='ignore')
test_features = X_test_resampled.drop(columns=tmp_drop_columns, errors='ignore')

lgb_train = lgb.Dataset(train_features, y_train_resampled, params=params)
# reference=lgb_train makes the validation set reuse the training set's binning.
lgb_test = lgb.Dataset(test_features, y_test_resampled, reference=lgb_train)

# Train the model.
# NOTE(review): the test set doubles as the early-stopping validation set, so
# metrics later computed on X_test are optimistically biased. Consider holding
# out a separate validation split from train_data.
evals_result = {}
lgb_model = lgb.train(params,
                      lgb_train,
                      num_boost_round=2000,
                      valid_sets=[lgb_test],
                      callbacks=[lgb.early_stopping(100), lgb.record_evaluation(evals_result)])

lgb.plot_metric(booster=evals_result, metric='auc')
lgb.plot_importance(lgb_model, max_num_features=20)

# Timestamp the saved model with the time training finished. The context
# manager guarantees the file is flushed and closed.
current_time = datetime.now().strftime("%Y%m%d_%H:%M")
model_file = os.path.join(model_path, f"22_28_model_{current_time}.pkl")
with open(model_file, "wb") as handle:
    pickle.dump(lgb_model, handle)


In [None]:
# see = X_val_resampled.drop(columns=[x for x in tmp_drop_columns if x in X_val_resampled.columns])

In [None]:
# tree_index = 1  
# lgb.plot_tree(lgb_model, tree_index=tree_index,figsize=(40, 20))
# plt.show()


In [None]:
# Build a two-column table (feature name, importance score) from the trained booster.
importance = lgb_model.feature_importance()
feature_names = lgb_model.feature_name()

importance_rows = [
    {"feature": name, "score": value}
    for name, value in zip(feature_names, importance)
]
feature_importance = pd.DataFrame(importance_rows)

In [None]:
# Order the importance table from the highest score to the lowest.
sorted_feature_importance = feature_importance.sort_values('score', ascending=False)


In [None]:
# Notebook display of the importance table sorted by descending score.
sorted_feature_importance

In [None]:
# test_preds = lgb_model.predict(X_test.drop(columns=tmp_drop_columns))
# Score the test frame with the trained booster and report metrics.
test_features_for_eval = X_test.drop(columns=tmp_drop_columns, errors='ignore')
test_preds = lgb_model.predict(test_features_for_eval)

# Hard labels at the 0.5 cut-off.
test_res = (test_preds > 0.5).astype(int)

true_labels = y_test.values

# Metrics computed on the predicted scores.
aucs = roc_auc_score(true_labels, test_preds)
logloss = log_loss(true_labels, test_preds)

# Metrics computed on the hard labels.
precision = precision_score(true_labels, test_res)
accuracy = accuracy_score(true_labels, test_res)
recall = recall_score(true_labels, test_res)
f1 = f1_score(true_labels, test_res)

print({"auc": aucs, "logloss": logloss})
print({"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy})

# For test then train: row count, label sum, label sum / row count, remainder.
for labels in (y_test, y_train):
    pos = np.sum(labels)
    row_count = len(labels)
    print(row_count, pos, pos / row_count, row_count - pos)

In [None]:
# Print a classification report for each decision threshold in the sweep.
thresholds = [
    0.05, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
    0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95,
]
for cutoff in thresholds:
    print(f"==============================thred={cutoff}==================================")
    hard_labels = (test_preds > cutoff).astype(int)
    print(classification_report(y_test, hard_labels, target_names=['负例', '正例'], digits=4))

In [None]:
# 将 predictions 和 target 合并成一个 DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Pair every test-set prediction with the true label of the same test row.
# Fix: the labels previously came from `target` (the training labels), which
# either fails on a length mismatch or pairs predictions with unrelated labels.
# to_numpy() pairs the values positionally, ignoring the Series index.
P_T = pd.DataFrame({
    'Predictions': test_preds,
    'Target': y_test.to_numpy(),
})

# Predicted scores of the label-1 rows (TT) and of the label-0 rows (TF).
TT = P_T.loc[P_T['Target'] == 1, 'Predictions']
TF = P_T.loc[P_T['Target'] == 0, 'Predictions']

# Overlay the two score densities; good separation shows as little overlap.
plt.figure(figsize=(8, 6))
# `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(TT, fill=True, color="b", label='TT')
sns.kdeplot(TF, fill=True, color="r", label='TF')
plt.title("Predictions vs. Target Probability Density", fontsize=16)
plt.xlabel("Probability / Label", fontsize=12)
plt.ylabel("Density", fontsize=12)

plt.legend()
plt.show()
