### 当成多分类问题解

In [15]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 示例数据
df  = pd.read_csv('D:\github_repo_forked\lifetime_value\kuaidian\clean\sliding_window_data_expanded.csv')

# 过滤掉样本数少于2的类别
min_samples = 5  # 每个类别最少样本数
class_counts = df['next_chapter'].value_counts()
valid_classes = class_counts[class_counts >= min_samples].index
df = df[df['next_chapter'].isin(valid_classes)]

# 编码分类特征
categorical_features = ['os_version', 'device_brand', 'loc_city_id']
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(df[categorical_features])

# 将编码后的特征转换为DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

# 删除原始的分类特征列
df.drop(columns=categorical_features, inplace=True)

# 将编码后的特征与原始数据合并
df = pd.concat([df, encoded_df], axis=1)

# 分离特征和目标变量

X = df.drop(columns=['next_chapter','user_id'])
y = df['next_chapter']

# 1. 重新映射类别
unique_chapters = sorted(df['next_chapter'].unique())
chapter_to_idx = {chapter: idx for idx, chapter in enumerate(unique_chapters)}
idx_to_chapter = {idx: chapter for idx, chapter in enumerate(unique_chapters)}

# 3. 验证映射是否连续
y = df['next_chapter'].map(chapter_to_idx)
print("Mapped unique values:", sorted(y.unique()))

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y ,random_state=42)


Mapped unique values: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(66), np.int64(67), np.int64(68), np.int64(69), np.int6

### train

In [None]:

# 创建并训练模型
model = XGBClassifier(objective='multi:softmax', num_class=len(y.unique()), use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)


### eval

In [None]:

# 进行预测
y_pred = model.predict(X_test)

# 评估模型
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))

### grid search

In [16]:
from sklearn.model_selection import train_test_split, GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}
model = XGBClassifier(objective='multi:softmax')

# 创建 GridSearchCV 对象
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')

# 在训练集上进行网格搜索
grid_search.fit(X_train, y_train)

# 输出最佳参数和最佳得分
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best score: 0.6395368072787427


In [17]:
optimal_model = XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    objective='multi:softmax', 
    num_class=len(y.unique()), 
    use_label_encoder=False, 
    eval_metric='mlogloss'
)
optimal_model.fit(X_train, y_train)
y_pred = optimal_model.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



### 结果

-  混淆矩阵热力图

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# 假设 y_test 是真实标签，y_pred 是预测标签
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


- 分类报告可视化

In [None]:
from yellowbrick.classifier import ClassificationReport
from sklearn.ensemble import RandomForestClassifier

# 创建分类器
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 创建可视化对象
visualizer = ClassificationReport(model, support=True)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()


-  ROC曲线（针对多分类）

In [None]:
import scikitplot as skplt
from sklearn.preprocessing import label_binarize

# 二值化标签
y_test_bin = label_binarize(y_test, classes=range(len(np.unique(y))))
y_probas = model.predict_proba(X_test)

# 绘制ROC曲线
skplt.metrics.plot_roc(y_test_bin, y_probas)
plt.show()


In [5]:
class_counts = df['next_chapter'].value_counts()
print(f"类别总数: {len(class_counts)}")
print("\n样本数分布:")
print(class_counts.describe())
single_sample_classes = class_counts[class_counts == 1]

print(f"单样本类别数量: {len(single_sample_classes)}")

类别总数: 391

样本数分布:
count    391.000000
mean      19.406650
std       57.336263
min        1.000000
25%        3.000000
50%        5.000000
75%       11.000000
max      547.000000
Name: count, dtype: float64
单样本类别数量: 59


In [8]:
min_samples = 2  # 每个类别最少样本数
valid_classes = class_counts[class_counts >= min_samples].index
filtered_df = df[df['next_chapter'].isin(valid_classes)]

print(f"\n过滤后的类别数: {len(valid_classes)}")
print(f"过滤后的样本数: {len(filtered_df)}")


过滤后的类别数: 332
过滤后的样本数: 7529


### 当成回归问题解

In [13]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

df  = pd.read_csv('D:\github_repo_forked\lifetime_value\kuaidian\clean\sliding_window_data_expanded.csv')


# 编码分类特征
categorical_features = ['os_version', 'device_brand', 'loc_city_id']
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(df[categorical_features])

# 将编码后的特征转换为DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

# 删除原始的分类特征列
df.drop(columns=categorical_features, inplace=True)

# 将编码后的特征与原始数据合并
df = pd.concat([df, encoded_df], axis=1)

# 分离特征和目标变量
X = df.drop(columns=['next_chapter','user_id'])
y = df['next_chapter']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建并训练模型
model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
model.fit(X_train, y_train)

# 进行预测
y_pred = model.predict(X_test)

# 评估模型
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Mean Absolute Error (MAE): {mae:.2f}')

Mean Squared Error (MSE): 3594.53
Mean Absolute Error (MAE): 27.06
