In [1]:
import pandas as pd
# 读取CSV文件
df = pd.read_csv("data/train.csv")

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

 

# 步骤1: 数据预处理
# 分割心跳信号序列为数值列表
df['heartbeat_signals'] = df['heartbeat_signals'].apply(lambda x: [float(i) for i in x.split(',')])

# 将列表转换为DataFrame的列
max_length = max(df['heartbeat_signals'].apply(len))  # 找到最长序列的长度
signals_df = pd.DataFrame(df['heartbeat_signals'].tolist()).fillna(0)  # 用0填充缺失值
df = pd.concat([df, signals_df], axis=1)

# 删除原始的heartbeat_signals列
df = df.drop('heartbeat_signals', axis=1)

In [3]:
from imblearn.over_sampling import SMOTE
# 步骤3: 模型训练
X = df.drop(['id', 'label'], axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 使用SMOTE进行过采样处理类别不平衡，只对训练集进行处理
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import numpy as np
import xgboost as xgb

kf = KFold(n_splits=5, shuffle=True, random_state=42)

abs_sum_scores = []  # 存储每一折的评分

for train_index, test_index in kf.split(X):
    X_resampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_resampled, y_test = y[train_index], y[test_index]
    # 使用找到的最佳参数设置模型
    best_params = {
    'colsample_bytree': 0.7692681476866446,
    'learning_rate': 0.0823076398078035,
    'max_depth': 6,
    'min_child_weight': 7,
    'n_estimators': 527,
    'subsample': 0.848553073033381,
    'use_label_encoder': False,
    'eval_metric': 'mlogloss'
   }

    # 初始化XGBoost模型
    model = xgb.XGBClassifier(**best_params)
    
    model.fit(X_resampled, y_resampled)
    
    # 预测概率
    y_pred_proba = model.predict_proba(X_test)
    
    # 计算abs-sum
    # 首先，我们需要将y_test转换为one-hot编码形式，以匹配y_pred_proba的格式
    y_test_one_hot = np.zeros((y_test.size, y_pred_proba.shape[1]))
    y_test_one_hot[np.arange(y_test.size), y_test.astype(int)] = 1
    
    abs_sum = np.abs(y_test_one_hot - y_pred_proba).sum() / y_test.size
    abs_sum_scores.append(abs_sum)

# 计算平均abs-sum分数
average_abs_sum = np.mean(abs_sum_scores)
print(f"Average Abs-Sum Score: {average_abs_sum}")


Average Abs-Sum Score: 0.04418875475363426


In [8]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import numpy as np
import xgboost as xgb

kf = KFold(n_splits=5, shuffle=True, random_state=42)

abs_sum_scores = []  # 存储每一折的评分

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # 使用找到的最佳参数设置模型
    best_params = {
    'colsample_bytree': 0.7692681476866446,
    'learning_rate': 0.0823076398078035,
    'max_depth': 6,
    'min_child_weight': 7,
    'n_estimators': 527,
    'subsample': 0.848553073033381,
    'use_label_encoder': False,
    'eval_metric': 'mlogloss'
   }

    # 初始化XGBoost模型
    model = xgb.XGBClassifier(**best_params)
    
    model.fit(X_train, y_train)
    
    # 预测概率
    y_pred_proba = model.predict_proba(X_test)
    
    # 计算abs-sum
    # 首先，我们需要将y_test转换为one-hot编码形式，以匹配y_pred_proba的格式
    y_test_one_hot = np.zeros((y_test.size, y_pred_proba.shape[1]))
    y_test_one_hot[np.arange(y_test.size), y_test.astype(int)] = 1
    
    abs_sum = np.abs(y_test_one_hot - y_pred_proba).sum() / y_test.size
    abs_sum_scores.append(abs_sum)

# 计算平均abs-sum分数
average_abs_sum = np.mean(abs_sum_scores)
print(f"Average Abs-Sum Score: {average_abs_sum}")

 

KeyboardInterrupt: 

In [6]:
pip install lightgbm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting lightgbm
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ba/11/cb8b67f3cbdca05b59a032bb57963d4fe8c8d18c3870f30bed005b7f174d/lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m200.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-4.3.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
# 创建LightGBM数据集
train_data = lgb.Dataset(X_resampled, label=y_resampled)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# 使用默认参数
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y_resampled)),
    'metric': 'multi_logloss',
    'verbosity': -1
}

# 训练模型
gbm = lgb.train(params, train_data, num_boost_round=100, valid_sets=[test_data])

# 预测测试集
y_pred_proba = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# 计算测试集的对数损失
log_loss_score = log_loss(y_test, y_pred_proba)
print(f"Test Log Loss: {log_loss_score}")

# 首先，将实际标签转换为one-hot编码形式
num_classes = np.unique(y_resampled).shape[0]  # 假设所有类别都出现在y_resampled中
y_test_one_hot = np.zeros((y_test.shape[0], num_classes))
y_test_one_hot[np.arange(y_test.shape[0]), y_test.astype(int)] = 1

# 计算预测概率与实际标签之间的abs-sum
abs_sum = np.sum(np.abs(y_pred_proba - y_test_one_hot)) / y_test.shape[0]

print(f"Average Abs-Sum: {abs_sum}")


Test Log Loss: 0.02038390054920139
Average Abs-Sum: 0.035310493539996254
