In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import random
import json
from datetime import datetime
import os

In [2]:
# Seed the random number generators
def set_seed(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Args:
        seed: Value passed to ``random.seed`` and then ``np.random.seed``.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Load labeled data from CSV
def load_data_from_csv(csv_path):
    """Read a labeled CSV file and split it into features and labels.

    A ``filename`` column, when present, is discarded. Every column other
    than ``label`` is treated as a feature. A short summary is printed.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A tuple ``(X, y, feature_names)``: the feature values as an array,
        the ``label`` column as an array, and the feature column names as a
        list in file order.

    Raises:
        ValueError: If the CSV has no ``label`` column.
    """
    frame = pd.read_csv(csv_path)

    print(f"Loaded data from: {csv_path}")
    print(f"Total samples: {len(frame)}")

    # The filename column is not a feature
    if 'filename' in frame.columns:
        frame = frame.drop(columns=['filename'])

    # Without labels there is nothing to separate
    if 'label' not in frame.columns:
        raise ValueError("'label' column not found in CSV file")

    # Split labels from features
    feature_frame = frame.drop(columns=['label'])
    y = frame['label'].values
    X = feature_frame.values
    feature_names = feature_frame.columns.tolist()

    total = len(y)
    n_pos = np.sum(y == 1)
    n_neg = np.sum(y == 0)

    print(f"Features: {len(feature_names)}")
    print(f"Positive samples: {n_pos} ({n_pos/total*100:.2f}%)")
    print(f"Negative samples: {n_neg} ({n_neg/total*100:.2f}%)")

    return X, y, feature_names

# Seed the random number generators when this cell runs
set_seed(seed=42)

In [3]:
# Load unlabeled data
def load_unlabeled_data(csv_path, feature_names):
    """Load an unlabeled feature table from CSV, aligned to the training features.

    ``filename`` and ``label`` columns are removed when present. The remaining
    columns are then selected in exactly the order given by ``feature_names``,
    so the returned matrix lines up column-for-column with the training data
    even when the CSV stores the same features in a different order.

    Args:
        csv_path: Path of the CSV file to read.
        feature_names: Feature column names in the order used for training.

    Returns:
        A tuple ``(X, filenames)``: ``X`` holds the feature values with
        columns ordered as ``feature_names``; ``filenames`` holds the values
        of the ``filename`` column, or ``None`` when the CSV has none.

    Raises:
        KeyError: If a name in ``feature_names`` is not a column of the CSV
            (raised by the pandas column selection).
    """
    df = pd.read_csv(csv_path)

    print(f"\nLoaded unlabeled data from: {csv_path}")
    print(f"Total samples: {len(df)}")

    # Keep the file names (if any) so predictions can be traced back to rows
    filenames = df['filename'].values if 'filename' in df.columns else None

    # Drop the non-feature columns that happen to be present
    cols_to_drop = [col for col in ('filename', 'label') if col in df.columns]
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop)

    # Compare ordered lists, not sets: a pure permutation of the columns must
    # also be realigned, otherwise features are fed to the model in the wrong
    # positions without any error.
    expected_columns = list(feature_names)
    if df.columns.tolist() != expected_columns:
        print("Warning: Feature names don't match exactly. Reordering columns...")
        df = df[expected_columns]

    X = df.values
    print(f"Features: {X.shape[1]}")

    return X, filenames

In [4]:
# XGBoost training function
def train_xgboost_full(X_train, y_train, seed=51, num_boost_round=100):
    """Train an XGBoost binary classifier on all of the supplied data.

    Args:
        X_train: Feature matrix passed to ``xgb.DMatrix``.
        y_train: Labels passed to ``xgb.DMatrix`` as ``label``.
        seed: Value stored under the ``'seed'`` training parameter.
        num_boost_round: Number of boosting rounds (default 100).

    Returns:
        The object returned by ``xgb.train``.
    """
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 9,
        'learning_rate': 0.042,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'scale_pos_weight': 2.23,
        'min_child_weight': 1,
        'gamma': 0.2,
        'seed': seed
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)

    print("\nTraining XGBoost model on full dataset...")
    # verbose_eval only prints when at least one evaluation set is supplied,
    # so the training set is registered as one to log the metric every 10 rounds.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, 'train')],
        verbose_eval=10,
    )

    return model
# Model prediction function
def predict_xgboost(model, X_test, threshold=0.5):
    """Predict with a trained XGBoost model.

    Args:
        model: Object whose ``predict`` method accepts an ``xgb.DMatrix``.
        X_test: Feature matrix passed to ``xgb.DMatrix``.
        threshold: Scores greater than or equal to this value are labeled 1,
            all others 0. Defaults to 0.5.

    Returns:
        A tuple ``(y_pred, y_pred_proba)``: the integer 0/1 labels and the
        raw output of ``model.predict``.
    """
    dtest = xgb.DMatrix(X_test)
    y_pred_proba = model.predict(dtest)
    y_pred = (y_pred_proba >= threshold).astype(int)
    return y_pred, y_pred_proba

# Save the model
def save_model(model, model_path, metadata=None):
    """Save a model and, optionally, a JSON metadata file next to it.

    The metadata file is named after the model file with its extension
    replaced by ``_metadata.json`` (``m.json`` -> ``m_metadata.json``), so it
    never shares a path with the model file, whatever the model's extension.

    Args:
        model: Object exposing ``save_model(path)``.
        model_path: Destination path of the model file; its parent directory
            is created when missing.
        metadata: JSON-serializable object to store alongside the model.
            Nothing is written when it is ``None`` or empty.
    """
    # Create the target directory (current directory when the path has none)
    os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)

    # Save the model
    model.save_model(model_path)
    print(f"\nModel saved to: {model_path}")

    # Save the metadata
    if metadata:
        # Strip only the final extension: substring replacement would leave
        # paths with other extensions unchanged (clobbering the model file)
        # and could rewrite matching text inside directory names.
        metadata_path = os.path.splitext(model_path)[0] + '_metadata.json'
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=4)
        print(f"Metadata saved to: {metadata_path}")

# Save the prediction results
def save_predictions(filenames, y_pred, y_pred_proba, output_path):
    """Write predictions to a CSV file and print summary statistics.

    Args:
        filenames: One identifier per prediction, or ``None`` to use the row
            numbers ``0..n-1`` instead.
        y_pred: Predicted labels (any array-like).
        y_pred_proba: Predicted scores (any array-like).
        output_path: Destination CSV path; its parent directory is created
            when missing.

    The CSV has the columns ``filename``, ``predicted_label`` and
    ``probability``. With zero predictions the file holds only the header and
    the statistics are skipped.
    """
    # Accept lists as well as arrays: the element-wise comparisons and the
    # min/max/mean calls below need ndarray semantics.
    y_pred = np.asarray(y_pred)
    y_pred_proba = np.asarray(y_pred_proba)
    n_total = len(y_pred)

    # Build the result table
    results_df = pd.DataFrame({
        'filename': filenames if filenames is not None else range(n_total),
        'predicted_label': y_pred,
        'probability': y_pred_proba
    })

    # Save to CSV
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    results_df.to_csv(output_path, index=False)
    print(f"\nPredictions saved to: {output_path}")

    # Print statistics
    print(f"Total predictions: {len(results_df)}")
    if n_total == 0:
        # Percentages and min/max are undefined for an empty result set
        print("No predictions to summarize.")
        return

    n_positive = np.sum(y_pred == 1)
    n_negative = np.sum(y_pred == 0)
    print(f"Predicted positive: {n_positive} ({n_positive/n_total*100:.2f}%)")
    print(f"Predicted negative: {n_negative} ({n_negative/n_total*100:.2f}%)")
    print(f"Probability range: [{y_pred_proba.min():.4f}, {y_pred_proba.max():.4f}]")
    print(f"Mean probability: {y_pred_proba.mean():.4f}")

In [5]:
# Main program
def main():
    """Run the pipeline end to end: train on labeled data, score unlabeled data.

    In order: seed the RNGs, load the training CSV, train the model, save it
    with a metadata dictionary, load the unlabeled CSV, predict, and write the
    predictions to CSV. A banner is printed before each step and once more at
    the end. Returns nothing.
    """
    def announce(title, leading_blank=True):
        # Print a title framed by two 60-character rules; every banner but the
        # first is preceded by an empty line.
        rule = "=" * 60
        print(("\n" + rule) if leading_blank else rule)
        print(title)
        print(rule)

    # Set the random seed
    set_seed(42)

    # Input and output locations
    train_csv = 'data/dataset/features_final_38.csv'
    unlabeled_csv = 'data/dataset/features_38_unlabeled.csv'  # change to your unlabeled data path
    model_path = 'models/xgboost_full_model.json'
    output_csv = 'results/predictions.csv'

    # 1. Load the training data
    announce("Step 1: Loading training data", leading_blank=False)
    X_train, y_train, feature_names = load_data_from_csv(train_csv)

    # 2. Train the model
    announce("Step 2: Training model on full dataset")
    model = train_xgboost_full(X_train, y_train, seed=51)

    # 3. Save the model together with a description of the training run
    announce("Step 3: Saving model")
    # Hyperparameter values recorded in the metadata file
    hyperparameters = {
        'max_depth': 9,
        'learning_rate': 0.042,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'scale_pos_weight': 2.23,
        'min_child_weight': 1,
        'gamma': 0.2,
        'num_boost_round': 100
    }
    metadata = {
        'train_samples': len(X_train),
        'n_features': len(feature_names),
        'feature_names': feature_names,
        # Cast to plain int so the counts are JSON-serializable
        'positive_samples': int(np.sum(y_train == 1)),
        'negative_samples': int(np.sum(y_train == 0)),
        'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'model_params': hyperparameters
    }
    save_model(model, model_path, metadata)

    # 4. Load the unlabeled data
    announce("Step 4: Loading unlabeled data")
    X_unlabeled, filenames = load_unlabeled_data(unlabeled_csv, feature_names)

    # 5. Predict
    announce("Step 5: Making predictions")
    y_pred, y_pred_proba = predict_xgboost(model, X_unlabeled)

    # 6. Save the predictions
    announce("Step 6: Saving predictions")
    save_predictions(filenames, y_pred, y_pred_proba, output_csv)

    announce("Pipeline completed successfully!")

In [None]:
# Run the pipeline only when this code is executed directly, not when it is imported
if __name__ == "__main__":
    main()