In [None]:
####01_data_splitting#######
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from pathlib import Path 
# --- Directory layout ---
# Everything is resolved relative to the current working directory: the
# project root is its parent, with "data" and "output" folders beneath it.
CURRENT_DIR = Path.cwd()
PROJECT_ROOT = CURRENT_DIR.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

# --- Input / output workbooks (all located in DATA_DIR) ---
file_path = DATA_DIR.joinpath("JM.xlsx")
dev_set_filename = DATA_DIR.joinpath("development_set.xlsx")
test_set_filename = DATA_DIR.joinpath("final_test_set.xlsx")

# --- Split parameters ---
target_column = "target"
test_set_size = 0.2  # fraction of rows held out as the final test set (20%)
random_seed = 42     # fixed seed so the split is reproducible

# --- 1. Load the data ---
# Only the read itself is wrapped in ``try`` so that anything reported as a
# loading error really comes from reading the workbook; the target-column
# check below reports its own message instead of being re-labelled as an
# Excel loading failure.
try:
    original_data_df = pd.read_excel(file_path)
except FileNotFoundError:
    print(f"错误: 文件未找到 {file_path}")
    # Abort with a non-zero status. ``SystemExit`` is a builtin, unlike
    # ``exit()``, which is a helper the ``site`` module adds for interactive use.
    raise SystemExit(1) from None
except Exception as e:
    # Top-level boundary: report any other read failure and stop.
    print(f"加载Excel文件时发生错误: {e}")
    raise SystemExit(1) from None

print(f"原始数据集加载成功，总样本数: {len(original_data_df)}")

# The later stratified split indexes the frame by ``target_column``, so stop
# here with a clear message if that column is absent.
if target_column not in original_data_df.columns:
    print(f"错误: 目标变量 '{target_column}' 不在提供的Excel文件中。请检查列名。")
    raise SystemExit(1)

# --- 2. Stratification labels ---
# Per the original note, this is a binary classification task, so the target
# column is used directly as the stratification key (no binning step).
stratify_column_data = original_data_df[target_column]


# --- 3. Perform the split ---
print("正在基于目标变量进行分层随机抽样...")
split_parts = train_test_split(
    original_data_df,
    stratify=stratify_column_data,
    random_state=random_seed,
    test_size=test_set_size,
)
# First part is the development set, second the held-out final test set.
development_df, final_test_df = split_parts
# --- 5. Report the outcome of the split ---
total_rows = len(original_data_df)
dev_rows = len(development_df)
test_rows = len(final_test_df)

print("\n--- 数据划分结果 ---")
print(f"开发集 (Development Set) 样本数: {dev_rows} ({dev_rows / total_rows * 100:.2f}%)")
print(f"最终测试集 (Final Hold-Out Test Set) 样本数: {test_rows} ({test_rows / total_rows * 100:.2f}%)")

# Class counts of the target column for the full data and for each subset.
print("\n目标变量在各集合中的分布（类别计数）:")
for heading, frame in (
    ("原始数据集:", original_data_df),
    ("\n开发集:", development_df),
    ("\n最终测试集:", final_test_df),
):
    print(heading)
    print(frame[target_column].value_counts())

# --- 6. Write both subsets to Excel workbooks ---
# Each entry: (frame to export, destination path, message printed on success).
export_jobs = (
    (development_df, dev_set_filename, f"\n开发集已成功保存到: {dev_set_filename}"),
    (final_test_df, test_set_filename, f"最终测试集已成功保存到: {test_set_filename}"),
)
try:
    # The loop lives inside the ``try`` so the first failure stops the
    # remaining exports and is reported once below.
    for frame, destination, success_message in export_jobs:
        frame.to_excel(destination, index=False)
        print(success_message)
except Exception as save_error:
    # Best-effort save: report the problem and let the script finish.
    print(f"\n保存文件时发生错误: {save_error}")

print("\n代码执行完毕。")

In [None]:
#####