### This script splits the ARM dataset into training and testing subsets for fine-tuning. 
Specifically, it allocates 50% of the data to the training set and the remaining 50% to the test set.
The in-order split ensures that at least one row is included in the training set, even if the dataset is small; the random split has no such guard.
The resulting subsets are saved as separate CSV files for further use in fine-tuning machine learning models.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# split in order

In [None]:
# Setpoint values; each one has its own CSV file that gets split
set_points = [0.1, 0.2, 0.4, 0.8, 1.0]

# Split every setpoint file 50%:50% in row order and save both halves.
# NOTE(review): the output paths are the same ones the random-split cell
# writes to, so whichever cell runs last overwrites the other's files.
for sp in set_points:
    # Input path (adjust to your own directory layout)
    in_path = f'/data/keeling/a/xx24/e/proj_ml/code_ml_surfactant_ccn/data/setpoint_{sp}.csv'
    # Output paths for the two subsets
    train_path = f'/data/keeling/a/xx24/e/proj_ml/code_ml_surfactant_ccn/data/setpoint_{sp}_train.csv'
    test_path  = f'/data/keeling/a/xx24/e/proj_ml/code_ml_surfactant_ccn/data/setpoint_{sp}_test.csv'

    # Load the raw observations
    df = pd.read_csv(in_path)

    # Despite its name, `percent` is a row count: half of the rows
    # (rounded down), but never fewer than one so the training set
    # is not empty for a tiny file
    percent = max(1, len(df) // 2)

    # No shuffling: the first `percent` rows form the training set and
    # everything after them forms the test set
    train = df.iloc[:percent]
    test = df.iloc[percent:]

    # Write both subsets as CSV without the row index
    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)

    print(f'Finished splitting setpoint_{sp}.csv → train ({len(train)} rows)  /  test ({len(test)} rows)')


Finished splitting setpoint_0.1.csv → train (890 rows)  /  test (891 rows)
Finished splitting setpoint_0.2.csv → train (890 rows)  /  test (891 rows)
Finished splitting setpoint_0.4.csv → train (818 rows)  /  test (818 rows)
Finished splitting setpoint_0.8.csv → train (885 rows)  /  test (886 rows)
Finished splitting setpoint_1.0.csv → train (681 rows)  /  test (681 rows)


# random split

In [2]:
# Setpoint values; each one has its own CSV file that gets split
set_points = [0.1, 0.2, 0.4, 0.8, 1.0]

# Randomly split every setpoint file 50%:50% and save both halves.
# NOTE(review): the output paths are the same ones the in-order split cell
# writes to, so whichever cell runs last overwrites the other's files.
for sp in set_points:
    # Input path (adjust to your own directory layout)
    in_path = f'/data/keeling/a/xx24/e/proj_ml/code_ml_surfactant_ccn/data/setpoint_{sp}.csv'
    # Output paths for the two subsets
    train_path = f'/data/keeling/a/xx24/e/proj_ml/code_ml_surfactant_ccn/data/setpoint_{sp}_train.csv'
    test_path  = f'/data/keeling/a/xx24/e/proj_ml/code_ml_surfactant_ccn/data/setpoint_{sp}_test.csv'

    # Load the raw observations
    df = pd.read_csv(in_path)

    # Random 50%:50% split; the fixed random_state makes the split
    # reproducible from run to run
    train, test = train_test_split(
        df,
        test_size=0.5,
        random_state=42,
    )

    # Write both subsets as CSV without the row index
    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)

    print(f'Finished splitting setpoint_{sp}.csv → train ({len(train)} rows)  /  test ({len(test)} rows)')

Finished splitting setpoint_0.1.csv → train (890 rows)  /  test (891 rows)
Finished splitting setpoint_0.2.csv → train (890 rows)  /  test (891 rows)
Finished splitting setpoint_0.4.csv → train (818 rows)  /  test (818 rows)
Finished splitting setpoint_0.8.csv → train (885 rows)  /  test (886 rows)
Finished splitting setpoint_1.0.csv → train (681 rows)  /  test (681 rows)
