In [7]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
import os

def load_adult_local(train_path="adult.data", test_path="adult.test"):
    columns = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
    ]

    # 讀取訓練與測試
    df_train = pd.read_csv(train_path, header=None,
                           names=columns, sep=',\s*', engine='python')
    df_test = pd.read_csv(test_path, header=0, names=columns,
                          sep=',\s*', engine='python', skiprows=1)

    # 清理資料（去除空白與 .）
    df_train = df_train.applymap(
        lambda x: x.strip() if isinstance(x, str) else x)
    df_test = df_test.applymap(
        lambda x: x.strip() if isinstance(x, str) else x)
    df_test['income'] = df_test['income'].str.replace(
        '.', '', regex=False).str.strip()

    return df_train, df_test

# ---------- 2. 前處理 ----------
def preprocess_data(df_train, df_test, output_dir="processed_data"):
    # 創建輸出目錄
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # 複製資料避免修改原始資料
    df_train = df_train.copy()
    df_test = df_test.copy()
    
    # 1. 缺失值處理 - 刪除包含 '?' 的列
    print(f"原始訓練集大小: {len(df_train)}")
    print(f"原始測試集大小: {len(df_test)}")
    
    # 刪除包含 '?' 的列
    df_train = df_train.replace('?', np.nan)
    df_test = df_test.replace('?', np.nan)
    
    df_train = df_train.dropna()
    df_test = df_test.dropna()
    
    print(f"刪除缺失值後訓練集大小: {len(df_train)}")
    print(f"刪除缺失值後測試集大小: {len(df_test)}")
    
    # 2. 特徵移除
    # 移除冗餘特徵 education 和無關特徵 fnlwgt
    columns_to_drop = ['education', 'fnlwgt']
    df_train = df_train.drop(columns=columns_to_drop)
    df_test = df_test.drop(columns=columns_to_drop)
    
    # 3. 特徵工程 - 合併 native-country
    def merge_country(country):
        if country == 'United-States':
            return 'United-States'
        else:
            return 'Other'
    
    df_train['native-country'] = df_train['native-country'].apply(merge_country)
    df_test['native-country'] = df_test['native-country'].apply(merge_country)
    
    # 4. 特徵工程 - 根據Kaggle做法創建新特徵
    # 創建資本收益特徵
    df_train['capital'] = df_train['capital-gain'] - df_train['capital-loss']
    df_test['capital'] = df_test['capital-gain'] - df_test['capital-loss']
    
    # 創建工作時數分組
    def hours_group(hours):
        if hours < 40:
            return 'less_than_40'
        elif hours == 40:
            return 'exactly_40'
        else:
            return 'more_than_40'
    
    df_train['hours-group'] = df_train['hours-per-week'].apply(hours_group)
    df_test['hours-group'] = df_test['hours-per-week'].apply(hours_group)
    
    # 年齡分組
    def age_group(age):
        if age < 30:
            return 'young'
        elif age < 50:
            return 'middle'
        else:
            return 'senior'
    
    df_train['age-group'] = df_train['age'].apply(age_group)
    df_test['age-group'] = df_test['age'].apply(age_group)
    
    # 準備特徵和標籤
    X_train = df_train.drop(columns=['income'])
    y_train = df_train['income']
    X_test = df_test.drop(columns=['income'])
    y_test = df_test['income']

    # 更新數值和類別特徵列表
    numeric_cols = ['age', 'education-num', 'capital-gain', 'capital-loss', 
                   'hours-per-week', 'capital']
    categorical_cols = ['workclass', 'marital-status', 'occupation', 'relationship', 
                       'race', 'sex', 'native-country', 'hours-group', 'age-group']

    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median'))
    ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    X_train_p = preprocessor.fit_transform(X_train)
    X_test_p = preprocessor.transform(X_test)

    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_test_enc = le.transform(y_test)
    
    # 取得特徵名稱
    feature_names = (numeric_cols + 
                    list(preprocessor.named_transformers_['cat']
                        .named_steps['onehot']
                        .get_feature_names_out(categorical_cols)))
    
    # 創建最終的訓練集和測試集 DataFrame
    df_train_final = pd.DataFrame(X_train_p, columns=feature_names)
    df_test_final = pd.DataFrame(X_test_p, columns=feature_names)
    
    # 添加目標變數
    df_train_final['income'] = y_train_enc
    df_test_final['income'] = y_test_enc
    
    # 輸出 Python 決策樹專用檔案（改名為 adult_train.csv 和 adult_test.csv）
    df_train_final.to_csv(f"{output_dir}/adult_train.csv", index=False)
    df_test_final.to_csv(f"{output_dir}/adult_test.csv", index=False)
    
    print(f"Python 決策樹訓練集: {output_dir}/adult_train.csv")
    print(f"Python 決策樹測試集: {output_dir}/adult_test.csv")
    print(f"訓練集形狀: {df_train_final.shape}")
    print(f"測試集形狀: {df_test_final.shape}")
    
    # 輸出 R C5.0 專用檔案（同樣使用 One-Hot 編碼，但分開特徵和標籤）
    print("\n生成 R C5.0 專用檔案...")
    
    # 分離特徵和標籤
    X_train_r = df_train_final.drop(columns=['income'])
    y_train_r = df_train_final['income']
    X_test_r = df_test_final.drop(columns=['income'])
    y_test_r = df_test_final['income']
    
    # 輸出 R 專用檔案
    X_train_r.to_csv(f"{output_dir}/X_train.csv", index=False)
    y_train_r.to_csv(f"{output_dir}/Y_train.csv", index=False, header=['income'])
    X_test_r.to_csv(f"{output_dir}/X_test.csv", index=False)
    y_test_r.to_csv(f"{output_dir}/Y_test.csv", index=False, header=['income'])
    
    print(f"R C5.0 特徵訓練集: {output_dir}/X_train.csv")
    print(f"R C5.0 標籤訓練集: {output_dir}/Y_train.csv")
    print(f"R C5.0 特徵測試集: {output_dir}/X_test.csv")
    print(f"R C5.0 標籤測試集: {output_dir}/Y_test.csv")

    return X_train_p, X_test_p, y_train_enc, y_test_enc, le, preprocessor, X_test, y_test, df_train_final, df_test_final

# 執行處理
df_train, df_test = load_adult_local()
X_train_p, X_test_p, y_train_enc, y_test_enc, le, preprocessor, X_test, y_test, df_train_final, df_test_final = preprocess_data(df_train, df_test)

print("\n" + "="*50)
print("CSV 檔案輸出完成！")
print("="*50)
print("\n生成的檔案：")
print("1. adult_train.csv - Python 決策樹訓練集 (特徵 + 標籤)")
print("2. adult_test.csv  - Python 決策樹測試集 (特徵 + 標籤)")
print("3. X_train.csv     - R C5.0 特徵訓練集 (One-Hot 編碼)")
print("4. Y_train.csv     - R C5.0 標籤訓練集 (0/1 編碼)")
print("5. X_test.csv      - R C5.0 特徵測試集 (One-Hot 編碼)")
print("6. Y_test.csv      - R C5.0 標籤測試集 (0/1 編碼)")


  names=columns, sep=',\s*', engine='python')
  sep=',\s*', engine='python', skiprows=1)
  df_train = df_train.applymap(
  df_test = df_test.applymap(


原始訓練集大小: 32561
原始測試集大小: 16280
刪除缺失值後訓練集大小: 30162
刪除缺失值後測試集大小: 15059
Python 決策樹訓練集: processed_data/adult_train.csv
Python 決策樹測試集: processed_data/adult_test.csv
訓練集形狀: (30162, 56)
測試集形狀: (15059, 56)

生成 R C5.0 專用檔案...
R C5.0 特徵訓練集: processed_data/X_train.csv
R C5.0 標籤訓練集: processed_data/Y_train.csv
R C5.0 特徵測試集: processed_data/X_test.csv
R C5.0 標籤測試集: processed_data/Y_test.csv

CSV 檔案輸出完成！

生成的檔案：
1. adult_train.csv - Python 決策樹訓練集 (特徵 + 標籤)
2. adult_test.csv  - Python 決策樹測試集 (特徵 + 標籤)
3. X_train.csv     - R C5.0 特徵訓練集 (One-Hot 編碼)
4. Y_train.csv     - R C5.0 標籤訓練集 (0/1 編碼)
5. X_test.csv      - R C5.0 特徵測試集 (One-Hot 編碼)
6. Y_test.csv      - R C5.0 標籤測試集 (0/1 編碼)
