In [None]:
# =============================================================================
# 导入所有需要的包
# =============================================================================
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder

In [2]:
# =============================================================================
# 数据读取函数 (当第一行没有列标签时使用)
# =============================================================================
def read_data_no_header(file_path, column_names, separator=','):
    """
    Load a delimited text file whose first row is data, not a header.

    Args:
        file_path (str): Full path of the data file.
        column_names (list): Names to assign to the columns, in file order.
        separator (str): Field delimiter used in the file; defaults to a comma.

    Returns:
        pandas.DataFrame | None: The loaded data, or None when ``file_path``
        does not exist (an error message is printed in that case).
    """
    try:
        frame = pd.read_csv(file_path, sep=separator, header=None, names=column_names)
    except FileNotFoundError:
        # Best-effort loader: report the problem and let the caller test for None.
        print(f"错误：文件未找到于路径 '{file_path}'")
        return None

    n_rows, n_cols = frame.shape
    print(f"文件 '{os.path.basename(file_path)}' 读取成功，共 {n_rows} 行, {n_cols} 列。")
    return frame

In [3]:
# =============================================================================
# 数据读取函数 (当第一行有列标签时使用)
# =============================================================================
def read_data_with_header(file_path, separator=','):
    """
    Load a delimited text file whose first row holds the column labels.

    Args:
        file_path (str): Full path of the data file.
        separator (str): Field delimiter used in the file; defaults to a comma.

    Returns:
        pandas.DataFrame | None: The loaded data, or None when ``file_path``
        does not exist (an error message is printed in that case).
    """
    try:
        frame = pd.read_csv(file_path, sep=separator)
    except FileNotFoundError:
        # Best-effort loader: report the problem and let the caller test for None.
        print(f"错误：文件未找到于路径 '{file_path}'")
        return None

    n_rows, n_cols = frame.shape
    print(f"文件 '{os.path.basename(file_path)}' 读取成功，共 {n_rows} 行, {n_cols} 列。")
    return frame

In [4]:
# =============================================================================
# 根据列的索引(index)删除一列或多列
# =============================================================================
def drop_columns_by_index(df, indices_to_drop):
    """
    Return a copy of ``df`` without the columns at the given positions.

    Columns are removed strictly by position. Dropping by the label found at
    a position would also remove every other column that shares that label,
    which is wrong for frames with duplicate column names.

    Args:
        df (pandas.DataFrame): Frame to operate on; it is not modified.
        indices_to_drop (list): Positional column indices to remove,
            e.g. ``[0, 5]``. Negative positions count from the end.

    Returns:
        pandas.DataFrame: A new frame containing the remaining columns.

    Raises:
        IndexError: If a position is outside the range of existing columns.
    """
    # Labels are looked up only for the log message below.
    columns_to_drop = df.columns[indices_to_drop]

    # Build a positional keep-mask so that only the requested positions go away.
    keep_mask = np.ones(df.shape[1], dtype=bool)
    keep_mask[indices_to_drop] = False
    df_dropped = df.iloc[:, keep_mask].copy()

    print(f"成功删除索引为 {indices_to_drop} 的列: {list(columns_to_drop)}")
    return df_dropped

In [5]:
# =============================================================================
# 常见的缺失值处理函数
# =============================================================================
def handle_missing_values(df, missing_value_marker='?', strategy='drop_row'):
    """
    Return a cleaned copy of ``df`` with its missing values handled.

    The caller's frame is left untouched, so re-running a notebook cell that
    calls this function always starts from the same input.

    Args:
        df (pandas.DataFrame): Frame to clean; it is not modified.
        missing_value_marker (str): Token that marks a missing value in the
            file (``'?'`` by default).
        strategy (str): How to treat missing values.
            'drop_row': drop every row that contains a missing value (default).
            'fill_mode': fill each affected column with that column's mode.
            Any other value only converts the marker to NaN and resets the
            index.

    Returns:
        pandas.DataFrame: A new frame with a fresh 0..n-1 index.
    """
    # Step 1: normalise the marker to NaN; replace() returns a new frame.
    df_clean = df.replace(missing_value_marker, np.nan)

    if strategy == 'drop_row':
        original_rows = df_clean.shape[0]
        df_clean = df_clean.dropna()
        print(f"执行'drop_row'策略，删除了 {original_rows - df_clean.shape[0]} 行。")

    elif strategy == 'fill_mode':
        for column in df_clean.columns:
            if df_clean[column].isnull().any():
                modes = df_clean[column].mode()
                if modes.empty:
                    # A column with no observed values has no mode; leave it as-is.
                    print(f"列 '{column}' 全部为缺失值，无法计算众数，已跳过。")
                    continue
                mode_value = modes.iloc[0]
                # Assign the result back: fillna(inplace=True) on a column
                # selection is chained assignment and does not reliably
                # update the parent frame.
                df_clean[column] = df_clean[column].fillna(mode_value)
                print(f"执行'fill_mode'策略，使用值 '{mode_value}' 填充了列 '{column}' 的缺失值。")

    # Rebuild a contiguous index, since rows may have been dropped.
    return df_clean.reset_index(drop=True)

In [6]:
# =============================================================================
# 将分类/文本数据转换为数值型数据
# =============================================================================
def encode_categorical_columns(df, columns_to_encode):
    """
    Label-encode the given categorical columns into integer codes.

    Args:
        df (pandas.DataFrame): Frame to operate on; it is copied, not modified.
        columns_to_encode (list): Names of the columns to convert.

    Returns:
        pandas.DataFrame: A copy of ``df`` with the listed columns replaced by
        their label-encoded values.
    """
    encoded = df.copy()
    encoder = LabelEncoder()
    for name in columns_to_encode:
        # fit_transform re-fits the encoder, so each column gets its own mapping.
        codes = encoder.fit_transform(encoded[name])
        encoded[name] = codes
        print(f"成功将列 '{name}' 进行了Label Encoding。")
    return encoded

In [7]:
# =============================================================================
# 用于显示DataFrame基本信息的辅助函数
# =============================================================================
def display_df_info(df, df_name="DataFrame"):
    """
    Show the head, ``info()`` summary and descriptive statistics of a frame.

    Relies on the ``display`` function that IPython/Jupyter makes available.

    Args:
        df (pandas.DataFrame): Frame to summarise.
        df_name (str): Label shown in the banner above the summary.
    """
    rule = "=" * 30
    print(rule, f" {df_name} 的基本信息", rule, sep="\n")

    # Each section is a heading followed by the action that renders it.
    sections = (
        ("\n--- 前5行数据 (Head) ---", lambda: display(df.head())),
        ("\n--- 数据信息 (Info) ---", lambda: df.info()),
        ("\n--- 描述性统计 (Describe) ---", lambda: display(df.describe(include='all'))),
    )
    for heading, render in sections:
        print(heading)
        render()

    print("\n\n")

In [14]:
# Dataset to process: names both the data sub-directory and the files inside it.
data_name = "zoo"
DATA_DIR = f"../data/{data_name}/"

In [16]:
# =============================================================================
# Main program - set the data directory above, then call the helper functions
# =============================================================================

# =============================================================================
# Example 1: process a .data dataset (the first row has no column labels)
# =============================================================================
print(f"---------- 开始处理 {data_name} 数据集 ----------")

# File location and the column names to assign, in file order.
data_filename = f"{data_name}.data"
data_filepath = os.path.join(DATA_DIR, data_filename)
data_column_names = (
    "animal_name hair feathers eggs milk airborne aquatic predator toothed "
    "backbone breathes venomous fins legs tail domestic catsize type"
).split()

# Load the raw data; the reader returns None when the file is missing.
data_df = read_data_no_header(data_filepath, data_column_names)

if data_df is not None:
    # Summarise the raw data.
    display_df_info(data_df, "原始 data 数据")

    # Drop the first column ('animal_name') by position: it is an identifier
    # and normally does not take part in model training.
    data_df_processed = drop_columns_by_index(data_df, [0])

    # Summarise the data again after the column has been removed.
    display_df_info(data_df_processed, "删除animal_name列后的 data 数据")

    # Persist the processed data next to the raw file as <data_name>_processed.csv.
    processed_filepath = os.path.join(DATA_DIR, f"{data_name}_processed.csv")
    data_df_processed.to_csv(processed_filepath, index=False)

---------- 开始处理 zoo 数据集 ----------
文件 'zoo.data' 读取成功，共 101 行, 18 列。
 原始 data 数据 的基本信息

--- 前5行数据 (Head) ---


Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1



--- 数据信息 (Info) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal_name  101 non-null    object
 1   hair         101 non-null    int64 
 2   feathers     101 non-null    int64 
 3   eggs         101 non-null    int64 
 4   milk         101 non-null    int64 
 5   airborne     101 non-null    int64 
 6   aquatic      101 non-null    int64 
 7   predator     101 non-null    int64 
 8   toothed      101 non-null    int64 
 9   backbone     101 non-null    int64 
 10  breathes     101 non-null    int64 
 11  venomous     101 non-null    int64 
 12  fins         101 non-null    int64 
 13  legs         101 non-null    int64 
 14  tail         101 non-null    int64 
 15  domestic     101 non-null    int64 
 16  catsize      101 non-null    int64 
 17  type         101 non-null    int64 
dtypes: int64(17), object(1)
memory usage: 14.3+ KB

-

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
count,101,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
unique,100,,,,,,,,,,,,,,,,,
top,frog,,,,,,,,,,,,,,,,,
freq,2,,,,,,,,,,,,,,,,,
mean,,0.425743,0.19802,0.584158,0.405941,0.237624,0.356436,0.554455,0.60396,0.821782,0.792079,0.079208,0.168317,2.841584,0.742574,0.128713,0.435644,2.831683
std,,0.496921,0.400495,0.495325,0.493522,0.42775,0.481335,0.499505,0.491512,0.384605,0.407844,0.27141,0.376013,2.033385,0.439397,0.336552,0.498314,2.102709
min,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0
50%,,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0,2.0
75%,,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,1.0,4.0





成功删除索引为 [0] 的列: ['animal_name']
 删除animal_name列后的 data 数据 的基本信息

--- 前5行数据 (Head) ---


Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1



--- 数据信息 (Info) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 17 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   hair      101 non-null    int64
 1   feathers  101 non-null    int64
 2   eggs      101 non-null    int64
 3   milk      101 non-null    int64
 4   airborne  101 non-null    int64
 5   aquatic   101 non-null    int64
 6   predator  101 non-null    int64
 7   toothed   101 non-null    int64
 8   backbone  101 non-null    int64
 9   breathes  101 non-null    int64
 10  venomous  101 non-null    int64
 11  fins      101 non-null    int64
 12  legs      101 non-null    int64
 13  tail      101 non-null    int64
 14  domestic  101 non-null    int64
 15  catsize   101 non-null    int64
 16  type      101 non-null    int64
dtypes: int64(17)
memory usage: 13.5 KB

--- 描述性统计 (Describe) ---


Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,0.425743,0.19802,0.584158,0.405941,0.237624,0.356436,0.554455,0.60396,0.821782,0.792079,0.079208,0.168317,2.841584,0.742574,0.128713,0.435644,2.831683
std,0.496921,0.400495,0.495325,0.493522,0.42775,0.481335,0.499505,0.491512,0.384605,0.407844,0.27141,0.376013,2.033385,0.439397,0.336552,0.498314,2.102709
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0,2.0
75%,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,1.0,4.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,7.0







In [None]:
# =============================================================================
# Example 2: process a .csv dataset whose columns all need label encoding
# (Demo only: it needs a matching .csv file and the correct column names.)
# =============================================================================
print(f"\n\n---------- 开始处理 {data_name} 数据集 ----------")
data_filename = f"{data_name}.csv"
data_filepath = os.path.join(DATA_DIR, data_filename)

# The .csv file is read as headerless, so the column names are supplied here.
# NOTE(review): these names do not match the zoo columns used above while
# data_name is still "zoo" - confirm the intended dataset before running.
# NOTE(review): this cell rebinds data_df / data_df_processed from example 1.
data_column_names = "buying maint doors persons lug_boot safety class".split()
data_df = read_data_no_header(data_filepath, data_column_names)

if data_df is not None:
    display_df_info(data_df, "原始csv数据")

    # Every column is treated as categorical, so all of them are encoded.
    data_df_processed = encode_categorical_columns(data_df, data_column_names)

    display_df_info(data_df_processed, "编码后的csv数据")