In [45]:
import os, sys
import numpy as np
import pandas as pd
import warnings

# Suppress every Python warning from this point on.
warnings.filterwarnings("ignore")

# Random seed (presumably reused by later cells -- not referenced in this part).
seed = 66

# Directory holding the source data.
data_dir = './data'
# Raw horse-colic data file and its companion ".names" file.
data_horse_colic = f'{data_dir}/horse-colic.data'
data_horse_colic_names = f'{data_dir}/horse-colic.names'

# Directory for experiment results; created if it does not exist yet.
output_dir = './output'
os.makedirs(output_dir, exist_ok=True)

In [74]:
# Read the raw horse-colic file: whitespace-separated values, no header row.
# `sep=r'\s+'` is the documented replacement for `delim_whitespace=True`,
# which is deprecated since pandas 2.2.
horse_df = pd.read_csv(data_horse_colic, sep=r'\s+', header=None)
# The file marks missing values with '?'; turn them into NaN.
# Rebinding (instead of inplace=True) keeps the statement chain-friendly.
horse_df = horse_df.replace('?', np.nan)

# Attach column names (the file itself carries none).
column_names = [
    'surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse', 'respiratory_rate',
    'temp_extremities', 'peripheral_pulse', 'mucous_membranes', 'capillary_refill_time',
    'pain', 'peristalsis', 'abdominal_distension', 'nasogastric_tube', 'nasogastric_reflux',
    'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen', 'packed_cell_volume',
    'total_protein', 'abdominocentesis_appearance', 'abdomcentesis_total_protein',
    'outcome', 'surgical_lesion', 'lesion_site', 'lesion_type', 'lesion_subtype', 'cp_data'
]
horse_df.columns = column_names
# Inspect how many values are missing per column.
# (A bare `horse_df.head()` used to sit before this line; as a non-final
# expression it displayed nothing, so it was dropped.)
horse_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   surgery                      299 non-null    object
 1   age                          300 non-null    int64 
 2   hospital_number              300 non-null    int64 
 3   rectal_temp                  240 non-null    object
 4   pulse                        276 non-null    object
 5   respiratory_rate             242 non-null    object
 6   temp_extremities             244 non-null    object
 7   peripheral_pulse             231 non-null    object
 8   mucous_membranes             253 non-null    object
 9   capillary_refill_time        268 non-null    object
 10  pain                         245 non-null    object
 11  peristalsis                  256 non-null    object
 12  abdominal_distension         244 non-null    object
 13  nasogastric_tube             196 no

In [75]:
# Work on a copy: a plain `horse_data_cleaned = horse_df` only aliases the
# same DataFrame, so the correction below would also rewrite the raw frame.
horse_data_cleaned = horse_df.copy()

# Fix miscoded data: replace the value 9 in `age` with 2.
horse_data_cleaned["age"] = horse_data_cleaned["age"].replace(to_replace=9, value=2)
# Inspect the frame after the correction.
horse_data_cleaned.info()
horse_data_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   surgery                      299 non-null    object
 1   age                          300 non-null    int64 
 2   hospital_number              300 non-null    int64 
 3   rectal_temp                  240 non-null    object
 4   pulse                        276 non-null    object
 5   respiratory_rate             242 non-null    object
 6   temp_extremities             244 non-null    object
 7   peripheral_pulse             231 non-null    object
 8   mucous_membranes             253 non-null    object
 9   capillary_refill_time        268 non-null    object
 10  pain                         245 non-null    object
 11  peristalsis                  256 non-null    object
 12  abdominal_distension         244 non-null    object
 13  nasogastric_tube             196 no

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_extremities,peripheral_pulse,mucous_membranes,capillary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_site,lesion_type,lesion_subtype,cp_data
0,2,1,530101,38.5,66,28,3.0,3.0,,2,...,45,8.4,,,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,,,4.0,1,...,50,85.0,2.0,2.0,3,2,2208,0,0,2
2,2,1,530334,38.3,40,24,1.0,1.0,3.0,1,...,33,6.7,,,1,2,0,0,0,1
3,1,2,5290409,39.1,164,84,4.0,1.0,6.0,2,...,48,7.2,3.0,5.3,2,1,2208,0,0,1
4,2,1,530255,37.3,104,35,,,6.0,2,...,74,7.4,,,2,2,4300,0,0,2


In [84]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler


class CustomColumnTransformer:
    """Wrapper around sklearn's ``ColumnTransformer`` that returns a pandas
    DataFrame whose column names have the ``<transformer>__`` prefix removed.

    Parameters
    ----------
    transformations
        Passed as the first argument to ``ColumnTransformer``.
    remainder
        Forwarded as ``ColumnTransformer(remainder=...)``; defaults to
        ``"passthrough"``.
    names_mapper
        Callable applied to every un-prefixed column name; the default
        returns the name unchanged.
    """

    def __init__(self, transformations, remainder="passthrough", names_mapper=lambda x: x):
        self.transformations = transformations
        self.remainder = remainder
        self.names_mapper = names_mapper

    def transform(self, X):
        """Fit a fresh ``ColumnTransformer`` on ``X`` and return the transformed frame.

        A new transformer is built and fitted on every call (``fit_transform``);
        the fitted object is not kept on the instance.
        """
        column_transformer = ColumnTransformer(self.transformations, remainder=self.remainder)
        column_transformer.set_output(transform="pandas")
        transformed = column_transformer.fit_transform(X)

        # Drop everything up to and including the first '__' of each output
        # name, then let the caller-supplied mapper rename what is left.
        renamed = []
        for prefixed in column_transformer.get_feature_names_out():
            bare = prefixed[prefixed.index('__') + 2:]
            renamed.append(self.names_mapper(bare))
        transformed.columns = renamed
        return transformed


# Columns whose missing values are filled with the column mean.
meanColumns = ['respiratory_rate']
# Columns whose missing values are filled with the most frequent value.
frequentColumns = [
    'surgery', 'outcome', 'mucous_membranes', 'capillary_refill_time',
    'pain', 'peristalsis', 'nasogastric_tube', 'nasogastric_reflux',
    'rectal_exam_feces', 'abdomen', 'abdominocentesis_appearance',
    'abdominal_distension',
]
# Columns whose missing values are filled by KNN imputation.
knnColumns = [
    'rectal_temp', 'pulse', 'temp_extremities', 'peripheral_pulse',
    'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein',
    'abdomcentesis_total_protein',
]

# One imputer per strategy.
mean_imputer = SimpleImputer(strategy='mean')
frequent_imputer = SimpleImputer(strategy="most_frequent")
knn_imputer = KNNImputer(n_neighbors=3)

# Each entry is (step name, imputer, columns). Columns not listed here fall
# under the wrapper's default remainder="passthrough".
fillNaTransformer = CustomColumnTransformer(
    [
        ('fillWithMean', mean_imputer, meanColumns),
        ('fillWithFrequent', frequent_imputer, frequentColumns),
        ('fillWithKnn', knn_imputer, knnColumns),
    ]
)

# Impute, then inspect the result.
horse_data_cleaned = fillNaTransformer.transform(horse_data_cleaned)
horse_data_cleaned.info()
horse_data_cleaned

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respiratory_rate             300 non-null    float64
 1   surgery                      300 non-null    object 
 2   outcome                      300 non-null    object 
 3   mucous_membranes             300 non-null    object 
 4   capillary_refill_time        300 non-null    object 
 5   pain                         300 non-null    object 
 6   peristalsis                  300 non-null    object 
 7   nasogastric_tube             300 non-null    object 
 8   nasogastric_reflux           300 non-null    object 
 9   rectal_exam_feces            300 non-null    object 
 10  abdomen                      300 non-null    object 
 11  abdominocentesis_appearance  300 non-null    object 
 12  abdominal_distension         300 non-null    object 
 13  rectal_temp         

Unnamed: 0,respiratory_rate,surgery,outcome,mucous_membranes,capillary_refill_time,pain,peristalsis,nasogastric_tube,nasogastric_reflux,rectal_exam_feces,...,packed_cell_volume,total_protein,abdomcentesis_total_protein,age,hospital_number,surgical_lesion,lesion_site,lesion_type,lesion_subtype,cp_data
0,28.0,2,2,1,2,5,4,2,1,3,...,45.0,8.400000,2.0,1,530101,2,11300,0,0,2
1,20.0,1,3,4,1,3,4,2,1,4,...,50.0,85.000000,2.0,1,534817,2,2208,0,0,2
2,24.0,2,1,3,1,3,3,2,1,1,...,33.0,6.700000,2.0,1,530334,2,0,0,0,1
3,84.0,1,2,6,2,2,4,1,2,3,...,48.0,7.200000,5.3,2,5290409,1,2208,0,0,1
4,35.0,2,2,6,2,3,3,2,1,4,...,74.0,7.400000,2.0,1,530255,2,4300,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,70.0,1,3,4,2,2,4,2,1,4,...,55.0,65.000000,2.0,1,533886,2,3205,0,0,2
296,24.0,2,3,4,2,4,3,3,1,4,...,44.0,24.456929,3.3,1,527702,1,2208,0,0,1
297,30.0,1,2,4,1,4,4,2,1,3,...,60.0,6.800000,2.0,1,529386,1,3205,0,0,2
298,24.0,1,1,3,1,3,3,3,1,4,...,50.0,6.000000,3.4,1,530612,1,2208,0,0,1


In [86]:
# Standardize the columns that were filled by KNN imputation.
scale_columns = [
    'rectal_temp', 'pulse', 'temp_extremities', 'peripheral_pulse',
    'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein',
    'abdomcentesis_total_protein',
]

# Fit the scaler on the selected columns and write the scaled values back
# into the same columns of the frame.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(horse_data_cleaned[scale_columns])
horse_data_cleaned[scale_columns] = scaled_values
horse_data_cleaned

Unnamed: 0,respiratory_rate,surgery,outcome,mucous_membranes,capillary_refill_time,pain,peristalsis,nasogastric_tube,nasogastric_reflux,rectal_exam_feces,...,packed_cell_volume,total_protein,abdomcentesis_total_protein,age,hospital_number,surgical_lesion,lesion_site,lesion_type,lesion_subtype,cp_data
0,28.0,2,2,1,2,5,4,2,1,3,...,-0.131032,-0.620647,-0.279538,1,530101,2,11300,0,0,2
1,20.0,1,3,4,1,3,4,2,1,4,...,0.374803,2.340165,-0.279538,1,534817,2,2208,0,0,2
2,24.0,2,1,3,1,3,3,2,1,1,...,-1.345035,-0.686357,-0.279538,1,530334,2,0,0,0,1
3,84.0,1,2,6,2,2,4,1,2,3,...,0.172469,-0.667030,2.381445,2,5290409,1,2208,0,0,1
4,35.0,2,2,6,2,3,3,2,1,4,...,2.802810,-0.659300,-0.279538,1,530255,2,4300,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,70.0,1,3,4,2,2,4,2,1,4,...,0.880638,1.567107,-0.279538,1,533886,2,3205,0,0,2
296,24.0,2,3,4,2,4,3,3,1,4,...,-0.232199,0.000000,0.768728,1,527702,1,2208,0,0,1
297,30.0,1,2,4,1,4,4,2,1,3,...,1.386473,-0.682491,-0.279538,1,529386,1,3205,0,0,2
298,24.0,1,1,3,1,3,3,3,1,4,...,0.374803,-0.713414,0.849364,1,530612,1,2208,0,0,1
