In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer


In [2]:
file_path = './Bigdata/unscaled_data.xlsx'
df = pd.read_excel(file_path)

In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop('PATIENT_ID',axis=1,inplace=True)

In [4]:
binary_cols = df.columns[df.nunique() == 2].tolist()
numeric_cols = df.columns[df.nunique()!=2].tolist()
# 用KNN填充数据 k=5
imputer = KNNImputer(n_neighbors=5)  # 假设选择 5 个最近邻
df_complete_knn = imputer.fit_transform(df)

col = df.columns.tolist()
df = pd.DataFrame(df_complete_knn, columns=col)
for col in binary_cols:
    # 判断值不等于0或1的情况，然后按比例修改值
    df[col] = df[col].apply(lambda x: 1 if x > 0.3 else (0 if x < 0.3 else x))

In [None]:
target_column_name=['癌', '癌前病变', '良性疾病','健康或非结肠疾病']

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

X = df_complete_knn.drop(target_column_name, axis=1) 
y = df_complete_knn["癌"] 
logistic_regression = LogisticRegression(max_iter=100000)

# 使用交叉验证计算准确率（假设使用 5 折交叉验证）
cv_scores = cross_val_score(logistic_regression, X, y, cv=5, scoring='accuracy')
mean_accuracy = cv_scores.mean()
print("平均准确率:", mean_accuracy)
cv_scores1 = cross_val_score(logistic_regression, X, y, cv=5, scoring='precision')
print("平均精准度:", cv_scores1)
print("平均精准度:", cv_scores1.mean())

平均准确率: 0.8010611986343376
平均精准度: [0.76637555 0.75307951 0.79242081 0.79631636 0.81296615]
平均精准度: 0.7842316755244382


In [20]:
## 标准化
from sklearn.preprocessing import StandardScaler
# 选取了非0/1变量，非因变量的索引，用来做标准化
columns_to_normalize = df.columns[0:50].append(df.columns[51:55])
scaler = StandardScaler()
df_scaled=df.copy()
df_scaled[columns_to_normalize] = scaler.fit_transform(df_scaled[columns_to_normalize])
Xs = df_scaled.drop(target_column_name, axis=1)  
ys = df_scaled["癌"]  

logistic_regression_scaled = LogisticRegression(max_iter=100000)
# 使用交叉验证计算准确率（假设使用 5 折交叉验证）
cv_scores = cross_val_score(logistic_regression_scaled, Xs, ys, cv=5, scoring='accuracy')
mean_accuracy = cv_scores.mean()
print("平均准确率:", mean_accuracy)
cv_scores1 = cross_val_score(logistic_regression, X, y, cv=5, scoring='precision')
print("平均精准度:", cv_scores1.mean())


平均准确率: 0.8004977478199097
平均精准度: 0.7842316755244382


In [22]:
from sklearn.feature_selection import SequentialFeatureSelector
# 创建 SequentialFeatureSelector
sfs = SequentialFeatureSelector(logistic_regression, direction='forward')
# 使用逐步回归进行特征选择 
sfs.fit(Xs, ys)
# 输出最终选择的特征索引
selected_feature_indices = sfs.get_support(indices=True)
selected_features = Xs.columns[selected_feature_indices]
print("最终选择的特征:", selected_features)


最终选择的特征: Index(['血_中性粒细胞计数', '血_嗜酸粒细胞计数', '血_嗜酸粒细胞', '血_平均红细胞血红蛋白含量', '血_红细胞分布宽度',
       '血_钠', '血_葡萄糖', '血_肌酐', '血_糖类抗原CA199', '血_总胆红素', '血_直接胆红素',
       '血_门冬氨酸氨基转移酶', '血_总胆汁酸', '血_γ-谷氨酰转肽酶', '血_前白蛋白', '血_乳酸脱氢酶', '血_间接胆红素',
       '尿_比重', '尿_酸碱度', '尿_葡萄糖', '血_癌胚抗原', '体重', '身高', '年龄', 'BMI', '尿_白细胞',
       '尿_亚硝酸盐', '尿_酮体', '尿_胆红素', '粪_粪粘度', '粪_粪血液', '粪_粪红细胞', '粪_粪颜色', '粪_粪隐血',
       '粪_粪转铁蛋白'],
      dtype='object')


In [23]:
selected_features.tolist()

['血_中性粒细胞计数',
 '血_嗜酸粒细胞计数',
 '血_嗜酸粒细胞',
 '血_平均红细胞血红蛋白含量',
 '血_红细胞分布宽度',
 '血_钠',
 '血_葡萄糖',
 '血_肌酐',
 '血_糖类抗原CA199',
 '血_总胆红素',
 '血_直接胆红素',
 '血_门冬氨酸氨基转移酶',
 '血_总胆汁酸',
 '血_γ-谷氨酰转肽酶',
 '血_前白蛋白',
 '血_乳酸脱氢酶',
 '血_间接胆红素',
 '尿_比重',
 '尿_酸碱度',
 '尿_葡萄糖',
 '血_癌胚抗原',
 '体重',
 '身高',
 '年龄',
 'BMI',
 '尿_白细胞',
 '尿_亚硝酸盐',
 '尿_酮体',
 '尿_胆红素',
 '粪_粪粘度',
 '粪_粪血液',
 '粪_粪红细胞',
 '粪_粪颜色',
 '粪_粪隐血',
 '粪_粪转铁蛋白']