In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Scaling
from sklearn.preprocessing import StandardScaler
#Spliting
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import KNNImputer

In [26]:
CervicalDF = pd.read_csv('kag_risk_factors_cervical_cancer.csv')
CervicalDF.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [27]:
CervicalDF.replace("?", np.nan, inplace=True)

In [34]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(CervicalDF)

# 2. 使用 KNN 填充缺失值
knn_imputer = KNNImputer(n_neighbors=3)  # 选择 K = 3，可以根据需要调整
X_imputed = knn_imputer.fit_transform(X_scaled)

# 3. 转换填充后的数据为 DataFrame 格式
CervicalDF_imputed = pd.DataFrame(X_imputed, columns=CervicalDF.columns)

# 查看填充后的数据
print("\n填充后的数据：")
CervicalDF_imputed


填充后的数据：


Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,-1.038563,0.883365,-0.712173,-0.881819,-0.412747,-0.298469,-0.203633,-1.337200,-0.599833,-0.355162,...,-0.024062,0.032041,-0.146385,-0.10296,-0.146385,-0.169638,-0.206222,-0.307226,-0.232495,-0.261712
1,-1.391796,-0.916536,-1.069098,-0.881819,-0.412747,-0.298469,-0.203633,-1.337200,-0.599833,-0.355162,...,-0.764377,-0.726252,-0.146385,-0.10296,-0.146385,-0.169638,-0.206222,-0.307226,-0.232495,-0.261712
2,0.845342,-0.916536,3.332979,-0.881819,-0.412747,-0.298469,-0.203633,-1.337200,-0.599833,-0.355162,...,-0.308799,-0.259610,-0.146385,-0.10296,-0.146385,-0.169638,-0.206222,-0.307226,-0.232495,-0.261712
3,2.964736,1.483332,-0.355247,1.192136,2.422792,8.755520,16.423399,0.747831,0.197669,-0.355162,...,0.488464,-0.492931,6.831301,-0.10296,6.831301,-0.169638,-0.206222,-0.307226,-0.232495,-0.261712
4,2.258271,0.283398,1.429378,1.192136,-0.412747,-0.298469,-0.203633,0.747831,3.387680,-0.355162,...,1.114884,1.198644,-0.146385,-0.10296,-0.146385,-0.169638,-0.206222,-0.307226,-0.232495,-0.261712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,0.845342,0.283398,0.358603,-1.573137,-0.412747,-0.298469,-0.203633,-1.337200,-0.599833,-0.355162,...,0.545411,0.615342,-0.146385,-0.10296,-0.146385,-0.169638,-0.206222,-0.307226,-0.232495,-0.261712
854,0.609854,-0.316569,0.715528,-0.881819,-0.412747,-0.298469,-0.203633,0.747831,1.526840,-0.355162,...,-0.251852,-0.201280,-0.146385,-0.10296,-0.146385,-0.169638,-0.206222,-0.307226,-0.232495,-0.261712
855,-0.214355,-0.316569,0.001678,-1.573137,-0.412747,-0.298469,-0.203633,0.747831,-0.578567,-0.355162,...,-0.707430,-0.667921,-0.146385,-0.10296,-0.146385,-0.169638,-0.206222,-0.307226,4.301163,-0.261712
856,0.727598,-0.316569,2.500154,-0.190500,-0.412747,-0.298469,-0.203633,0.747831,-0.578567,-0.355162,...,-0.308799,-0.259610,-0.146385,-0.10296,-0.146385,-0.169638,-0.206222,-0.307226,-0.232495,-0.261712


In [38]:
# 3. 去标准化：将填充后的数据从标准化数据转换回原始尺度
X_original = scaler.inverse_transform(X_imputed)

# 4. 转换填充后的数据为 DataFrame 格式
CervicalDF_imputed = pd.DataFrame(X_original, columns=CervicalDF.columns)

# 查看填充后的数据
print("\n去标准化后的数据：")
CervicalDF_imputed


去标准化后的数据：


Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18.0,4.0,15.000000,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,6.000000,6.000000,0.0,0.0,0.0,3.469447e-18,0.0,0.0,0.0,0.0
1,15.0,1.0,14.000000,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,1.666667,1.666667,0.0,0.0,0.0,3.469447e-18,0.0,0.0,0.0,0.0
2,34.0,1.0,26.333333,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,4.333333,4.333333,0.0,0.0,0.0,3.469447e-18,0.0,0.0,0.0,0.0
3,52.0,5.0,16.000000,4.0,1.0,37.0,37.0,1.0,3.00,0.0,...,9.000000,3.000000,1.0,0.0,1.0,3.469447e-18,0.0,0.0,0.0,0.0
4,46.0,3.0,21.000000,4.0,0.0,0.0,0.0,1.0,15.00,0.0,...,12.666667,12.666667,0.0,0.0,0.0,3.469447e-18,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,34.0,3.0,18.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,9.333333,9.333333,0.0,0.0,0.0,3.469447e-18,0.0,0.0,0.0,0.0
854,32.0,2.0,19.000000,1.0,0.0,0.0,0.0,1.0,8.00,0.0,...,4.666667,4.666667,0.0,0.0,0.0,3.469447e-18,0.0,0.0,0.0,0.0
855,25.0,2.0,17.000000,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,2.000000,2.000000,0.0,0.0,0.0,3.469447e-18,0.0,0.0,1.0,0.0
856,33.0,2.0,24.000000,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,4.333333,4.333333,0.0,0.0,0.0,3.469447e-18,0.0,0.0,0.0,0.0
