In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
#Scaling
from sklearn.preprocessing import StandardScaler
#Spliting
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import TSNE

In [103]:
CervicalDF_original = pd.read_csv('risk_factors_cervical_cancer.csv')
CervicalDF_original.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [104]:
CervicalDF_original.replace("?", np.NaN, inplace=True)

In [105]:
CervicalDF_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 858 non-null    int64 
 1   Number of sexual partners           832 non-null    object
 2   First sexual intercourse            851 non-null    object
 3   Num of pregnancies                  802 non-null    object
 4   Smokes                              845 non-null    object
 5   Smokes (years)                      845 non-null    object
 6   Smokes (packs/year)                 845 non-null    object
 7   Hormonal Contraceptives             750 non-null    object
 8   Hormonal Contraceptives (years)     750 non-null    object
 9   IUD                                 741 non-null    object
 10  IUD (years)                         741 non-null    object
 11  STDs                                753 non-null    object

In [106]:
CervicalDF_original.describe()
CervicalDF_original.describe(include='all')

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
count,858.0,832.0,851.0,802.0,845.0,845.0,845.0,750.0,750.0,741.0,...,71.0,71.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0
unique,,12.0,21.0,11.0,2.0,30.0,62.0,2.0,40.0,2.0,...,18.0,18.0,,,,,,,,
top,,2.0,15.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,,,,,,,,
freq,,272.0,163.0,270.0,722.0,722.0,722.0,481.0,269.0,658.0,...,15.0,17.0,,,,,,,,
mean,26.820513,,,,,,,,,,...,,,0.020979,0.01049,0.020979,0.027972,0.040793,0.086247,0.051282,0.064103
std,8.497948,,,,,,,,,,...,,,0.143398,0.101939,0.143398,0.164989,0.197925,0.280892,0.220701,0.245078
min,13.0,,,,,,,,,,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,,,,,,,,,,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,,,,,,,,,,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32.0,,,,,,,,,,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
missing_data_count = CervicalDF_original.isnull().sum()
missing_data_count
# 找出有缺失值的列
missing_data_columns = missing_data_count[missing_data_count > 0]
#打印
print("Columns with missing data and the number of missing values:")
print(missing_data_columns)

Columns with missing data and the number of missing values:
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B

In [108]:
# 计算每一列的缺失值比例
missing_data_percentage = missing_data_count / len(CervicalDF_original)
# 删除缺失值超过50%的列
CervicalDF = CervicalDF_original.loc[:, missing_data_percentage <= 0.5]
CervicalDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 34 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 858 non-null    int64 
 1   Number of sexual partners           832 non-null    object
 2   First sexual intercourse            851 non-null    object
 3   Num of pregnancies                  802 non-null    object
 4   Smokes                              845 non-null    object
 5   Smokes (years)                      845 non-null    object
 6   Smokes (packs/year)                 845 non-null    object
 7   Hormonal Contraceptives             750 non-null    object
 8   Hormonal Contraceptives (years)     750 non-null    object
 9   IUD                                 741 non-null    object
 10  IUD (years)                         741 non-null    object
 11  STDs                                753 non-null    object

In [109]:
categorical_cols = [
    'Smokes', 'Hormonal Contraceptives', 'IUD', 'STDs', 'STDs:condylomatosis', 'STDs:cervical condylomatosis',
    'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
    'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'Dx:Cancer',
    'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology', 'Biopsy','STDs:vaginal condylomatosis'
]

numerical_cols = [
    'Age','Number of sexual partners','First sexual intercourse','Num of pregnancies','Smokes (years)','Smokes (packs/year)',
   'Hormonal Contraceptives (years)','IUD (years)','STDs (number)','STDs: Number of diagnosis'
]
# # 将这些列转换为category类型，注意如果某些列本来就是数值类型，先转为字符串再转换为category
# CervicalDF[categorical_cols] = CervicalDF[categorical_cols].apply(lambda col: col.astype(str).astype('object'))
# CervicalDF[numerical_cols] = CervicalDF[numerical_cols].apply(lambda col: col.astype(str).astype('float'))
# # 检查转换结果
print(CervicalDF.dtypes)
CervicalDF[categorical_cols].head()
CervicalDF[numerical_cols].isna().sum()
CervicalDF[categorical_cols].isna().sum()

Age                                   float64
Number of sexual partners             float64
First sexual intercourse              float64
Num of pregnancies                    float64
Smokes                                 object
Smokes (years)                        float64
Smokes (packs/year)                   float64
Hormonal Contraceptives                object
Hormonal Contraceptives (years)       float64
IUD                                    object
IUD (years)                           float64
STDs                                   object
STDs (number)                         float64
STDs:condylomatosis                    object
STDs:cervical condylomatosis           object
STDs:vaginal condylomatosis            object
STDs:vulvo-perineal condylomatosis     object
STDs:syphilis                          object
STDs:pelvic inflammatory disease       object
STDs:genital herpes                    object
STDs:molluscum contagiosum             object
STDs:AIDS                         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CervicalDF[categorical_cols] = CervicalDF[categorical_cols].apply(lambda col: col.astype(str).astype('object'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CervicalDF[numerical_cols] = CervicalDF[numerical_cols].apply(lambda col: col.astype(str).astype('float'))


Smokes                                0
Hormonal Contraceptives               0
IUD                                   0
STDs                                  0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0
Dx:Cancer                             0
Dx:CIN                                0
Dx:HPV                                0
Dx                                    0
Hinselmann                            0
Schiller                              0
Citology                              0
Biopsy                                0
STDs:vaginal condylomatosis           0
dtype: int64