In [3]:
import pandas as pd

# Load the breast-cancer dataset. Column names are supplied explicitly via
# names=..., so the first line of the CSV is read as data, not as a header.
column_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
                'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
                'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

# '?' marks a missing value in this file. Declaring it through na_values makes
# pandas parse the affected column as numbers at read time instead of leaving
# it as strings that downstream code would have to coerce implicitly.
data = (
    pd.read_csv('data/breast-cancer-wisconsin.csv', names=column_names, na_values='?')
    .dropna()  # drop every row that contains at least one missing value
    # A column that contained NaN is parsed as float64; once the incomplete rows
    # are gone it can be stored as integers again.
    # NOTE(review): assumes 'Bare Nuclei' only holds whole numbers - confirm against the data.
    .astype({'Bare Nuclei': 'int64'})
)

# Preview the first rows, then the per-column dtypes and non-null counts.
print("数据集预览:")
print(data.head())
# info() writes its report itself and returns None, so it is called directly
# rather than wrapped in print() (which would emit an extra "None" line).
data.info()

数据集预览:
   Sample code number  Clump Thickness  Uniformity of Cell Size  \
0             1000025                5                        1   
1             1002945                5                        4   
2             1015425                3                        1   
3             1016277                6                        8   
4             1017023                4                        1   

   Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                         1                  1                            2   
1                         4                  5                            7   
2                         1                  1                            2   
3                         8                  1                            3   
4                         1                  3                            2   

  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  Class  
0           1                3                1  

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Label vector is the 'Class' column; the feature matrix is everything except
# the sample identifier and the label itself.
y = data['Class']
X = data.drop(columns=['Sample code number', 'Class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

# Fit a logistic-regression classifier on the scaled training data.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Predict the held-out split and report accuracy plus the per-class metrics
# (y_test holds the true labels, y_pred the predicted ones).
y_pred = model.predict(X_test)

print("模型准确率:", accuracy_score(y_test, y_pred))
print("\n分类报告:")
print(classification_report(y_test, y_pred))


模型准确率: 0.9562043795620438

分类报告:
              precision    recall  f1-score   support

           2       0.94      0.99      0.96        79
           4       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137



In [8]:
# Compare accuracy on the training split against the test split, using the
# already-fitted model and the test predictions computed earlier (y_pred).
train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("训练集准确率:", train_accuracy)
print("测试集准确率:", test_accuracy)

# Detailed per-class reports for both splits; sep="\n" places each heading on
# its own line directly above its report.
print("\n训练集分类报告:", classification_report(y_train, train_pred), sep="\n")
print("\n测试集分类报告:", classification_report(y_test, y_pred), sep="\n")


训练集准确率: 0.9706959706959707
测试集准确率: 0.9562043795620438

训练集分类报告:
              precision    recall  f1-score   support

           2       0.98      0.98      0.98       365
           4       0.96      0.96      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546


测试集分类报告:
              precision    recall  f1-score   support

           2       0.94      0.99      0.96        79
           4       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137

