# 数据的预处理及划分

In [18]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

input_folder = "/Users/andywong/dataset/Dataset_BUSI_with_GT/"
data = []
labels = []

for category in ["benign", "malignant", "normal"]:
    category_input_path = os.path.join(input_folder, category)

    # 遍历类别文件夹中的图片
    for filename in os.listdir(category_input_path):
        file_path = os.path.join(category_input_path, filename)
        
        # 忽略包含 "_mask" 的图片
        if "_mask" not in filename:
            image = cv2.imread(file_path)
            
            if image is None: continue

            # 转为灰度图
            gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # 调整大小
            resized_image = cv2.resize(gray_image, (256, 256))

            # 将图像展平为一维向量
            flattened_image = resized_image.flatten()
            
            data.append(flattened_image)
            labels.append(category)

data = np.array(data)
labels = np.array(labels)

# 标签编码为数字
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# 模型的训练

In [19]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
# 选择分类模型：决策树，随机森林，AdaBoost
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

# 训练并评估每个模型
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # 输出分类报告
    print(f"Model: {model_name}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print("-" * 50)

Model: Decision Tree
              precision    recall  f1-score   support

      benign       0.53      0.55      0.54        84
   malignant       0.38      0.35      0.36        43
      normal       0.17      0.17      0.17        29

    accuracy                           0.42       156
   macro avg       0.36      0.36      0.36       156
weighted avg       0.42      0.42      0.42       156

--------------------------------------------------
Model: Random Forest
              precision    recall  f1-score   support

      benign       0.62      0.94      0.75        84
   malignant       0.74      0.40      0.52        43
      normal       0.80      0.14      0.24        29

    accuracy                           0.64       156
   macro avg       0.72      0.49      0.50       156
weighted avg       0.68      0.64      0.59       156

--------------------------------------------------




Model: AdaBoost
              precision    recall  f1-score   support

      benign       0.60      0.92      0.72        84
   malignant       0.88      0.33      0.47        43
      normal       0.55      0.21      0.30        29

    accuracy                           0.62       156
   macro avg       0.67      0.48      0.50       156
weighted avg       0.66      0.62      0.58       156

--------------------------------------------------


# 超参数调优

In [None]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report
from scipy.stats import randint

# 超参数搜索空间
param_dist = {
    "Decision Tree": {
        'max_depth': randint(1, 20),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': ['auto', 'sqrt', 'log2']
    },
    "Random Forest": {
        'n_estimators': randint(100, 500),
        'max_depth': randint(10, 50),
        'max_features': ['auto', 'sqrt', 'log2'],
        'class_weight': [None, 'balanced']
    },
    "AdaBoost": {
        'n_estimators': randint(50, 200),
        'learning_rate': [0.01, 0.1, 0.5, 1.0]
    }
}

for model_name, model in models.items():
    print(f"Hyperparameter tuning for {model_name}...")
    
    # 搜索最优参数
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist[model_name], 
                                       n_iter=50, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    print(f"Best Parameters for {model_name}: {random_search.best_params_}")
    print(f"Best Score for {model_name}: {random_search.best_score_}")

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    print(f"Classification report for {model_name}:\n{classification_report(y_test, y_pred)}")
    print("-" * 50)