In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import os
import shutil
from PIL import Image
import numpy as np
from sklearn.utils import resample


# Dataset locations: the label CSV, the folder holding the source JPEGs, and
# the folder that will receive the balanced images.
labels_file_path = './diabetic-retinopathy-detection/trainLabels.csv'
images_dir = './diabetic-retinopathy-detection/train'
output_dir = './diabetic-retinopathy-detection/smote_images'

# Read the label table and report how many images each DR level has.
data = pd.read_csv(labels_file_path)
level_counts = data['level'].value_counts()
print("原始类别分布：")
print(level_counts)

# Keep the labels and build one JPEG path per row of the label table.
labels = data['level']
image_paths = []
for image_id in data['image']:
    image_paths.append(os.path.join(images_dir, f"{image_id}.jpeg"))
# 将图像加载为特征
def load_images_as_features(image_paths, size=(256, 256)):
    """Load images from disk and flatten each one into a 1-D pixel vector.

    Every image is converted to RGB and resized to ``size`` so that all rows
    have exactly ``size[0] * size[1] * 3`` values. Without the RGB conversion
    a grayscale, RGBA or CMYK file would produce a row of a different length,
    which breaks both the stacking below and the later reshape of rows back
    to ``(256, 256, 3)`` images.

    Args:
        image_paths: Iterable of paths to image files that PIL can open.
        size: ``(width, height)`` passed to ``Image.resize``. Defaults to
            ``(256, 256)``, the size used by the rest of this notebook.

    Returns:
        A uint8 numpy array of shape ``(n_images, size[0] * size[1] * 3)``.
        For an empty ``image_paths`` the array has zero rows but keeps the
        same number of columns.
    """
    features = []
    for path in image_paths:
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied out.
        with Image.open(path) as img:
            resized = img.convert('RGB').resize(size)
            features.append(np.asarray(resized, dtype=np.uint8).ravel())

    if not features:
        return np.empty((0, size[0] * size[1] * 3), dtype=np.uint8)
    return np.stack(features)

# Load every image listed in the label file into one in-memory matrix, one
# flattened 256x256 image per row.
# NOTE(review): the label file lists 35,126 images (see the printed class
# distribution); assuming 3-channel uint8 pixels, that is about 6.9 GB of RAM.
features = load_images_as_features(image_paths)

# 手动下采样多数类至 2324
def downsample_majority_classes(features, labels, target_count=2324):
    """Randomly cap every class at ``target_count`` samples.

    Classes with at most ``target_count`` rows are kept unchanged; larger
    classes are subsampled without replacement using ``random_state=42``.
    The result is ordered by ascending label, as ``groupby`` yields it.

    Sampling is done on a label-only frame and the chosen row positions are
    then used to index ``features`` directly. This selects the same rows as
    sampling a full feature DataFrame would, without copying the whole pixel
    matrix into pandas. ``labels`` is used positionally, so a Series whose
    index is not ``0..n-1`` is no longer misaligned against the features.

    Args:
        features: Array-like of shape ``(n_samples, n_features)``.
        labels: Array-like of ``n_samples`` class labels, in the same order
            as the rows of ``features``.
        target_count: Maximum number of rows to keep per class.

    Returns:
        ``(features_kept, labels_kept)`` as numpy arrays with matching length.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """
    features = np.asarray(features)
    label_values = np.asarray(labels)
    if len(features) != len(label_values):
        raise ValueError(
            f"features has {len(features)} rows but labels has {len(label_values)} entries"
        )

    # Default RangeIndex: index values are the row positions in `features`.
    label_frame = pd.DataFrame({'label': label_values})

    kept_positions = []
    for _, group in label_frame.groupby('label'):
        if len(group) > target_count:
            group = group.sample(target_count, random_state=42)
        kept_positions.append(group.index.to_numpy())

    if not kept_positions:
        # No rows (or no usable labels): return empty arrays of matching type.
        return features[:0], label_values[:0]

    positions = np.concatenate(kept_positions)
    return features[positions], label_values[positions]

# Every DR level should end up with exactly this many samples: larger classes
# are downsampled to it, smaller ones are oversampled to it with SMOTE.
target_per_class = 2324

features_downsampled, labels_downsampled = downsample_majority_classes(
    features, labels, target_count=target_per_class
)

# Balance the classes with SMOTE.
# The pixel features are cast to float32 first so that SMOTE's interpolation
# between neighbours is done in floating point; arithmetic on uint8 arrays
# wraps around modulo 256 and would corrupt the synthetic pixels.
# NOTE(review): float32 uses four times the memory of the uint8 matrix.
smote = SMOTE(
    sampling_strategy={level: target_per_class for level in range(5)},
    random_state=42,
)
features_resampled, labels_resampled = smote.fit_resample(
    features_downsampled.astype(np.float32), labels_downsampled
)

print("平衡后的类别分布：")
print(pd.Series(labels_resampled).value_counts())

# Save the balanced images to the output folder.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_dir, exist_ok=True)

# One shared list of names, so image files and label rows cannot drift apart.
image_names = [f"balanced_{i}" for i in range(len(features_resampled))]

for image_name, feature in zip(image_names, features_resampled):
    # Round and clip before casting so values stay inside the valid 0..255
    # range instead of being truncated or wrapped by astype('uint8').
    pixels = np.clip(np.rint(feature), 0, 255).astype('uint8')
    img = Image.fromarray(pixels.reshape((256, 256, 3)))
    img.save(os.path.join(output_dir, f"{image_name}.jpeg"))

# Label table for the balanced set: image name (without extension) -> level.
smote_labels = pd.DataFrame({
    'image': image_names,
    'level': labels_resampled
})

# Write the label table to a new CSV file.
smote_labels_file_path = './diabetic-retinopathy-detection/smote_labels.csv'
smote_labels.to_csv(smote_labels_file_path, index=False)

print(f"SMOTE 标签保存到 {smote_labels_file_path}")

print(f"平衡后的图像已保存到 {output_dir}")

原始类别分布：
level
0    25810
2     5292
1     2443
3      873
4      708
Name: count, dtype: int64
平衡后的类别分布：
0    2324
1    2324
2    2324
3    2324
4    2324
Name: count, dtype: int64
SMOTE 标签保存到 ./diabetic-retinopathy-detection/smote_labels.csv
平衡后的图像已保存到 ./diabetic-retinopathy-detection/smote_images


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import os
from PIL import Image
import numpy as np

# Dataset locations: the label CSV, the folder holding the source JPEGs, and
# the folder that will receive the balanced images.
labels_file_path = './diabetic-retinopathy-detection/trainLabels.csv'
images_dir = './diabetic-retinopathy-detection/train'
output_dir = './diabetic-retinopathy-detection/smote_images'

# Read the label table and report how many images each DR level has.
data = pd.read_csv(labels_file_path)
level_counts = data['level'].value_counts()
print("原始类别分布：")
print(level_counts)

# Keep the labels and build one JPEG path per row of the label table.
labels = data['level']
image_paths = []
for image_id in data['image']:
    image_paths.append(os.path.join(images_dir, f"{image_id}.jpeg"))
# 将图像加载为特征
def load_images_as_features(image_paths, size=(256, 256)):
    """Load images from disk and flatten each one into a 1-D pixel vector.

    Every image is converted to RGB and resized to ``size`` so that all rows
    have exactly ``size[0] * size[1] * 3`` values. Without the RGB conversion
    a grayscale, RGBA or CMYK file would produce a row of a different length,
    which breaks both the stacking below and the later reshape of rows back
    to ``(256, 256, 3)`` images.

    Args:
        image_paths: Iterable of paths to image files that PIL can open.
        size: ``(width, height)`` passed to ``Image.resize``. Defaults to
            ``(256, 256)``, the size used by the rest of this notebook.

    Returns:
        A uint8 numpy array of shape ``(n_images, size[0] * size[1] * 3)``.
        For an empty ``image_paths`` the array has zero rows but keeps the
        same number of columns.
    """
    features = []
    for path in image_paths:
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied out.
        with Image.open(path) as img:
            resized = img.convert('RGB').resize(size)
            features.append(np.asarray(resized, dtype=np.uint8).ravel())

    if not features:
        return np.empty((0, size[0] * size[1] * 3), dtype=np.uint8)
    return np.stack(features)

# Load every image listed in the label file into one in-memory matrix, one
# flattened 256x256 image per row.
# NOTE(review): the label file lists 35,126 images (see the printed class
# distribution); assuming 3-channel uint8 pixels, that is about 6.9 GB of RAM.
features = load_images_as_features(image_paths)

# Manually downsample the majority classes to at most 1000 samples each (the default target_count below)
def downsample_majority_classes(features, labels, target_count=1000):
    """Randomly cap every class at ``target_count`` samples.

    Classes with at most ``target_count`` rows are kept unchanged; larger
    classes are subsampled without replacement using ``random_state=42``.
    The result is ordered by ascending label, as ``groupby`` yields it.

    Sampling is done on a label-only frame and the chosen row positions are
    then used to index ``features`` directly. This selects the same rows as
    sampling a full feature DataFrame would, without copying the whole pixel
    matrix into pandas. ``labels`` is used positionally, so a Series whose
    index is not ``0..n-1`` is no longer misaligned against the features.

    Args:
        features: Array-like of shape ``(n_samples, n_features)``.
        labels: Array-like of ``n_samples`` class labels, in the same order
            as the rows of ``features``.
        target_count: Maximum number of rows to keep per class.

    Returns:
        ``(features_kept, labels_kept)`` as numpy arrays with matching length.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """
    features = np.asarray(features)
    label_values = np.asarray(labels)
    if len(features) != len(label_values):
        raise ValueError(
            f"features has {len(features)} rows but labels has {len(label_values)} entries"
        )

    # Default RangeIndex: index values are the row positions in `features`.
    label_frame = pd.DataFrame({'label': label_values})

    kept_positions = []
    for _, group in label_frame.groupby('label'):
        if len(group) > target_count:
            group = group.sample(target_count, random_state=42)
        kept_positions.append(group.index.to_numpy())

    if not kept_positions:
        # No rows (or no usable labels): return empty arrays of matching type.
        return features[:0], label_values[:0]

    positions = np.concatenate(kept_positions)
    return features[positions], label_values[positions]

# Every DR level should end up with exactly this many samples: larger classes
# are downsampled to it, smaller ones are oversampled to it with SMOTE.
target_per_class = 1000

features_downsampled, labels_downsampled = downsample_majority_classes(
    features, labels, target_count=target_per_class
)

# Reduce dimensionality with PCA, keeping the first 50 principal components.
# random_state is fixed so that a randomized SVD solver, if scikit-learn
# selects one for data of this size, gives the same projection on every run.
pca = PCA(n_components=50, random_state=42)
features_reduced = pca.fit_transform(features_downsampled)

# Balance the classes with SMOTE in the reduced space.
smote = SMOTE(
    sampling_strategy={level: target_per_class for level in range(5)},
    random_state=42,
)
features_resampled, labels_resampled = smote.fit_resample(features_reduced, labels_downsampled)

# Map the samples back to pixel space. Every row, including the real
# images, is now a reconstruction from 50 components.
features_restored = pca.inverse_transform(features_resampled)

# Check the class distribution after balancing.
print("平衡后的类别分布：")
print(pd.Series(labels_resampled).value_counts())

# Save the balanced images to the output folder.
# NOTE(review): output_dir and the label CSV are the same ones the earlier
# balancing cell writes to, so files from a previous run may remain here.
os.makedirs(output_dir, exist_ok=True)

# Rows were regrouped by label, subsampled and extended with synthetic
# samples, so row i is no longer the i-th image of the original label file.
# Naming outputs after data['image'].iloc[i] would pair file names with the
# wrong pixels and labels, so index-based names are used instead.
image_names = [f"balanced_{i}" for i in range(len(features_restored))]

for image_name, feature in zip(image_names, features_restored):
    # PCA reconstructions can fall outside 0..255; round and clip so that
    # astype('uint8') does not wrap them around.
    pixels = np.clip(np.rint(feature), 0, 255).astype('uint8')
    img = Image.fromarray(pixels.reshape((256, 256, 3)))
    img.save(os.path.join(output_dir, f"{image_name}.jpeg"))

# Label table for the balanced set: image name (without extension) -> level.
smote_labels = pd.DataFrame({
    'image': image_names,
    'level': labels_resampled
})

# Write the label table to a new CSV file.
smote_labels_file_path = './diabetic-retinopathy-detection/smote_labels.csv'
smote_labels.to_csv(smote_labels_file_path, index=False)

print(f"SMOTE 标签保存到 {smote_labels_file_path}")
print(f"平衡后的图像已保存到 {output_dir}")

原始类别分布：
level
0    25810
2     5292
1     2443
3      873
4      708
Name: count, dtype: int64
平衡后的类别分布：
0    1000
1    1000
2    1000
3    1000
4    1000
Name: count, dtype: int64
SMOTE 标签保存到 ./diabetic-retinopathy-detection/smote_labels.csv
平衡后的图像已保存到 ./diabetic-retinopathy-detection/smote_images


In [2]:
import pandas as pd

# Label table to inspect.
file_path = 'balanced_labels_with_sbs_smote.csv'

# Read the table and count how many rows fall into each DR grade.
data = pd.read_csv(file_path)
dr_grade_counts = data.loc[:, 'level'].value_counts()

# Last expression of the cell, so the notebook renders the counts.
dr_grade_counts

level
3    1503
4    1503
0    1500
1    1500
2    1500
Name: count, dtype: int64

In [1]:
import pandas as pd

# Label table to inspect.
# NOTE(review): an earlier cell runs this same code on the same path, and the
# two recorded outputs differ (about 1500 per grade versus 2324 per grade),
# which suggests the CSV was regenerated between the two executions.
file_path = 'balanced_labels_with_sbs_smote.csv'

# Read the table and count how many rows fall into each DR grade.
data = pd.read_csv(file_path)
dr_grade_counts = data.loc[:, 'level'].value_counts()

# Last expression of the cell, so the notebook renders the counts.
dr_grade_counts

level
0    2324
1    2324
2    2324
3    2324
4    2324
Name: count, dtype: int64

In [7]:
import pandas as pd
import numpy as np
import os
import tensorflow
from tensorflow.keras.applications import ResNet50, VGG16,EfficientNetB3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load EfficientNetB3 with ImageNet weights and without its classification
# head, for 256x256 RGB input.
base_model = EfficientNetB3(
    include_top=False,
    weights='imagenet',
    input_shape=(256, 256, 3),
)

# Freeze the first 15 layers of the backbone; all later layers stay trainable.
# NOTE(review): the earlier comments described this as freezing a VGG16 up to
# "Block4". In this model the first layers are input, rescaling and
# normalization layers (see the model summary), so the slice presumably
# covers far less than four convolutional blocks. Confirm the intended cut-off.
for frozen_layer in base_model.layers[:15]:
    frozen_layer.trainable = False

# New classification head, applied in order on top of the backbone output.
head_layers = [
    GlobalAveragePooling2D(),        # global average pooling
    Dense(1000, activation='relu'),  # first fully connected layer, 1000 units
    Dropout(0.5),                    # dropout against overfitting
    Dense(100, activation='relu'),   # second fully connected layer, 100 units
    # Output layer: 5 classes (No DR, Mild DR, Moderate DR, Severe DR,
    # Proliferative DR).
    Dense(5, activation='softmax'),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)

# Assemble the full model from the backbone input to the new softmax output.
model = Model(inputs=base_model.input, outputs=x)

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb3_notop.h5


In [8]:
# Print the layer-by-layer structure and parameter counts of the assembled model.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 256, 256, 3  0           []                               
                                )]                                                                
                                                                                                  
 rescaling (Rescaling)          (None, 256, 256, 3)  0           ['input_3[0][0]']                
                                                                                                  
 normalization (Normalization)  (None, 256, 256, 3)  7           ['rescaling[0][0]']              
                                                                                                  
 rescaling_1 (Rescaling)        (None, 256, 256, 3)  0           ['normalization[0][0]']    