# -----------------------对数据集的预处理--------------------------------------------

- ### 将数据集划分为训练集和验证集
- ### 从标注文件中提取图片及其对应的标签并做成一个csv文件
- ### 提取标签的真实名称与索引的对应关系，用于预测

In [43]:
from scipy import io
import pandas as pd
import os
from PIL import Image
import pickle
import re

import numpy as np
import matplotlib.pyplot as plt

# 方法一：使用默认数据集的处理方法，只用准备固定的文件格式就行

In [34]:
"""打开标注文件"""
# Read the MATLAB annotation file that describes the training split.
dog_train = io.loadmat(file_name='data/lists/train_list.mat')
# 'file_list' carries the names of all training samples.
train_images = dog_train['file_list']

In [88]:
"""将标签文件中的图片名称提取出来"""
file_paths = [img_path[0][0] for img_path in train_images] # list：['n02085782-Japanese_spaniel/n02085782_4574.jpg',...] 
# 提取准备创建文件夹的名字
path_names = [path.split('/')[0].split('-')[-1] for path in file_paths] # list: ['Chihuahua', 'Chihuahua'...]

In [89]:
"""将图片文件重新缩放并放在新生成的文件夹里"""
# 先定义一个大小调整的函数
def resize_image(image, size):
    return image.resize(size, Image.Resampling.LANCZOS)

In [90]:
# 设置图片的输入文件地址和保存文件地址
input_path = 'data/Stanford Dogs Dataset_datasets/Stanford Dogs Dataset_images_datasets/Images/'
output_train_path = 'data/resize_1/train/'

# 检查文件是否已经存在
for name in path_names:
    output_name = os.path.join(output_train_path, name)
    if not os.path.exists(output_name):
        os.makedirs(output_name)

In [91]:
# 将原始图片重新调整大小后保存到新的文佳加中
for names in file_paths:
    image_path = os.path.join(input_path, names)
    path_name = names.split('/')[0].split('-')[-1] 
    image_name = names.split('/')[-1]
    output_image_file = os.path.join(output_train_path, path_name)
    with open(image_path, 'rb') as f:
        with Image.open(f) as img:
            if img.mode == 'RGBA':
                img = img.convert('RGB')
            resized_img = resize_image(img, (224, 224))

    # 保存调整大小后的图像
    resized_img.save(os.path.join(output_image_file, image_name), resized_img.format)

In [93]:
"""同样的将测试集也准备好"""
dog_test = io.loadmat('data/lists/test_list.mat') # 训练的标注文件 
test_images = dog_test['file_list']  # 所有训练样本的名称
file_pathss = [img_path[0][0] for img_path in test_images] # list：['n02085782-Japanese_spaniel/n02085782_4574.jpg',...] 
# 提取准备创建文件夹的名字
path_names = [path.split('/')[0].split('-')[-1] for path in file_pathss] # list: ['Chihuahua', 'Chihuahua'...]

input_path = 'data/Stanford Dogs Dataset_datasets/Stanford Dogs Dataset_images_datasets/Images/'
output_train_path = 'data/resize_1/test/'

# 检查文件是否已经存在
for name in path_names:
    output_name = os.path.join(output_train_path, name)
    if not os.path.exists(output_name):
        os.makedirs(output_name)
    
# 将原始图片重新调整大小后保存到新的文佳加中
for names in file_pathss:
    image_path = os.path.join(input_path, names)
    path_name = names.split('/')[0].split('-')[-1] 
    image_name = names.split('/')[-1]
    output_image_file = os.path.join(output_train_path, path_name)
    with open(image_path, 'rb') as f:
        with Image.open(f) as img:
            if img.mode == 'RGBA':
                img = img.convert('RGB')
            resized_img = resize_image(img, (224, 224))

    # 保存调整大小后的图像
    resized_img.save(os.path.join(output_image_file, image_name), resized_img.format)

# 方法二：使用自定义数据集的处理方法，将所有图片全部放在一个文件夹里面
- 通过自定义数据集的方式进行训练


In [None]:
# Matches the last path component (everything after the final '/').
last_component = re.compile(r'([^\/]+)$')

# ---- training split ----
dog_train = io.loadmat('data/lists/train_list.mat')
train_images, train_tar = dog_train['file_list'], dog_train['labels']

# Pair every image file name (directory part stripped) with its label.
train_image = [last_component.search(entry[0][0]).group(1) for entry in train_images]
train_target = [label_row[0] for label_row in train_tar]
train_dict = dict(zip(train_image, train_target))

# ---- test split ----
dog_test = io.loadmat('data/lists/test_list.mat')
test_images, test_tar = dog_test['file_list'], dog_test['labels']

# Same pairing for the test annotations.
test_image = [last_component.search(entry[0][0]).group(1) for entry in test_images]
test_target = [label_row[0] for label_row in test_tar]
test_dict = dict(zip(test_image, test_target))

# Persist both mappings so later steps can load them without the .mat files.
for pickle_name, mapping in (('train_dict.pickle', train_dict),
                             ('test_dict.pickle', test_dict)):
    with open(pickle_name, 'wb') as f:
        pickle.dump(mapping, f)

In [None]:
# Resize helper used by the export step that follows.
def resize_image(image, size):
    """Return ``image`` scaled to ``size`` using the LANCZOS filter."""
    resample_filter = Image.Resampling.LANCZOS
    return image.resize(size, resample_filter)

# Resize every listed image into one flat folder per split.
# This only needs to be executed once.
def _index_image_files(root):
    """Map each file name found anywhere under ``root`` to its full path.

    The dictionaries consumed below are keyed by bare file names, while the
    source images may sit in per-breed sub-folders, so joining ``root`` and
    the key directly is not guaranteed to hit an existing file. Walking the
    tree once resolves both a nested and an already flattened layout.
    ``os.walk`` visits ``root`` itself first, so a file located directly in
    ``root`` takes precedence over a same-named file further down.
    """
    index = {}
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            index.setdefault(filename, os.path.join(dirpath, filename))
    return index


def _export_resized(name_to_label, source_index, target_dir, size=(224, 224)):
    """Write a resized copy of every image named in ``name_to_label``.

    Args:
        name_to_label: mapping whose keys are image file names (any leading
            directory part is ignored); the label values are not used here.
        source_index: file name -> source path, see ``_index_image_files``.
        target_dir: output folder; created if it does not exist yet.
        size: target size handed to ``resize_image``.

    Raises:
        FileNotFoundError: if a listed image is missing from ``source_index``.
    """
    os.makedirs(target_dir, exist_ok=True)
    for key in name_to_label:
        file_name = key.rsplit('/', 1)[-1]
        source_path = source_index.get(file_name)
        if source_path is None:
            raise FileNotFoundError(
                f'{file_name} was not found under the source image folder')
        # Image.open opens the file read-only; the sources are never modified.
        with Image.open(source_path) as img:
            # Drop the alpha channel to avoid a format error on save.
            if img.mode == 'RGBA':
                img = img.convert('RGB')
            resized = resize_image(img, size)
        # The resized copy has no ``format`` of its own (it is None), so the
        # output encoder is chosen from the file extension.
        resized.save(os.path.join(target_dir, file_name))


_source_index = _index_image_files(
    'data/Stanford Dogs Dataset_datasets/Stanford Dogs Dataset_images_datasets/Images')

# Training split
_export_resized(train_dict, _source_index, 'data/resize/train/')

# Test split
_export_resized(test_dict, _source_index, 'data/resize/test/')
        
