In [1]:
import tarfile
import os
import pickle
from torchvision import datasets, transforms
import numpy as np

In [2]:
# Show the current working directory; the relative paths used below (e.g. '../data')
# are resolved against it.
# NOTE(review): the saved output of this cell exposes an absolute local path —
# consider clearing it before sharing the notebook.
os.getcwd()

'/Users/haoz/果壳课程/模式识别与机器学习/Pattern-recognition-and-machine-learning/project/src'

In [3]:
# Location of the CIFAR-10 archive, relative to the working directory.
# NOTE(review): in the cells visible here, dataset_path is only used by a commented-out
# extraction call; the data is actually loaded later through torchvision's own downloader.
dataset_dir = '../data'
dataset_name = 'cifar-10-python.tar.gz'
dataset_path = os.path.join(dataset_dir, dataset_name)

In [4]:
def extract_tar_gz(filename, extract_path=None):
    """Extract a gzip-compressed tar archive without letting members escape the target.

    Args:
        filename: Path to the ``.tar.gz`` archive.
        extract_path: Directory to extract into. When ``None``, the directory
            containing ``filename`` is used (the current directory if
            ``filename`` has no directory component).

    Raises:
        OSError: If the archive cannot be opened.
        tarfile.TarError: If the archive is malformed or a member would be
            written outside ``extract_path``.
    """
    # Default to the archive's own directory when no target is given.
    if extract_path is None:
        extract_path = os.path.dirname(filename) or os.curdir

    with tarfile.open(filename, 'r:gz') as tar:
        if hasattr(tarfile, 'data_filter'):
            # Interpreters with extraction filters: 'data' rejects absolute paths,
            # '..' traversal, links pointing outside the target and special files.
            tar.extractall(path=extract_path, filter='data')
        else:
            # Older interpreters: at least refuse members whose resolved path
            # falls outside the destination before extracting anything.
            dest_root = os.path.realpath(extract_path)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(dest_root, member.name))
                if os.path.commonpath([dest_root, target]) != dest_root:
                    raise tarfile.TarError(
                        'refusing to extract %r outside of %r' % (member.name, extract_path)
                    )
            tar.extractall(path=extract_path)


In [5]:
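# One-off step, left disabled: uncomment to unpack the manually downloaded archive into dataset_dir.
# extract_tar_gz(dataset_path, dataset_dir)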

In [6]:
# Directory the archive is expected to unpack into.
# NOTE(review): not referenced by any later cell visible here — confirm it is still needed.
extract_dataset_path = os.path.join(dataset_dir, 'cifar-10-batches-py')

In [7]:
# 准备数据集并预处理
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),  # 将图片转化成32*32的尺寸
    transforms.RandomHorizontalFlip(),  # 图像一半的概率翻转，一半的概率不翻转
    transforms.ToTensor(),  # 将图片转换成形状为(C,H,W)的 Tensor 格式，且/255 归一化到[0,1.0]之间
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),  # R,G,B每层的归一化用到的均值和方差
])

In [8]:
# Test-time preprocessing: no random augmentation, only tensor conversion followed by
# normalization with the same per-channel mean / standard deviation values.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

In [9]:
# Directory handed to torchvision as the dataset root.
download_path = os.path.join(dataset_dir, 'download')
# exist_ok=True creates the directory (and any missing parents) without failing when it
# is already there, avoiding the race between a separate existence check and creation.
os.makedirs(download_path, exist_ok=True)

In [10]:
# Load the CIFAR-10 train and test splits, downloading them into download_path if needed.
# NOTE(review): later cells read `.data` / `.targets` directly instead of indexing the
# datasets, so transform_train / transform_test are presumably never applied to the
# exported features — confirm this is intended.
trainset = datasets.CIFAR10(root=download_path, train=True,download=True, transform=transform_train)
testset = datasets.CIFAR10(root=download_path,train=False,download=True,transform=transform_test)
# Human-readable class names; assumed to follow the dataset's label order 0-9 — TODO confirm.
classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [11]:
# Pull features and labels out of the training split.
# Image data is cast to float64; labels become a flat 1-D array.
X_train = trainset.data.astype(np.float64)
y_train = np.array(trainset.targets).ravel()

In [12]:
# Pull features and labels out of the test split.
# Image data is cast to float64; labels become a flat 1-D array.
X_test = testset.data.astype(np.float64)
y_test = np.array(testset.targets).ravel()

In [13]:
# Flatten each sample into one row: keep the first axis, collapse all remaining axes.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

In [14]:
# Center the features: compute the per-feature mean over the training rows only and
# subtract that same training mean from both splits.
# NOTE(review): the subtraction is in place, and this cell must run before the ones
# column is appended below — otherwise that column would be centered to zero in X_train.
mean_image = np.mean(X_train, axis=0)

X_train -= mean_image
X_test -= mean_image

In [15]:
# Append a constant column of ones to each feature matrix (presumably so a linear
# model's bias term can be folded into its weights — confirm against the consumer).
# Note: re-running this cell appends one more column every time.
X_train = np.concatenate((X_train, np.ones((X_train.shape[0], 1))), axis=1)
X_test = np.concatenate((X_test, np.ones((X_test.shape[0], 1))), axis=1)

In [16]:
# Sanity check: shape of the training matrix after flattening and appending the ones column.
X_train.shape

(50000, 3073)

In [17]:
# Sanity check: one label per training row.
y_train.shape

(50000,)

In [18]:
# Sanity check: shape of the test matrix after flattening and appending the ones column.
X_test.shape

(10000, 3073)

In [19]:
# Sanity check: one label per test row.
y_test.shape

(10000,)

In [20]:
# Output locations (inside dataset_dir) for the pickled train and test arrays.
train_pkl_path = os.path.join(dataset_dir, 'train.pkl')
test_pkl_path = os.path.join(dataset_dir, 'test.pkl')

In [21]:
# Serialize the training features and labels together as one (X, y) tuple.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
with open(train_pkl_path, 'wb') as f:
    pickle.dump((X_train, y_train), f)

In [22]:
# Serialize the test features and labels together as one (X, y) tuple.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
with open(test_pkl_path, 'wb') as f:
    pickle.dump((X_test, y_test), f)