## General Preparation
import libs

load training data

In [2]:
import sys
import os
import random
import numpy as np
import matplotlib.pyplot as plt

sources_dir = os.path.abspath('sources')
sys.path.insert(0, sources_dir)

# 现在直接导入data_utils
import data_utils


# 加载数据函数
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000, num_dev=500):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing
    """
    # 关键步骤3: 使用data_utils模块加载数据集
    cifar10_dir = 'sources/datasets/cifar-10-batches-py'
    
    try:
        del X_train, y_train
        del X_test, y_test
        print('Clear previously loaded data.')
    except:
        pass

    # 关键步骤4: 通过data_utils模块调用load_CIFAR10函数
    X_train, y_train, X_test, y_test = data_utils.load_CIFAR10(cifar10_dir)
    
    # 其余的预处理代码保持不变...
    mask = list(range(num_training, num_training + num_validation))
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]
    mask = np.random.choice(num_training, num_dev, replace=False)
    X_dev = X_train[mask]
    y_dev = y_train[mask]
    
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))
    
    mean_image = np.mean(X_train, axis = 0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    X_dev -= mean_image
    
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
    X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])
    
    return X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev

# 关键步骤5: 运行并测试导入是否成功
print("尝试加载CIFAR-10数据集...")
try:
    # 测试加载一小部分数据
    X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = get_CIFAR10_data()
    print('成功加载数据集!')
    print('训练数据形状:', X_train.shape)
    print('训练标签形状:', y_train.shape)
    
    # 可视化一个样本作为测试
    plt.figure()
    plt.imshow(X_train[0].reshape(32, 32, 3).astype('uint8'))
    plt.title(f'标签: {y_train[0]}')
    plt.show()
    
except Exception as e:
    print(f"加载数据集时出错: {str(e)}")
    # 添加额外调试信息
    print("检查 sources 目录是否存在:", os.path.exists('sources'))
    print("检查 data_utils.py 是否存在:", os.path.exists('sources/data_utils.py'))
    print("检查数据集路径是否存在:", os.path.exists('sources/datasets/cifar-10-batches-py'))


X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = get_CIFAR10_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('dev data shape: ', X_dev.shape)
print('dev labels shape: ', y_dev.shape)

尝试加载CIFAR-10数据集...
成功加载数据集!
训练数据形状: (49000, 3073)
训练标签形状: (49000,)
加载数据集时出错: cannot reshape array of size 3073 into shape (32,32,3)
检查 sources 目录是否存在: True
检查 data_utils.py 是否存在: True
检查数据集路径是否存在: True
Train data shape:  (49000, 3073)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3073)
Validation labels shape:  (1000,)
Test data shape:  (1000, 3073)
Test labels shape:  (1000,)
dev data shape:  (500, 3073)
dev labels shape:  (500,)


<Figure size 640x480 with 0 Axes>

## Softmax Loss Implementation


In [4]:
# Complete the implementation of softmax_loss_naive and implement a (naive)
# version of the gradient that uses nested loops.
from builtins import range
import numpy as np
from random import shuffle
#from past.builtins import xrange
import time
X_dev = np.random.randn(100, 3073)  # 100个样本，每个3073维
y_dev = np.random.randint(0, 10, size=100)  # 100个标签，0-9
def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (with loops)

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength, which contributes to the loss by a
      regularization term: 0.5 * reg * np.sum(W * W)

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)

    num_classes = W.shape[1]
    num_train = X.shape[0]

    for i in range(num_train):
        scores = X[i].dot(W)
        scores -= np.max(scores)  # 数值稳定性
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores)

        # 计算损失
        correct_class_prob = probs[y[i]]
        loss += -np.log(correct_class_prob)

        # 计算梯度
        for j in range(num_classes):
            if j == y[i]:
                dW[:, j] += (probs[j] - 1) * X[i]
            else:
                dW[:, j] += probs[j] * X[i]

    # 平均损失和梯度
    loss /= num_train
    dW /= num_train

    # 添加正则化项
    loss += 0.5 * reg * np.sum(W * W)
    dW += reg * W

    return loss, dW

for _ in range(10):
  # Generate a random softmax weight matrix and use it to compute the loss.
  W = np.random.randn(3073, 10) * 0.0001
  loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)
  print('loss: %f' % loss)
  
print('the correct loss should be close to: %f' % (-np.log(0.1)))# (why?)

loss: 2.302949
loss: 2.302601
loss: 2.302797
loss: 2.303440
loss: 2.302851
loss: 2.302531
loss: 2.303054
loss: 2.301833
loss: 2.302281
loss: 2.301831
the correct loss should be close to: 2.302585


In [5]:
import numpy as np
from random import randrange

# 添加数值梯度检查实现
def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5):
    """
    Sample a few random elements and only return numerical
    gradient in these dimensions.
    """
    for i in range(num_checks):
        ix = tuple([randrange(m) for m in x.shape])
        
        oldval = x[ix]
        x[ix] = oldval + h # increment by h
        fxph = f(x) # evaluate f(x + h)
        x[ix] = oldval - h # decrement by h
        fxmh = f(x) # evaluate f(x - h)
        x[ix] = oldval # reset
        
        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]
        rel_error = abs(grad_numerical - grad_analytic) / (abs(grad_numerical) + abs(grad_analytic))
        print('numerical: %f analytic: %f, relative error: %e' % (grad_numerical, grad_analytic, rel_error))

# 假设 W, X_dev, y_dev 已经定义
loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)

# 梯度检查
f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad, 10)

# 带正则化的梯度检查
loss, grad = softmax_loss_naive(W, X_dev, y_dev, 5e1)
f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 5e1)[0]
grad_numerical = grad_check_sparse(f, W, grad, 10)

numerical: 0.008327 analytic: 0.008327, relative error: 5.768620e-10
numerical: -0.004247 analytic: -0.004247, relative error: 1.060834e-08
numerical: 0.017886 analytic: 0.017886, relative error: 2.319491e-09
numerical: -0.006369 analytic: -0.006369, relative error: 1.461090e-09
numerical: 0.019976 analytic: 0.019976, relative error: 1.345925e-09
numerical: -0.005925 analytic: -0.005925, relative error: 2.457122e-09
numerical: 0.017346 analytic: 0.017346, relative error: 6.309615e-10
numerical: -0.033851 analytic: -0.033851, relative error: 1.624538e-09
numerical: -0.010558 analytic: -0.010558, relative error: 4.713985e-11
numerical: -0.047035 analytic: -0.047035, relative error: 6.477082e-10
numerical: 0.030276 analytic: 0.030276, relative error: 6.519395e-10
numerical: 0.047860 analytic: 0.047860, relative error: 5.066405e-11
numerical: 0.007399 analytic: 0.007399, relative error: 2.393850e-09
numerical: 0.004019 analytic: 0.004019, relative error: 8.509952e-09
numerical: -0.034830 a

### Optional

In [7]:
# Now implement a vectorized version in softmax_loss_vectorized.
def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version.
    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)
    num_train = X.shape[0]

    #############################################################################
    # TODO: Compute the softmax loss and its gradient using no explicit loops.  #
    #############################################################################
    
    # Compute scores
    scores = X.dot(W)
    
    # Numerical stability: subtract max score from each row
    scores -= np.max(scores, axis=1, keepdims=True)
    
    # Compute softmax probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    
    # Compute loss
    correct_logprobs = -np.log(probs[np.arange(num_train), y])
    loss = np.sum(correct_logprobs) / num_train
    loss += 0.5 * reg * np.sum(W * W)
    
    # Compute gradient
    dscores = probs.copy()
    dscores[np.arange(num_train), y] -= 1
    dW = X.T.dot(dscores) / num_train
    dW += reg * W

    return loss, dW

# The two versions (naive and vectorized) should compute the same results, but the vectorized version should be
# much faster.
tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))

tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))

# use the Frobenius norm to compare the two versions of the gradient.
grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))
print('Gradient difference: %f' % grad_difference)

naive loss: 2.301831e+00 computed in 0.008573s
vectorized loss: 2.301831e+00 computed in 0.009058s
Loss difference: 0.000000
Gradient difference: 0.000000


# Softmax Classifier

In [9]:
class Softmax():
    def __init__(self):
        self.W = None

    def train(
        self,
        X,
        y,
        learning_rate=1e-3,
        reg=1e-5,
        num_iters=100,
        batch_size=200,
        verbose=False,
    ):
        """
        Train this classifier using stochastic gradient descent.

        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are N
          training samples each of dimension D.
        - y: A numpy array of shape (N,) containing training labels; y[i] = c
          means that X[i] has label 0 <= c < C for C classes.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength.
        - num_iters: (integer) max number of steps to take when optimizing
        - batch_size: (integer) number of training examples to use at each step.
        - verbose: (boolean) If true, print progress during optimization.

        Outputs:
        A list containing the value of the loss function at each training iteration.
        """
        num_train, dim = X.shape
        num_classes = (
            np.max(y) + 1
        )  # assume y takes values 0...K-1 where K is number of classes
        if self.W is None:
            # lazily initialize W
            self.W = 0.001 * np.random.randn(dim, num_classes)

        # Run stochastic gradient descent to optimize W
        loss_history = []


        for it in range(num_iters):
            # Sample batch_size elements from the training data
            batch_indices = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[batch_indices]
            y_batch = y[batch_indices]

            # Compute loss and gradient
            loss, grad = softmax_loss_naive(self.W, X_batch, y_batch, reg)
            loss_history.append(loss)

            # Update weights
            self.W -= learning_rate * grad
            # *****END OF YOUR CODE *****

            if verbose and it % 100 == 0:
                print("iteration %d / %d: loss %f" % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels for
        data points.

        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are N
          training samples each of dimension D.

        Returns:
        - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
          array of length N, and each element is an integer giving the predicted
          class.
        """
        #y_pred = np.zeros(X.shape[0])
        ###########################################################################
        # TODO:                                                                   #
        # Implement this method. Store the predicted labels in y_pred.            #
        ###########################################################################
        # *****START OF YOUR CODE *****

        scores = X.dot(self.W)
        
        # Predict the class with the highest score
        y_pred = np.argmax(scores, axis=1)
        # *****END OF YOUR CODE *****
        return y_pred

    def loss(self, X_batch, y_batch, reg):
        # return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)
        return softmax_loss_naive(self.W, X_batch, y_batch, reg)

In [10]:
# Train the Softmax Classifer on the CIFAR10 dataset

# Provided as a reference. You may change these hyperparameters
learning_rates = 1e-6
regularization_strengths = 1e3

softmax = Softmax()
softmax.train(X_train, y_train, learning_rates, regularization_strengths, num_iters=1000, verbose=True)

# Evaluate on test set
y_test_pred = softmax.predict(X_test)
test_accuracy = np.mean(y_test == y_test_pred)
print('softmax on raw pixels final test set accuracy: %f' % (test_accuracy, ))

iteration 0 / 1000: loss 20.912928
iteration 100 / 1000: loss 14.976567
iteration 200 / 1000: loss 12.391928
iteration 300 / 1000: loss 10.049045
iteration 400 / 1000: loss 8.578553
iteration 500 / 1000: loss 7.248350
iteration 600 / 1000: loss 6.368575
iteration 700 / 1000: loss 5.552971
iteration 800 / 1000: loss 4.795354
iteration 900 / 1000: loss 4.281202
softmax on raw pixels final test set accuracy: 0.368000
