In [2]:
# import numpy as np
from numpy.random import randn
import os
import cupy as cp
import json

class Linear:
    """Fully connected (affine) layer computing ``X @ W + b``.

    Attributes set by the constructor:
        W: weight matrix of shape (Din, Dout), He-initialized.
        b: bias row of shape (1, Dout), initialized to zeros.
        grad_W, grad_b: gradients of the loss with respect to ``W`` and ``b``;
            zero arrays until ``backward`` overwrites them.
        cache: the input passed to the most recent ``forward`` call.
    """

    def __init__(self, Din, Dout):
        """Create the layer with ``Din`` input and ``Dout`` output features."""
        self.Din = Din
        self.Dout = Dout

        # He initialization: scale standard-normal weights by sqrt(2 / Din)
        # to keep the variance of the activations under control.
        self.W = cp.random.randn(Din, Dout) * cp.sqrt(2. / Din)
        # Common mistake: the shape must be passed as ONE tuple, i.e.
        # cp.zeros((1, Dout)) and not cp.zeros(1, Dout).
        self.b = cp.zeros((1, self.Dout))
        # Start from zero arrays shaped like the parameters (instead of the
        # bare int 0) so the gradients always have a usable shape, even
        # before the first backward pass.
        self.grad_W = cp.zeros_like(self.W)
        self.grad_b = cp.zeros_like(self.b)
        self.cache = None  # input of the latest forward pass

    def forward(self, X):
        """Cache ``X`` for the backward pass and return ``X @ W + b``."""
        self.cache = X
        return X @ self.W + self.b

    def backward(self, upstream):
        """Store the parameter gradients and return the input gradient.

        ``upstream`` is the gradient passed down from the following layer,
        i.e. d(loss)/d(out) where ``out = X @ W + b``. The method stores
        d(loss)/dW and d(loss)/db on the layer and returns d(loss)/dX, which
        is the upstream gradient for the layer that produced ``X``.
        """
        X = self.cache
        self.grad_W = X.T @ upstream
        # The bias is broadcast over the rows of X, so its gradient is the
        # sum of the upstream gradient over axis 0 (kept as a row).
        self.grad_b = cp.sum(upstream, axis=0, keepdims=True)
        return upstream @ self.W.T


class ReLU:
    """Rectified linear unit: keeps positive entries, zeroes the others."""

    def __init__(self):
        # Input of the latest forward pass; backward needs its sign pattern.
        self.cache = None

    def forward(self, X):
        """Remember ``X`` and return the element-wise maximum of 0 and ``X``."""
        self.cache = X
        activated = cp.maximum(0, X)
        return activated

    def backward(self, upstream):
        """Pass ``upstream`` through only where the cached input was > 0."""
        positive = self.cache > 0
        return upstream * positive

class Sigmoid:
    """Logistic sigmoid activation, ``1 / (1 + exp(-X))``."""

    def __init__(self):
        # Output of the latest forward pass; the derivative is expressed
        # in terms of the output, so that is what gets cached.
        self.cache = None

    def forward(self, X):
        """Compute the sigmoid of ``X``, cache the result and return it."""
        denominator = 1 + cp.exp(-X)
        self.cache = 1 / denominator
        return self.cache

    def backward(self, upstream):
        """Return ``upstream * s * (1 - s)`` with ``s`` the cached output."""
        s = self.cache
        return upstream * s * (1 - s)

class Tanh:
    """Hyperbolic tangent activation."""

    def __init__(self):
        # Output of the latest forward pass, reused by backward.
        self.cache = None

    def forward(self, X):
        """Compute ``tanh(X)``, cache the result and return it."""
        self.cache = cp.tanh(X)
        return self.cache

    def backward(self, upstream):
        """Return ``upstream * (1 - y ** 2)`` with ``y`` the cached output."""
        y = self.cache
        local_grad = 1 - y ** 2
        return upstream * local_grad

class LeakyReLU:
    """Leaky ReLU: identity for positive inputs, a small slope otherwise."""

    def __init__(self, negative_slope=0.01):
        # Factor applied to inputs that are not > 0.
        self.negative_slope = negative_slope
        # Input of the latest forward pass.
        self.cache = None

    def forward(self, X):
        """Cache ``X``; return ``X`` where it is > 0, else ``negative_slope * X``."""
        self.cache = X
        scaled = self.negative_slope * X
        return cp.where(X > 0, X, scaled)

    def backward(self, upstream):
        """Scale ``upstream`` by 1 where the cached input was > 0, else by the slope."""
        slope = cp.where(self.cache > 0, 1, self.negative_slope)
        return upstream * slope

class ELU:
    """Exponential linear unit.

    ``forward`` returns ``X`` where ``X > 0`` and ``alpha * (exp(X) - 1)``
    elsewhere; ``backward`` multiplies the upstream gradient by 1 where
    ``X > 0`` and by ``alpha * exp(X)`` elsewhere.
    """

    def __init__(self, alpha=1.0):
        # Input of the latest forward pass.
        self.cache = None
        # Scale of the exponential branch.
        self.alpha = alpha

    def forward(self, X):
        """Cache ``X`` and return the ELU activation of ``X``."""
        self.cache = X
        # Clamp the exponent argument at 0: entries with X > 0 take the
        # identity branch anyway, and clamping keeps exp() from overflowing
        # on large positive inputs. Values for X <= 0 are unchanged.
        negative_branch = self.alpha * (cp.exp(cp.minimum(X, 0)) - 1)
        return cp.where(X > 0, X, negative_branch)

    def backward(self, upstream):
        """Return ``upstream`` times the element-wise ELU derivative.

        The derivative is selected with ``cp.where`` rather than blended as
        ``mask + (1 - mask) * alpha * exp(X)``: with the blend, a large
        positive ``X`` makes ``exp(X)`` overflow to inf and ``0 * inf``
        yields NaN gradients. Clamping the exponent argument at 0 avoids the
        overflow altogether.
        """
        X = self.cache
        negative_slope = self.alpha * cp.exp(cp.minimum(X, 0))
        dX = cp.where(X > 0, 1.0, negative_slope)
        return upstream * dX

class Softmax:
    """Softmax activation along a configurable axis (last axis by default)."""

    def __init__(self, axis=-1):
        # Axis over which the outputs are normalized.
        self.axis = axis
        # Output of the latest forward pass, reused by backward.
        self.cache = None

    def forward(self, X):
        """Compute the softmax of ``X`` along ``self.axis`` and cache it."""
        ax = self.axis
        # Subtract the per-slice maximum before exponentiating for
        # numerical stability.
        shifted = X - cp.max(X, axis=ax, keepdims=True)
        numerators = cp.exp(shifted)
        probs = numerators / cp.sum(numerators, axis=ax, keepdims=True)
        self.cache = probs
        return probs

    def backward(self, upstream):
        """Return ``y * (upstream - sum(y * upstream))`` with ``y`` the cached output.

        The sum runs along ``self.axis`` and keeps that dimension.
        """
        probs = self.cache
        weighted_sum = cp.sum(probs * upstream, axis=self.axis, keepdims=True)
        return probs * (upstream - weighted_sum)

class Swish:
    """Swish activation, ``X * sigmoid(beta * X)``."""

    def __init__(self, beta=1.0):
        # Scale applied to the input inside the sigmoid.
        self.beta = beta
        # (input, sigmoid(beta * input)) from the latest forward pass.
        self.cache = (None, None)

    def forward(self, X):
        """Return ``X * sigmoid(beta * X)`` and cache ``X`` with the sigmoid."""
        scaled = self.beta * X
        gate = 1 / (1 + cp.exp(-scaled))
        self.cache = (X, gate)
        return X * gate

    def backward(self, upstream):
        """Return ``upstream * (s + beta * X * s * (1 - s))``.

        ``X`` and ``s`` are the cached input and sigmoid value.
        """
        X, gate = self.cache
        gate_term = self.beta * X * gate * (1 - gate)
        return upstream * (gate + gate_term)



class myNet:
    """Three-layer fully connected network.

    The layer stack is Linear -> activation -> Linear -> activation -> Linear,
    where the activation is chosen by name through ``ACTIVATION_FUNCTIONS``.
    """

    # Maps an activation name to its layer class so the activation can be
    # picked dynamically by string (kept at class level rather than being
    # rebuilt inside __init__).
    ACTIVATION_FUNCTIONS = {
        'relu': ReLU,
        'sigmoid': Sigmoid,
        'tanh': Tanh,
        'leaky_relu': LeakyReLU,
        'elu': ELU,
        'softmax': Softmax,
        'swish': Swish,
    }

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim, activation):
        """Build the layer stack.

        ``activation`` must be a key of ``ACTIVATION_FUNCTIONS``; any other
        value raises ``KeyError`` from the dictionary lookup.
        """
        # Keep the hyperparameters on the instance; save_model writes them
        # out as metadata.
        self.input_dim = input_dim
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.output_dim = output_dim
        self.activation = activation

        act_cls = self.ACTIVATION_FUNCTIONS[activation]
        self.activation_class = act_cls
        self.layers = [
            Linear(input_dim, hidden_dim1),
            act_cls(),
            Linear(hidden_dim1, hidden_dim2),
            act_cls(),
            Linear(hidden_dim2, output_dim),
        ]

    def forward(self, X):
        """Feed ``X`` through every layer in order and return the final output."""
        out = X
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def backward(self, upstream):
        """Propagate ``upstream`` through the layers in reverse order.

        Nothing is returned; the parameter gradients end up stored on the
        Linear layers themselves.
        """
        grad = upstream
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)

    def parameters(self):
        """Return ``[W, b, W, b, ...]`` collected from the Linear layers in order."""
        return [
            tensor
            for layer in self.layers
            if isinstance(layer, Linear)
            for tensor in (layer.W, layer.b)
        ]

    def gradients(self):
        """Return ``[grad_W, grad_b, ...]`` in the same order as ``parameters``."""
        return [
            grad
            for layer in self.layers
            if isinstance(layer, Linear)
            for grad in (layer.grad_W, layer.grad_b)
        ]

    def zero_grad(self):
        """Reset the gradients of every Linear layer to all-zero arrays."""
        for layer in self.layers:
            if not isinstance(layer, Linear):
                continue
            layer.grad_W = cp.zeros_like(layer.W)
            layer.grad_b = cp.zeros_like(layer.b)

    # Persistence. For the hyperparameter-search task, save/load must restore
    # the whole network structure together with the weights, not only the
    # weights. The structure contains a string (the activation name), so it
    # goes into a JSON file; the weights are purely numeric and are written
    # with cp.save as .npy files.
    def save_model(self, stage=1):
        """Write metadata and weights to the directory for ``stage``.

        ``stage == 1`` selects "parameters_stage1"; any other value selects
        "parameters_stage2". The directory is created if it is missing.
        """
        if stage == 1:
            save_dir = "parameters_stage1"
        else:
            save_dir = "parameters_stage2"
        os.makedirs(save_dir, exist_ok=True)

        # Structural metadata goes to JSON (which can hold the activation
        # name as a plain string).
        field_names = ('input_dim', 'hidden_dim1', 'hidden_dim2',
                       'output_dim', 'activation')
        metadata = {name: getattr(self, name) for name in field_names}
        meta_path = os.path.join(save_dir, "metadata.json")
        with open(meta_path, 'w') as f:
            json.dump(metadata, f)

        # One weight file and one bias file per Linear layer, numbered by
        # the layer's position among the Linear layers.
        idx = 0
        for layer in self.layers:
            if not isinstance(layer, Linear):
                continue
            cp.save(os.path.join(save_dir, f"layer_{idx}_weight.npy"), layer.W)
            cp.save(os.path.join(save_dir, f"layer_{idx}_bias.npy"), layer.b)
            idx += 1

    def load_model(self, stage=1):
        """Rebuild the network from the files written by ``save_model``.

        The instance is re-initialized from the stored metadata and the
        weights of every Linear layer are then replaced by the saved arrays.
        """
        # NOTE(review): the files are read with cp.load rather than np.load
        # so that the weights come back as CuPy arrays like the rest of the
        # model -- confirm against the CuPy documentation.
        if stage == 1:
            save_dir = "parameters_stage1"
        else:
            save_dir = "parameters_stage2"

        # Structural metadata.
        with open(os.path.join(save_dir, "metadata.json"), 'r') as f:
            metadata = json.load(f)

        # Re-run the constructor so the layer stack matches the saved one.
        arch_keys = ('input_dim', 'hidden_dim1', 'hidden_dim2',
                     'output_dim', 'activation')
        self.__init__(**{key: metadata[key] for key in arch_keys})

        # Overwrite the freshly initialized weights with the stored ones.
        linear_layers = [layer for layer in self.layers if isinstance(layer, Linear)]
        for idx, layer in enumerate(linear_layers):
            layer.W = cp.load(os.path.join(save_dir, f"layer_{idx}_weight.npy"))
            layer.b = cp.load(os.path.join(save_dir, f"layer_{idx}_bias.npy"))
            

IndentationError: expected an indented block after 'else' statement on line 217 (2186840138.py, line 219)