In [None]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch

In [None]:
# Cohort table: second sheet (index 1) of the original workbook.
data = pd.read_excel("../data/original_cohort_data.xlsx", sheet_name=1)

# Feature constraints workbook: its first data row is used as the per-feature
# minimum and its second data row as the per-feature maximum.
constraints = pd.read_excel("../data/feature_constraints.xlsx")
boundary = np.array(constraints)
minv, maxv = boundary[0, :], boundary[1, :]

# Displayed as the cell output so the widths can be compared by eye.
minv.shape, maxv.shape, data.shape

In [None]:
# Summary statistics of the cohort table as loaded, before any scaling.
data.describe()

In [None]:
def normalize(x, minv, maxv):
    """Min-max scale ``x`` so that ``minv`` maps to 0 and ``maxv`` maps to 1.

    Args:
        x: Values to scale (scalar, array or DataFrame); broadcast against
            ``minv`` / ``maxv`` by the usual arithmetic rules.
        minv: Lower bound(s), one per feature.
        maxv: Upper bound(s), one per feature.

    Returns:
        ``(x - minv) / (maxv - minv)``. Features whose bounds coincide
        (``maxv == minv``) are scaled with a span of 1 instead of 0, so they
        come out as ``x - minv`` (0 for in-bound data) rather than inf/NaN.
    """
    span = maxv - minv
    # Adding the boolean mask turns every zero span into 1 and leaves all
    # other spans untouched; it also keeps the operand type (scalar, ndarray,
    # Series), so pandas label alignment is unaffected.
    span = span + (span == 0)
    return (x - minv) / span

def unnormalize(nx, minv, maxv):
    """Invert min-max scaling: map 0 back to ``minv`` and 1 back to ``maxv``.

    Args:
        nx: Scaled values.
        minv: Lower bound(s) used for scaling.
        maxv: Upper bound(s) used for scaling.

    Returns:
        ``nx * (maxv - minv) + minv``.
    """
    span = maxv - minv
    rescaled = nx * span
    return rescaled + minv

In [None]:
# Scale every column of the cohort table with the bounds read from the
# constraints workbook (minv -> 0, maxv -> 1).
ndata = normalize(data, minv, maxv)

In [None]:
# Summary statistics after scaling; columns that stay within their bounds
# should show min >= 0 and max <= 1.
ndata.describe()

## 生成对抗网络模型

In [None]:

class discriminator(nn.Module):
    """MLP that scores a feature vector with a single sigmoid output.

    The training loop labels real samples 1 and generated samples 0, so the
    output can be read as the estimated probability that the input is real.

    Args:
        in_features: Number of features per sample. Defaults to 46, which
            matches the generator's default output width.
    """

    def __init__(self, in_features=46):
        super().__init__()
        self.dis = nn.Sequential(
            nn.Linear(in_features, 32),  # in_features -> 32
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.2),
            nn.Linear(32, 16),  # 32 -> 16
            nn.BatchNorm1d(16),
            nn.LeakyReLU(0.2),
            nn.Linear(16, 1),  # 16 -> single logit
            # Sigmoid squashes the logit into (0, 1), as required by BCELoss.
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return one score in (0, 1) per row of ``x`` (shape ``(batch, 1)``)."""
        return self.dis(x)

class generator(nn.Module):
    """MLP that maps a latent noise vector to a synthetic feature vector.

    The final sigmoid keeps every output in (0, 1), i.e. in the same range as
    min-max scaled data.

    Args:
        latent_dim: Width of the input noise vector. Defaults to 5.
        out_features: Number of generated features. Defaults to 46, which
            matches the discriminator's default input width.
    """

    def __init__(self, latent_dim=5, out_features=46):
        super().__init__()
        self.gen = nn.Sequential(
            nn.Linear(latent_dim, 16),
            # NOTE(review): this activation sits *before* the first BatchNorm
            # and is followed by a second one after it, unlike the later
            # Linear -> BatchNorm -> LeakyReLU stage. It looks like a leftover,
            # but it is kept so trained behaviour and state_dict keys
            # (gen.0 ... gen.8) stay unchanged.
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(16),
            nn.LeakyReLU(0.2),
            nn.Linear(16, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.2),
            nn.Linear(32, out_features),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return generated samples of shape ``(batch, out_features)``."""
        return self.gen(x)

In [None]:
# The scaled table becomes one float32 tensor; the training loop feeds it to
# the discriminator in a single full batch.
data_train = torch.FloatTensor(np.array(ndata))

# Fresh, randomly initialised networks (discriminator first, then generator).
D, G = discriminator(), generator()

# Binary cross-entropy on the discriminator's sigmoid output.
criterion = nn.BCELoss()

In [None]:
num_epoch = 10**5
z_dimension = 5  # width of the latent noise vector fed to G


# Everything below runs on the CPU. To use a GPU, move D, G, data_train and
# every tensor created in this cell to the same device.
# if torch.cuda.is_available():
#     D = D.cuda()
#     G = G.cuda()

# The discriminator learns 100x more slowly than the generator
# (presumably to keep it from overpowering G - TODO confirm the intent).
d_optimizer = torch.optim.Adam(D.parameters(), lr=0.00001)
g_optimizer = torch.optim.Adam(G.parameters(), lr=0.001)

# Make the mode explicit so re-running this cell after an evaluation cell
# still trains with BatchNorm in training mode.
D.train()
G.train()

# Full-batch training: every step sees the whole table, so the batch size and
# the target labels never change and are built once, outside the loop.
num_sample = data_train.shape[0]
real_label = torch.ones(num_sample, 1)
fake_label = torch.zeros(num_sample, 1)

for epoch in range(num_epoch):
    # ================== discriminator step ==================
    # Real samples should be scored close to 1.
    real_out = D(data_train)
    d_loss_real = criterion(real_out, real_label)
    real_scores = real_out.detach()

    # Generated samples should be scored close to 0. They are produced
    # without autograd tracking because this step only updates D; gradients
    # flowing into G here would be discarded anyway.
    z = torch.randn(num_sample, z_dimension)
    with torch.no_grad():
        fake_sample = G(z)
    fake_out = D(fake_sample)
    d_loss_fake = criterion(fake_out, fake_label)
    fake_scores = fake_out.detach()

    d_loss = d_loss_real + d_loss_fake
    d_optimizer.zero_grad()
    d_loss.backward()
    d_optimizer.step()

    # ==================== generator step ====================
    # Fresh noise; G is rewarded when D labels its output as real.
    z = torch.randn(num_sample, z_dimension)
    fake_sample = G(z)
    output = D(fake_sample)
    g_loss = criterion(output, real_label)

    g_optimizer.zero_grad()
    g_loss.backward()
    g_optimizer.step()

    # Progress log: both losses and the mean discriminator score on the real
    # and on the generated batch.
    if epoch % 1000 == 0:
        print('Epoch[{}/{}],d_loss:{:.6f},g_loss:{:.6f} '
              'D real: {:.6f},D fake: {:.6f}'.format(
            epoch, num_epoch, d_loss.item(), g_loss.item(),
            real_scores.mean().item(), fake_scores.mean().item()
        ))

# Optional: persist the trained weights.
# torch.save(G.state_dict(), './generator.pth')
# torch.save(D.state_dict(), './discriminator.pth')


In [None]:
# Sample synthetic records from the generator and export them to Excel.

# 1. Column labels come straight from the loaded cohort table, so the exported
#    sheet carries the same header row as `data`.
feature_columns = data.columns

# 2. Number of synthetic rows and the output file name (written under ../data).
fake_num = 171
save_name = "synthetic_augmented_data.xlsx"

# Sample in evaluation mode: BatchNorm then uses its running statistics, so a
# generated row does not depend on the other rows in the batch, sampling does
# not alter the running statistics, and fake_num = 1 works (training-mode
# BatchNorm rejects a single-row batch). The previous mode is restored
# afterwards so this cell leaves no hidden state behind.
was_training = G.training
G.eval()
with torch.no_grad():
    z = torch.randn(fake_num, z_dimension)
    fake_sample = G(z).numpy()
G.train(was_training)

# Map the (0, 1) generator outputs back to the original feature ranges.
fake_sample = unnormalize(fake_sample, minv, maxv)

# 3. Attach the column labels so the first row of the sheet holds the
#    feature names.
df_fake = pd.DataFrame(fake_sample, columns=feature_columns)

# 4. Write the sheet without the index column.
df_fake.to_excel("../data/%s" % save_name, index=False)

print(f"File saved successfully: {save_name}")
print(f"Data Shape: {df_fake.shape}")
print(f"Columns: {list(df_fake.columns)}")  # printed for a visual check