In [3]:
import os

# Configure TensorFlow through the environment *before* it is imported, so the
# settings are already in place when its native runtime is loaded. Assigning
# them after `import tensorflow` is not guaranteed to have any effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # expose no GPU -> run on CPU only
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"   # silence INFO and WARNING native logs

import numpy as np
import pandas as pd
import tensorflow as tf


# Step 1 : add mask
#
# Load the dataset and knock out a random fraction of the target columns so
# that the imputation below has something to reconstruct.

# Seed every random source used further down. NumPy drives the missingness
# masks, mini-batch sampling, noise and hints; Keras/TensorFlow drives the
# weight initialisation of the two networks, which np.random.seed alone does
# not cover.
SEED = 42
np.random.seed(SEED)
tf.keras.utils.set_random_seed(SEED)

# Columns to corrupt and later impute (defined once and reused below).
gain_cols = ['output_1', 'output_2', 'output_3',
             'output_4', 'output_5', 'output_6']
missing_rate = 0.2  # fraction of rows blanked out in each column (20%)

df = pd.read_csv("simulated_23_anomaly_dataset.csv")

# Draw an independent boolean mask per column and set the selected cells to NaN.
for col in gain_cols:
    mask = np.random.rand(len(df)) < missing_rate
    df.loc[mask, col] = np.nan

# Report the resulting share of missing values per column.
print("缺失值比例（添加后）:")
print(df[gain_cols].isnull().mean().round(2))

# Matrix handed to GAIN: one float32 column per entry of gain_cols,
# with NaN marking the cells to impute.
data_x = df[gain_cols].values.astype(np.float32)

# Step 2: Normalization helpers
def normalization(data_x):
    """Min-max scale each column of ``data_x``, ignoring NaN entries.

    Parameters
    ----------
    data_x : np.ndarray
        2-D array (rows x columns); NaN marks a missing value.

    Returns
    -------
    tuple
        ``(scaled, stats)`` where ``scaled`` has the same shape as ``data_x``
        with NaN kept at the missing positions, and ``stats`` is a dict
        holding the per-column ``'min'`` and ``'max'`` needed to undo the
        scaling.
    """
    col_min = np.nanmin(data_x, axis=0)
    col_max = np.nanmax(data_x, axis=0)
    span = col_max - col_min
    # A constant column has zero span; divide by 1 instead so it maps to 0.
    safe_span = np.where(span == 0, 1.0, span)
    scaled = (data_x - col_min) / safe_span
    # Keep the missing cells explicitly marked as NaN in the result.
    scaled[np.isnan(data_x)] = np.nan
    return scaled, dict(min=col_min, max=col_max)

def renormalization(norm_x, params):
    """Map min-max scaled values back to the original column scale.

    Parameters
    ----------
    norm_x : np.ndarray
        Scaled values, one column per entry of the stored statistics.
    params : dict
        Per-column statistics under the keys ``'min'`` and ``'max'``.

    Returns
    -------
    np.ndarray
        ``norm_x * (max - min) + min``, broadcast over rows.
    """
    lower = params['min']
    span = params['max'] - lower
    return norm_x * span + lower

# Step 3: GAIN model
class Generator(tf.keras.Model):
    """GAIN generator: proposes a value for every cell of the input.

    Three fully connected layers of width ``dim``; the first two use ReLU and
    the last a sigmoid, so every output lies in (0, 1).
    """

    def __init__(self, dim):
        """Build the three dense layers, each with ``dim`` units."""
        super().__init__()
        Dense = tf.keras.layers.Dense
        self.d1 = Dense(dim, activation='relu')
        self.d2 = Dense(dim, activation='relu')
        self.out = Dense(dim, activation='sigmoid')

    def call(self, X, M, training=False):
        """Run the network on data ``X`` joined with mask ``M``.

        ``X`` and ``M`` are concatenated along axis 1 before the first layer.
        ``training`` is accepted for Keras compatibility but not used here.
        """
        joined = tf.concat([X, M], axis=1)
        hidden = self.d1(joined)
        hidden = self.d2(hidden)
        return self.out(hidden)

class Discriminator(tf.keras.Model):
    """GAIN discriminator: scores every cell of the input with a probability.

    Three fully connected layers of width ``dim``; the first two use ReLU and
    the last a sigmoid, giving one value in (0, 1) per input column.
    """

    def __init__(self, dim):
        """Build the three dense layers, each with ``dim`` units."""
        super().__init__()
        Dense = tf.keras.layers.Dense
        self.d1 = Dense(dim, activation='relu')
        self.d2 = Dense(dim, activation='relu')
        self.out = Dense(dim, activation='sigmoid')

    def call(self, X, H, training=False):
        """Run the network on data ``X`` joined with hint ``H``.

        ``X`` and ``H`` are concatenated along axis 1 before the first layer.
        ``training`` is accepted for Keras compatibility but not used here.
        """
        joined = tf.concat([X, H], axis=1)
        hidden = self.d1(joined)
        hidden = self.d2(hidden)
        return self.out(hidden)

# Step 4: GAIN training and imputation
def gain_tf2(data_x, params):
    """Impute the NaN entries of ``data_x`` with a GAIN model trained on it.

    Parameters
    ----------
    data_x : np.ndarray
        2-D float array (rows x features) in which NaN marks a missing cell.
    params : dict
        Hyperparameters, looked up by key: ``'batch_size'``, ``'hint_rate'``,
        ``'alpha'`` and ``'iterations'``. A mapping that lacks any of these
        keys is unpacked in insertion order instead, as before.

    Returns
    -------
    np.ndarray
        Array shaped like ``data_x`` on the original scale. Missing cells hold
        the generator's output; observed cells hold their own value after the
        normalise / de-normalise round trip.
    """
    norm_x, norm_params = normalization(data_x)
    # 1.0 where a value was observed, 0.0 where it is missing.
    miss_mask = (~np.isnan(norm_x)).astype(np.float32)
    # Replace NaN by 0 so the array is numeric everywhere; those cells are
    # always multiplied by a zero mask entry before being used.
    norm_x_filled = np.nan_to_num(norm_x)

    n, dim = norm_x.shape

    # Read hyperparameters by name so the result does not depend on the order
    # in which the caller happened to build the dict.
    hyper_keys = ('batch_size', 'hint_rate', 'alpha', 'iterations')
    if all(key in params for key in hyper_keys):
        bs, hint_rate, alpha, iters = (params[key] for key in hyper_keys)
    else:
        bs, hint_rate, alpha, iters = params.values()
    # Sampling without replacement cannot draw more rows than exist.
    bs = min(bs, n)

    G, D = Generator(dim), Discriminator(dim)
    G_opt = tf.keras.optimizers.Adam(1e-4)
    D_opt = tf.keras.optimizers.Adam(1e-4)

    for it in range(1, iters + 1):
        idx = np.random.choice(n, bs, replace=False)
        Xb = norm_x_filled[idx]
        Mb = miss_mask[idx]
        # Small uniform noise placed in the missing cells of the batch.
        Zb = np.random.uniform(0, 0.01, size=Xb.shape)
        # Hint: each observed cell's mask bit is kept with probability
        # hint_rate; missing cells always get a 0 hint.
        Hb = (np.random.uniform(0, 1, size=Xb.shape) < hint_rate) * Mb

        Xb_input = Mb * Xb + (1 - Mb) * Zb

        X_tensor = tf.convert_to_tensor(Xb_input, dtype=tf.float32)
        M_tensor = tf.convert_to_tensor(Mb, dtype=tf.float32)
        H_tensor = tf.convert_to_tensor(Hb, dtype=tf.float32)

        # One forward pass feeds both losses, so the tape must be persistent
        # to allow two gradient() calls.
        with tf.GradientTape(persistent=True) as tape:
            G_sample = G(X_tensor, M_tensor, training=True)
            # Observed cells keep their value; missing cells take G's output.
            Hat_X = M_tensor * X_tensor + (1 - M_tensor) * G_sample
            D_prob = D(Hat_X, H_tensor, training=True)

            D_loss = -tf.reduce_mean(M_tensor * tf.math.log(D_prob + 1e-8) +
                                     (1 - M_tensor) * tf.math.log(1 - D_prob + 1e-8))
            G_loss = -tf.reduce_mean((1 - M_tensor) * tf.math.log(D_prob + 1e-8))
            # Reconstruction error on observed cells. divide_no_nan yields 0
            # instead of NaN if a batch contains no observed cell at all.
            MSE_loss = tf.math.divide_no_nan(
                tf.reduce_mean((M_tensor * X_tensor - M_tensor * G_sample) ** 2),
                tf.reduce_mean(M_tensor))
            G_total = G_loss + alpha * MSE_loss

        D_grads = tape.gradient(D_loss, D.trainable_variables)
        G_grads = tape.gradient(G_total, G.trainable_variables)

        D_opt.apply_gradients(zip(D_grads, D.trainable_variables))
        G_opt.apply_gradients(zip(G_grads, G.trainable_variables))
        del tape  # release the resources held by the persistent tape

        if it % 100 == 0:
            print(f"[{it}/{iters}] D={D_loss:.4f} | G={G_loss:.4f} | MSE={MSE_loss:.4f}")

    # Final imputation: run the trained generator once over the whole dataset.
    Z_full = np.random.uniform(0, 0.01, size=norm_x.shape)
    X_input = miss_mask * norm_x_filled + (1 - miss_mask) * Z_full
    imputed_norm = G(tf.convert_to_tensor(X_input, dtype=tf.float32),
                     tf.convert_to_tensor(miss_mask)).numpy()
    imputed = miss_mask * norm_x_filled + (1 - miss_mask) * imputed_norm
    return renormalization(imputed, norm_params)

# Step 5: Run GAIN
params = {
    'batch_size': 128,   # rows sampled per training step
    'hint_rate': 0.9,    # probability of revealing an observed cell's mask bit as a hint
    'alpha': 100,        # weight of the reconstruction MSE in the generator loss
    'iterations': 2000,  # number of training steps
}
imputed_output = gain_tf2(data_x, params)

# Replace missing values in df.
# Only the NaN cells are filled. Observed cells keep their exact original
# value instead of the copy that went through the float32 normalise /
# de-normalise round trip inside gain_tf2.
imputed_df = pd.DataFrame(imputed_output, index=df.index, columns=gain_cols)
df[gain_cols] = df[gain_cols].fillna(imputed_df)
df.to_csv("simulated_23_anomaly_dataset_imputed.csv", index=False)
print("✅ Imputation completed and saved.")



缺失值比例（添加后）:
output_1    0.2
output_2    0.2
output_3    0.2
output_4    0.2
output_5    0.2
output_6    0.2
dtype: float64
[100/2000] D=0.7505 | G=0.1447 | MSE=0.0211
[200/2000] D=0.7419 | G=0.1228 | MSE=0.0225
[300/2000] D=0.7223 | G=0.1327 | MSE=0.0242
[400/2000] D=0.6995 | G=0.1229 | MSE=0.0217
[500/2000] D=0.6818 | G=0.1377 | MSE=0.0209
[600/2000] D=0.6623 | G=0.1138 | MSE=0.0217
[700/2000] D=0.6563 | G=0.1371 | MSE=0.0212
[800/2000] D=0.6430 | G=0.1347 | MSE=0.0241
[900/2000] D=0.6384 | G=0.1216 | MSE=0.0238
[1000/2000] D=0.6254 | G=0.1235 | MSE=0.0216
[1100/2000] D=0.6026 | G=0.1376 | MSE=0.0202
[1200/2000] D=0.6003 | G=0.1137 | MSE=0.0205
[1300/2000] D=0.5724 | G=0.1130 | MSE=0.0251
[1400/2000] D=0.5571 | G=0.1196 | MSE=0.0236
[1500/2000] D=0.5457 | G=0.1259 | MSE=0.0211
[1600/2000] D=0.5436 | G=0.1072 | MSE=0.0232
[1700/2000] D=0.5318 | G=0.1190 | MSE=0.0212
[1800/2000] D=0.5045 | G=0.0978 | MSE=0.0207
[1900/2000] D=0.4972 | G=0.0995 | MSE=0.0210
[2000/2000] D=0.4830 | G=0.0945

In [None]:
# Step 1: Load your data
df = pd.read_csv("simulated_23_anomaly_dataset.csv")

# The six columns 'output_1' .. 'output_6', extracted as one float32 matrix
# with one row per record of df.
gain_cols = [f'output_{col_no}' for col_no in range(1, 7)]
data_x = df[gain_cols].to_numpy().astype(np.float32)