In [1]:
import pandas as pd
from torch.utils.data import Dataset
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import random
import numpy
import math
import os
import re
import torch.optim as optim
import time
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: NVIDIA A16


In [3]:
# dataset class

class MyDataset(Dataset):

    def __init__(self, csv_file):
        self.data_df = pd.read_csv(csv_file, header=None, sep='\t')
#         self.min_value = self.data_df.min().min()
#         self.max_value = self.data_df.max().max()
        
        self.min_value = self.data_df.min()
        self.max_value = self.data_df.max()

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, index):
#         image_values = torch.FloatTensor(self.data_df.iloc[index, 0:].values)
#         image_values = (image_values - self.min_value) / (self.max_value - self.min_value)
        
        image_values = torch.FloatTensor(self.data_df.iloc[index, 0:].values)

        # 计算最大值和最小值的差
        range_values = self.max_value - self.min_value
        # 处理全零列：如果最大值和最小值相等（即该列全为同一个值），则将范围设置为1以避免除以零
        range_values[range_values == 0] = 1

        # 进行归一化
        normalized_values = (image_values - self.min_value) / range_values
        
        return image_values.to(device)
    
    def plot_image(self, index):
        img = self.data_df.iloc[index, 0:].values.reshape(25, 233)
        # plt.title("label = " + str(self.data_df.iloc[index,0]))
        plt.title("label = " + self.filename)
        plt.imshow(img, interpolation='none', cmap='Blues')
        pass
    
    def unnormalize(self, tensor):
        return tensor * (self.max_value - self.min_value) + self.min_value
    
    pass

# functions to generate random data

def generate_random_image(size):
    random_data = torch.rand(size).to(device)
    return random_data


def generate_random_seed(size):
    random_data = torch.randn(size).to(device)
    return random_data


# discriminator class

class Discriminator(nn.Module):

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(5825, 1200),
            nn.LeakyReLU(0.02),
            nn.LayerNorm(1200),
            nn.Linear(1200, 1),
            nn.Sigmoid()
        ).to(device)
        self.loss_function = nn.BCELoss()
        self.optimiser = optim.Adam(self.parameters(), lr=0.0001)

        # counter and accumulator for progress
        self.counter = 0;
        self.progress = []

        pass

    def forward(self, inputs):
        # simply run model
        return self.model(inputs)

    def train(self, inputs, targets):
        # calculate the output of the network
        outputs = self.forward(inputs)

        # calculate loss
        loss = self.loss_function(outputs, targets)

        # increase counter and accumulate error every 10
        self.counter += 1;
        if (self.counter % 10 == 0):
            self.progress.append(loss.item())
            pass
        if (self.counter % 10000 == 0):
            print("counter = ", self.counter)
            pass

        # zero gradients, perform a backward pass, update weights
        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()

        pass

    def plot_progress(self):
        df = pd.DataFrame(self.progress, columns=['loss'])
        df.plot(ylim=(0), figsize=(16, 8), alpha=0.1, marker='.', grid=True, yticks=(0, 0.25, 0.5))
        pass

    pass


# generator class

class Generator(nn.Module):

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(100, 600),
            nn.LeakyReLU(0.02),
            nn.LayerNorm(600),
            nn.Linear(600, 5825),
            nn.Sigmoid()
        ).to(device)
        self.optimiser = optim.Adam(self.parameters(), lr=0.0001)
        # counter and accumulator for progress
        self.counter = 0;
        self.progress = []

    def forward(self, inputs):
        # simply run model
        return self.model(inputs)

    def train(self, D, inputs, targets):
        # calculate the output of the network
        g_output = self.forward(inputs)

        # pass onto Discriminator
        d_output = D.forward(g_output)

        # calculate error
        loss = D.loss_function(d_output, targets)

        # increase counter and accumulate error every 10
        self.counter += 1;
        if (self.counter % 10 == 0):
            self.progress.append(loss.item())
            pass

        # zero gradients, perform a backward pass, update weights
        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()

        pass

    def plot_progress(self):
        df = pd.DataFrame(self.progress, columns=['loss'])
        df.plot(ylim=(0), figsize=(16, 8), alpha=0.1, marker='.', grid=True, yticks=(0, 0.25, 0.5, 1.0))
        pass

    pass

# 定义一个从文件名中提取数字的函数
def extract_number(filename):
    match = re.search(r'\d+', filename)
    return int(match.group()) if match else 0

In [4]:
class MyDataset2(Dataset):#noisy_user

    def __init__(self, csv_file):
        self.data_df = pd.read_csv(csv_file, header=None, sep='\t')
#         self.min_value = self.data_df.min().min()
#         self.max_value = self.data_df.max().max()
        
        self.min_value = self.data_df.min()
        self.max_value = self.data_df.max()

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, index):
#         image_values = torch.FloatTensor(self.data_df.iloc[index, 0:].values)
#         image_values = (image_values - self.min_value) / (self.max_value - self.min_value)
        
        image_values = torch.FloatTensor(self.data_df.iloc[index, 0:].values)

        # 计算最大值和最小值的差
        range_values = self.max_value - self.min_value
        # 处理全零列：如果最大值和最小值相等（即该列全为同一个值），则将范围设置为1以避免除以零
        range_values[range_values == 0] = 1

        # 进行归一化
        normalized_values = (image_values - self.min_value) / range_values
        image_tensor = torch.tensor(normalized_values.values, dtype=torch.float)
        
        return image_tensor.to(device)
    
    def unnormalize(self, tensor):
        return tensor * (self.max_value - self.min_value) + self.min_value

In [5]:
def add_laplace_noise(data, scale):
    """为数据添加拉普拉斯噪声"""
    noise = numpy.random.laplace(0, scale, data.shape)
    noisy_data = data + noise * (data != 0)*0.01
    noisy_data[noisy_data < 0] = 0
    return noisy_data

In [6]:
user_noisy = []
total_output = []
rmse=0
filepath = './train_matrix.txt'
print(f"已加载文件: {filepath}")
dataset = MyDataset(filepath)

for k in range(339):
    image_data_tensor = dataset[k] #原始数据

    #start_time = time.time()  # 获取开始时间
    for i in range(300):# !!!!!!!!!!!!!!!!!!!!!!记得改回300
        if i == 0:
            # 第一轮使用原始数据
            real_data_tensor = image_data_tensor.cpu().numpy()
            user_noisy.append(real_data_tensor)
            real_data_tensor = torch.tensor(real_data_tensor, dtype=torch.float32, device=device)

        else:
            # 从第二轮开始添加噪声
            image_data_numpy = image_data_tensor.cpu().numpy()  # 转换为NumPy数组
            noisy_image_data_numpy = add_laplace_noise(image_data_numpy, 0.1)
            #计算加了噪声后的数据与原来数据的差异
            mse = numpy.mean((noisy_image_data_numpy - image_data_numpy) ** 2)
            #unnormalize_noisy_image_data_numpy = dataset.unnormalize(noisy_image_data_numpy)
           #print("第",i,"轮  ","MSE:", mse)
            user_noisy.append(noisy_image_data_numpy)
            # 将带有噪声的NumPy数组转换回PyTorch张量
            real_data_tensor = torch.tensor(noisy_image_data_numpy, dtype=torch.float32,device=device)

    user_noisy = numpy.array(user_noisy) 
    user_noisy.reshape(-1, 5825)
    numpy.savetxt('user_noisy.txt', user_noisy, delimiter='\t', fmt='%f')
    print("已保存文件user_noisy.txt")
    
    filepath = './user_noisy.txt'
    print(f"已加载文件: {filepath}")
    dataset2 = MyDataset2(filepath)

    D = Discriminator()

    #for k in range(339):
    #image_data_tensor = dataset2[0] #原始数据，

    start_time = time.time()  # 获取开始时间
    for i in range(300):# !!!!!!!!!!!!!!!!!!!!!!记得改回300
        image_data_tensor = dataset2[i]
        #训练判别器 - 真实数据
        D.train(real_data_tensor, torch.cuda.FloatTensor([1.0], device=device))

        #训练判别器 - 生成的假数据
        fake_data = generate_random_image(5825).to(device)
        D.train(fake_data, torch.cuda.FloatTensor([0.0], device=device))
        pass

    # create Discriminator and Generator
    # D = Discriminator().to(device)
    G = Generator().to(device)

    epochs = 4
    print("start, 开始第 {} 个用户的训练".format(k+1))
    #print("start, 开始第1个用户的训练")

    for epoch in range(epochs):
        print("epoch = ", epoch + 1)

        # train Discriminator and Generator

        for image_data_tensor in dataset2:
    #         image_data_tensor = torch.from_numpy(image_data_tensor).float()
    #         image_data_tensor = image_data_tensor.to(device)

            true_labels = torch.tensor([1.0], dtype=torch.float).to(device)
            D.train(image_data_tensor, true_labels)

            fake_images = G.forward(generate_random_seed(100)).detach()
            false_labels = torch.tensor([0.0], dtype=torch.float).to(device)
            D.train(fake_images, false_labels)

            # train generator
            for i in range(5):
                G.train(D, generate_random_seed(100), torch.tensor([1.0], dtype=torch.float).to(device))
            pass
    pass
    print("finish, 结束第 {} 个用户的训练".format(k+1))
    #print("finish, 结束第1个用户的训练")

    output = G.forward(generate_random_seed(100))
    output_cpu = output.detach().cpu().numpy()
    unnormalize_output_cpu = dataset2.unnormalize(output_cpu)
    total_output.append(unnormalize_output_cpu)
    #numpy.savetxt('output.txt', output_cpu, delimiter='\t', fmt='%f')


    sum = 0
    # 获取第k个用户真实数据
    #image_values = dataset[k]  # 使用__getitem__方法
    image_values = dataset2.data_df.iloc[0, 0:].values
    for i in range(5825):
        # 获取第k个用户第i个服务数据，与生成器结果进行计算误差
        sum += (output_cpu[i] - image_values[i].item())**2

    sum /= 5825
    sum = math.sqrt(sum)
    rmse += sum
    print("当前用户误差：" + str(sum))
    end_time = time.time()  # 获取结束时间
    elapsed_time = end_time - start_time  # 计算经过的时间
    print(f"Elapsed time: {elapsed_time} seconds")
    user_noisy = []

已加载文件: ./train_matrix.txt
已保存文件user_noisy.txt
已加载文件: ./user_noisy.txt
start, 开始第 1 个用户的训练
epoch =  1
epoch =  2
epoch =  3
epoch =  4
finish, 结束第 1 个用户的训练
当前用户误差：1.1586695888993337
Elapsed time: 63.763293743133545 seconds
已保存文件user_noisy.txt
已加载文件: ./user_noisy.txt
start, 开始第 2 个用户的训练
epoch =  1
epoch =  2
epoch =  3
epoch =  4
finish, 结束第 2 个用户的训练
当前用户误差：0.4687090034198183
Elapsed time: 63.78383755683899 seconds
已保存文件user_noisy.txt
已加载文件: ./user_noisy.txt
start, 开始第 3 个用户的训练
epoch =  1
epoch =  2
epoch =  3
epoch =  4
finish, 结束第 3 个用户的训练
当前用户误差：0.7933945046468757
Elapsed time: 63.71865439414978 seconds
已保存文件user_noisy.txt
已加载文件: ./user_noisy.txt
start, 开始第 4 个用户的训练
epoch =  1
epoch =  2
epoch =  3
epoch =  4
finish, 结束第 4 个用户的训练
当前用户误差：0.47935380619495577
Elapsed time: 63.567615270614624 seconds
已保存文件user_noisy.txt
已加载文件: ./user_noisy.txt
start, 开始第 5 个用户的训练
epoch =  1
epoch =  2
epoch =  3
epoch =  4
finish, 结束第 5 个用户的训练
当前用户误差：0.5856792997765888
Elapsed time: 63.75559449195862 seco

In [7]:
# filepath = './user_noisy.txt'
# print(f"已加载文件: {filepath}")
# dataset2 = MyDataset2(filepath)

# D = Discriminator()

# #for k in range(339):
# #image_data_tensor = dataset2[0] #原始数据，

# start_time = time.time()  # 获取开始时间
# for i in range(300):# !!!!!!!!!!!!!!!!!!!!!!记得改回300
#     image_data_tensor = dataset2[i]
#     #训练判别器 - 真实数据
#     D.train(real_data_tensor, torch.cuda.FloatTensor([1.0], device=device))

#     #训练判别器 - 生成的假数据
#     fake_data = generate_random_image(5825).to(device)
#     D.train(fake_data, torch.cuda.FloatTensor([0.0], device=device))
#     pass

# # create Discriminator and Generator
# # D = Discriminator().to(device)
# G = Generator().to(device)

# epochs = 4
# #print("start, 开始第 {} 个用户的训练".format(k+1))
# print("start, 开始第1个用户的训练")

# for epoch in range(epochs):
#     print("epoch = ", epoch + 1)

#     # train Discriminator and Generator

#     for image_data_tensor in dataset2:
# #         image_data_tensor = torch.from_numpy(image_data_tensor).float()
# #         image_data_tensor = image_data_tensor.to(device)

#         true_labels = torch.tensor([1.0], dtype=torch.float).to(device)
#         D.train(image_data_tensor, true_labels)

#         fake_images = G.forward(generate_random_seed(100)).detach()
#         false_labels = torch.tensor([0.0], dtype=torch.float).to(device)
#         D.train(fake_images, false_labels)

#         # train generator
#         for i in range(5):
#             G.train(D, generate_random_seed(100), torch.tensor([1.0], dtype=torch.float).to(device))
#         pass
# pass
# #print("finish, 结束第 {} 个用户的训练".format(k+1))
# print("finish, 结束第1个用户的训练")

# output = G.forward(generate_random_seed(100))
# output_cpu = output.detach().cpu().numpy()
# unnormalize_output_cpu = dataset2.unnormalize(output_cpu)
# total_output.append(unnormalize_output_cpu)
# #numpy.savetxt('output.txt', output_cpu, delimiter='\t', fmt='%f')


# sum = 0
# # 获取第k个用户真实数据
# #image_values = dataset[k]  # 使用__getitem__方法
# image_values = dataset[0]
# for i in range(5825):
#     # 获取第k个用户第i个服务数据，与生成器结果进行计算误差
#     sum += (output_cpu[i] - image_values[i].item())**2

# sum /= 5825
# sum = math.sqrt(sum)
# rmse += sum
# print("当前用户误差：" + str(sum))
# end_time = time.time()  # 获取结束时间
# elapsed_time = end_time - start_time  # 计算经过的时间
# print(f"Elapsed time: {elapsed_time} seconds")

In [8]:
total_output = numpy.array(total_output) 
total_output.reshape(-1, 5825)
numpy.savetxt('output.txt', total_output, delimiter='\t', fmt='%f')
print(rmse / 339)

0.5716889154195436
