## 全局变量与工具函数

In [30]:
import os

import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.utils import save_image
from scipy.stats.stats import pearsonr
from Progbar import Progbar

import pandas as pd
import numpy as np



In [31]:
def to_img(x):
    x = x.view(x.size(0), 1, 100, 50)
    return x


def norm(x, reverse=False):
    if reverse:
        y = np.power(10, x) - 1.01
        y = np.around(y).astype(np.int32)
        return y
    else:
        return np.log10(x + 1.01)


def minmax_0_to_1(x, reverse=False, minmax=1):
    if reverse:
        # x -> [0, 1]
        return x * minmax
        # minmax_x -> [0, 6]
    else:
        # norm_x -> [0, 6]
        return x / minmax
        # minmax_x -> [0, 1]


def reset_raw_from_norm(norm_x):
    return norm(
        minmax_0_to_1(
            minmax_0_to_1(norm_x, minmax=np.max(norm_x)), True, np.max(norm_x)), True)


def get_predict_and_true(output_data, simulated_csv_data_path, true_csv_data_path):
    a = pd.read_csv(simulated_csv_data_path)
    for i in range(2000):
        minmax = np.max(norm(a.iloc[:, i+1]))
        data = minmax_0_to_1(output_data[i][0], reverse=True, minmax=minmax)
        a.iloc[:, i+1] = norm(data, reverse=True)
    b = pd.read_csv(true_csv_data_path)
    return a, b


def calculate_pcc(arr1, arr2):
    PCC, _ = pearsonr(
        np.asarray(arr1).reshape(2000*5000),
        np.asarray(arr2).reshape(2000*5000))
    return PCC

In [32]:
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
num_epochs = 10
batch_size = 50
learning_rate = 1e-3
prefix = "my_norm"

## 数据加载器和模型定义

In [33]:
class SimulatedDataset(Dataset):
    '''
    每一个 Item 是 (1, 5000) 的向量
    transform 默认为归一化
    '''

    def __init__(self, simulated_csv_data_path, true_csv_data_path, transform=norm):
        self.simulated_csv_data = pd.read_csv(simulated_csv_data_path)
        self.true_csv_data_path = pd.read_csv(true_csv_data_path)
        self.transform = transform

    def __len__(self):
        return len(self.simulated_csv_data.columns) - 1

    def __getitem__(self, index):
        a_column_of_simulated_data = self.simulated_csv_data.iloc[:, index+1]
        a_column_of_true_data = self.true_csv_data_path.iloc[:, index+1]
        a_column_of_simulated_data = np.asarray(a_column_of_simulated_data).reshape(1, -1)  # (1, 5000)
        a_column_of_true_data = np.asarray(a_column_of_true_data).reshape(1, -1)

        if self.transform is not None:
            a_column_of_simulated_data = self.transform(a_column_of_simulated_data)
            a_column_of_true_data = self.transform(a_column_of_true_data)
        simulated_true_pack = (a_column_of_simulated_data, a_column_of_true_data)
        return simulated_true_pack

In [43]:
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(5000, 512),
            nn.ReLU(True),
            nn.Linear(512, 128),
#             nn.ReLU(True),
#             nn.Linear(128, 64),
            nn.ReLU(True))
        self.decoder = nn.Sequential(
            # nn.Linear(64, 128),
            # nn.ReLU(True),
            nn.Linear(128, 512),
            nn.ReLU(True),
            nn.Linear(512, 5000),
            nn.Sigmoid())

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

## 训练，测试，预测

In [63]:
def train_model(dataloader, model, criterion, optimizer, num_epochs=10, device="cpu", get_data_callback=None, vis_data_callback=None):
    for epoch in range(num_epochs):
        print('epoch [{}/{}]'.format(epoch + 1, num_epochs))
        prog = Progbar(len(dataloader))
        for i, data in enumerate(dataloader):
            (noisy_data, _) = data
            noisy_data = minmax_0_to_1(noisy_data, False, torch.max(noisy_data))
            noisy_data = Variable(noisy_data).float().to(device)
            # ===================forward=====================
            output = model(noisy_data)
            loss = criterion(output, noisy_data)
            mse = nn.MSELoss()(output, noisy_data).data
            np1 = output.cpu().detach().numpy().reshape(-1)
            np2 = noisy_data.cpu().detach().numpy().reshape(-1)
            PCC, p_value = pearsonr(np1, np2)
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # =====================log=======================
            prog.update(i + 1, [("loss", loss.item()), ("MSE_loss", mse), ("PCC", PCC), ("p-value", p_value)])
            if get_data_callback is not None:
                get_data_callback(epoch, num_epochs, i, len(dataloader), loss.item(), mse, PCC, p_value)
        if vis_data_callback is not None:
            vis_data_callback(epoch, num_epochs, len(dataloader))

In [97]:
def eval_model(dataloader, model, criterion, optimizer,
               simulated_csv_data_path, true_csv_data_path, 
               prefix="new"):
    for data in dataloader:
        (noisy_data, _) = data
        noisy_data = Variable(noisy_data).float().to(device)
        noisy_data = minmax_0_to_1(noisy_data, False, torch.max(noisy_data))
        # ===================forward=====================
        output = model(noisy_data)
        loss = criterion(output, noisy_data)
        mse = nn.MSELoss()(output, noisy_data).data
        output_data = output.data.numpy()

        predict_df, true_df = get_predict_and_true(output_data, simulated_csv_data_path, true_csv_data_path)
        pcc = calculate_pcc(predict_df.iloc[:, 1:], true_df.iloc[:, 1:])

        print("predict PCC:{:.4f} MSE:{:.8f}".format(pcc, mse))

#         filepath = "./data/"+prefix+"_predict_PCC_{:.4f}_MSE_{:.8f}_".format(pcc, mse)+simulated_csv_data_path[7:]
#         predict_df.to_csv(filepath, index=0)
        return loss.item(), mse, pcc
        # 只有一个 batch, 一次全拿出来了，不会有第二个

In [48]:
def predict(simulated_csv_data_path="./data/counts_simulated_dataset1_dropout0.05.csv",
            true_csv_data_path="./data/true_counts_simulated_dataset1_dropout0.05.csv",
            save_model_filename="./model_dropout0.05.pth", num_epochs=10):
    dataset = SimulatedDataset(simulated_csv_data_path, true_csv_data_path)
    dataloader = DataLoader(dataset, batch_size=50, shuffle=True, num_workers=3)
    model = AutoEncoder().to(device)
    MSE_loss = nn.MSELoss()
    BCE_Loss = nn.BCELoss()
    criterion = MSE_loss
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    if os.path.exists(save_model_filename):
        model.load_state_dict(torch.load(save_model_filename, "cpu"))
    else:
        model.train()
        train_model(dataloader, model, criterion, optimizer, num_epochs, device)
        torch.save(model.state_dict(), save_model_filename)

    model.eval()
    dataloader2 = DataLoader(dataset, batch_size=2000, shuffle=True, num_workers=3)
    eval_model(dataloader2, model, criterion, optimizer, simulated_csv_data_path, true_csv_data_path, prefix)

In [None]:
# predict(
#     "./data/counts_simulated_dataset1_dropout0.05.csv",
#     "./data/true_counts_simulated_dataset1_dropout0.05.csv",
#     "./"+prefix+"_model_dropout0.05.pth"
# )
# predict(
#     "./data/counts_simulated_dataset1_dropout0.10.csv",
#     "./data/true_counts_simulated_dataset1_dropout0.10.csv",
#     "./"+prefix+"_model_dropout0.10.pth"
# )
# predict(
#     "./data/counts_simulated_dataset1_dropout0.15.csv",
#     "./data/true_counts_simulated_dataset1_dropout0.15.csv",
#     "./"+prefix+"_model_dropout0.15.pth"
# )
# predict(
#     "./data/counts_simulated_dataset1_dropout0.20.csv",
#     "./data/true_counts_simulated_dataset1_dropout0.20.csv",
#     "./"+prefix+"_model_dropout0.20.pth"
# )
# predict(
#     "./data/counts_simulated_dataset1_dropout0.25.csv",
#     "./data/true_counts_simulated_dataset1_dropout0.25.csv",
#     "./"+prefix+"_model_dropout0.25.pth"
# )

## 可视化

### 可视化准备数据

In [73]:
from pyecharts import Line
from pyecharts import Bar


# bar.add("服装", ["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"], [5, 20, 36, 10, 75, 90])
# bar.print_echarts_options() # 该行只为了打印配置项，方便调试时使用
# bar    # 生成本地 HTML 文件

In [88]:
line1 = Line("PCC/MSE 折线图", "每个epoch都收集数据 x:i, y:PCC/MSE")
y_data_pcc_mse = []

line2 = Line("PCC 折线图", "每个epoch都收集数据 x:i, y:PCC")
y_data_pcc = []

line3 = Line("MSE 折线图", "每个epoch都收集数据 x:i, y:MSE")
y_data_mse = []

def get_data_when_train(epoch=0, max_epoch=10, i=0, max_i=10, loss=0, mse=0, PCC=0, p_value=0):
    y_data_pcc_mse.append(PCC/float(mse))
    y_data_pcc.append(PCC)
    y_data_mse.append(float(mse))
#     bar.add(str(epoch), range(max_i), [5, 20, 36, 10, 75, 90])
    
def vis_to_Line(epoch=0, max_epoch=10, max_i=10):
    line1.add(str(epoch), [i for i in range(max_i)], y_data_pcc_mse) # 传值
    y_data_pcc_mse.clear()
    
    line2.add(str(epoch), [i for i in range(max_i)], y_data_pcc)
    y_data_pcc.clear()
    
    line3.add(str(epoch), [i for i in range(max_i)], y_data_mse)
    y_data_mse.clear()

In [101]:
def vis(simulated_csv_data_path="./data/counts_simulated_dataset1_dropout0.05.csv",
            true_csv_data_path="./data/true_counts_simulated_dataset1_dropout0.05.csv",
            save_model_filename="./model_dropout0.05.pth", num_epochs=5):
    dataset = SimulatedDataset(simulated_csv_data_path, true_csv_data_path)
    dataloader = DataLoader(dataset, batch_size=50, shuffle=True, num_workers=3)
    dataloader2 = DataLoader(dataset, batch_size=2000, shuffle=True, num_workers=3)
    model = AutoEncoder().to(device)
    MSE_loss = nn.MSELoss()
    BCE_Loss = nn.BCELoss()
    criterion = MSE_loss
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    
    
    line1 = Line("PCC/MSE 折线图", "每个epoch都收集数据 x:i, y:PCC/MSE")
    y_data_pcc_mse = []

    line2 = Line("PCC 折线图", "每个epoch都收集数据 x:i, y:PCC")
    y_data_pcc = []

    line3 = Line("MSE 折线图", "每个epoch都收集数据 x:i, y:MSE")
    y_data_mse = []

    def get_data_when_train(epoch=0, max_epoch=10, i=0, max_i=10, loss=0, mse=0, PCC=0, p_value=0):
        y_data_pcc_mse.append(PCC/float(mse))
        y_data_pcc.append(PCC)
        y_data_mse.append(float(mse))

    def vis_to_Line(epoch=0, max_epoch=10, max_i=10):
        line1.add(str(epoch), [i for i in range(max_i)], y_data_pcc_mse)
        y_data_pcc_mse.clear()

        line2.add(str(epoch), [i for i in range(max_i)], y_data_pcc)
        y_data_pcc.clear()

        line3.add(str(epoch), [i for i in range(max_i)], y_data_mse)
        y_data_mse.clear()

    model.train()
    train_model(dataloader, model, criterion, optimizer,
                num_epochs, device, 
                get_data_when_train, vis_to_Line)
    
    
        
    model.eval()
    loss, mse, pcc = eval_model(dataloader2, model, criterion, optimizer,
                                simulated_csv_data_path, true_csv_data_path, prefix)
    
    return line1, line2, line3, loss, mse, pcc

In [102]:
all_lines = []
all_loss = []
all_mse = []
all_pcc = []

line1, line2, line3, loss, mse, pcc = vis(
    "./data/counts_simulated_dataset1_dropout0.05.csv",
    "./data/true_counts_simulated_dataset1_dropout0.05.csv",
    "./"+prefix+"_model_dropout0.05.pth"
)
all_lines.append([line1, line2, line3])
all_loss.append(loss)
all_mse.append(mse)
all_pcc.append(pcc)

line1, line2, line3, loss, mse, pcc = vis(
    "./data/counts_simulated_dataset1_dropout0.10.csv",
    "./data/true_counts_simulated_dataset1_dropout0.10.csv",
    "./"+prefix+"_model_dropout0.10.pth"
)
all_lines.append([line1, line2, line3])
all_loss.append(loss)
all_mse.append(mse)
all_pcc.append(pcc)

line1, line2, line3, loss, mse, pcc = vis(
    "./data/counts_simulated_dataset1_dropout0.15.csv",
    "./data/true_counts_simulated_dataset1_dropout0.15.csv",
    "./"+prefix+"_model_dropout0.15.pth"
)
all_lines.append([line1, line2, line3])
all_loss.append(loss)
all_mse.append(mse)
all_pcc.append(pcc)

line1, line2, line3, loss, mse, pcc = vis(
    "./data/counts_simulated_dataset1_dropout0.20.csv",
    "./data/true_counts_simulated_dataset1_dropout0.20.csv",
    "./"+prefix+"_model_dropout0.20.pth"
)
all_lines.append([line1, line2, line3])
all_loss.append(loss)
all_mse.append(mse)
all_pcc.append(pcc)

line1, line2, line3, loss, mse, pcc = vis(
    "./data/counts_simulated_dataset1_dropout0.25.csv",
    "./data/true_counts_simulated_dataset1_dropout0.25.csv",
    "./"+prefix+"_model_dropout0.25.pth"
)
all_lines.append([line1, line2, line3])
all_loss.append(loss)
all_mse.append(mse)
all_pcc.append(pcc)

epoch [1/5]
epoch [2/5]
epoch [3/5]
epoch [4/5]
epoch [5/5]
predict PCC:0.8105 MSE:0.01967106
epoch [1/5]
epoch [2/5]
epoch [3/5]
epoch [4/5]
epoch [5/5]
predict PCC:0.7174 MSE:0.01946006
epoch [1/5]
epoch [2/5]
epoch [3/5]
epoch [4/5]
epoch [5/5]
predict PCC:0.4444 MSE:0.02604193
epoch [1/5]
epoch [2/5]
epoch [3/5]
epoch [4/5]
epoch [5/5]
predict PCC:0.2891 MSE:0.01927744
epoch [1/5]
epoch [2/5]
epoch [3/5]
epoch [4/5]
epoch [5/5]
predict PCC:0.0363 MSE:0.02374469


### 预测可视化

In [None]:
line1

In [None]:
line2

In [None]:
line3

In [103]:
all_pcc

[0.8104506312770101,
 0.7173988353524272,
 0.4443956361335285,
 0.28910790708546996,
 0.036333064527141884]

In [104]:
all_mse

[tensor(0.0197),
 tensor(0.0195),
 tensor(0.0260),
 tensor(0.0193),
 tensor(0.0237)]

In [107]:
mse_dropout_line = Line("MSE Dropout 折线图")
mse_dropout_line.add("MSE", [0.05, 0.10, 0.15, 0.20, 0.25], [float(i) for i in all_mse])

In [111]:
pcc_dropout_line = Line("PCC Dropout 折线图")
pcc_dropout_line.add("PCC", [0.05, 0.10, 0.15, 0.20, 0.25], [float(i) for i in all_pcc])

### 训练可视化

#### Dropout 0.05

In [112]:
all_lines[0][0]

In [113]:
all_lines[0][1]

In [114]:
all_lines[0][2]

#### Dropout 0.10

In [121]:
all_lines[1][0]

In [122]:
all_lines[1][1]

In [123]:
all_lines[1][2]

#### Dropout 0.15

In [124]:
all_lines[2][0]

In [125]:
all_lines[2][1]

In [126]:
all_lines[2][2]

#### Dropout 0.20

In [127]:
all_lines[3][0]

In [128]:
all_lines[3][1]

In [129]:
all_lines[3][2]

#### Dropout 0.25

In [130]:
all_lines[4][0]

In [131]:
all_lines[4][1]

In [132]:
all_lines[4][2]