In [1]:
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# objectA - objectC relation matrix.
# loadmat returns a dict keyed by MATLAB variable name (plus header metadata).
R = sio.loadmat('../../project/matrices/R1-3.mat')
R

{'R_matr': <50x40 sparse matrix of type '<class 'numpy.float64'>'
 	with 209 stored elements in Compressed Sparse Column format>,
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Fri Jan 25 19:42:45 2019',
 '__version__': '1.0'}

In [3]:
# Pull the relation matrix itself out of the loaded dict; it is a SciPy sparse matrix.
data = R['R_matr']
data

<50x40 sparse matrix of type '<class 'numpy.float64'>'
	with 209 stored elements in Compressed Sparse Column format>

In [4]:
def getNumpyDataFromMatFile(filename = '../../project/matrices/R1-3.mat', key = 'R_matr'):
    """Load a single variable from a MATLAB ``.mat`` file.

    Args:
        filename: Path of the ``.mat`` file to read.
        key: Name of the variable to extract. Defaults to ``'R_matr'``, the
            name that used to be hard-coded here.

    Returns:
        The object ``scipy.io.loadmat`` stored under ``key``. For the default
        file this is a 50x40 SciPy sparse (CSC) matrix rather than a dense
        NumPy array; call ``.todense()`` / ``.toarray()`` for a dense copy.

    Raises:
        KeyError: If the file contains no variable named ``key``.
    """
    return sio.loadmat(filename)[key]
# Smoke-check the helper with its default path.
getNumpyDataFromMatFile()

<50x40 sparse matrix of type '<class 'numpy.float64'>'
	with 209 stored elements in Compressed Sparse Column format>

In [5]:
from sklearn.metrics import matthews_corrcoef

In [6]:
# Dense copy of the sparse relation matrix.
matrix = data.todense()

In [7]:
# Print the dense matrix one row at a time; each row prints as a 1 x 40 matrix.
for i in matrix:
    print(i)

[[0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.       0.       0.912053 0.
  0.       0.       0.114144 0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.711648 0.       0.       0.      ]]
[[0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.       0.       0.       0.11607
  0.       0.       0.       0.       0.       0.       0.       0.
  0.90975  0.       0.199099 0.       0.       0.528818 0.       0.
  0.978274 0.806647 0.       0.       0.154204 0.984133 0.       0.      ]]
[[0.       0.       0.       0.       0.       0.       0.426262 0.
  0.151101 0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.       0.871504 0.       0.
  0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.     

In [48]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import Tensor

In [95]:
class LinearAutoEncoder(nn.Module):
    """Fully connected auto-encoder.

    The encoder maps ``input_size -> 32 -> 24 -> 16`` and the decoder maps
    ``16 -> 24 -> 32 -> output_size``. Every hidden layer is followed by an
    in-place ReLU; the final layer is followed by a sigmoid, so outputs lie
    in [0, 1].
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # Encoder layers are instantiated before decoder layers so the order
        # of parameter initialisation (and the state_dict layout) is unchanged.
        encoder_layers = [
            nn.Linear(input_size, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 24),
            nn.ReLU(inplace=True),
            nn.Linear(24, 16),
            nn.ReLU(inplace=True),
        ]
        decoder_layers = [
            nn.Linear(16, 24),
            nn.ReLU(inplace=True),
            nn.Linear(24, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, output_size),
            nn.Sigmoid(),
        ]

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the 16-dimensional code and decode it again."""
        code = self.encoder(x)
        return self.decoder(code)

In [16]:
from torch.utils.data import DataLoader, Dataset

In [44]:
class LinearDataset(Dataset):
    '''
    Dataset over the rows of a 2-D sparse matrix.

    Each item is a (1, input_size) row vector whose elements are expected to
    lie in [0, 1].
    '''

    def __init__(self, data, transform=None):
        # `data` must provide .todense() (e.g. a SciPy sparse matrix); the
        # dense copy is stored so individual rows can be indexed directly.
        self.matrix = data.todense()
        # Optional callable applied to each row (as an ndarray) in __getitem__.
        self.transform = transform

    def __len__(self):
        # Number of rows in the matrix.
        return len(self.matrix)

    def __getitem__(self, index):
        # Read from this dataset's own matrix. The previous code indexed a
        # notebook-level global named `matrix`, which ignored the data passed
        # to the constructor and failed on a fresh kernel.
        it = self.matrix[index]

        if self.transform is not None:
            it = self.transform(np.asarray(it))

        return it

In [14]:
# (rows, columns) of the sparse relation matrix.
data.shape

(50, 40)

In [34]:
from Progbar import Progbar
from scipy.stats.stats import pearsonr
import os

In [98]:
def calculate_pcc_mse(output, noisy_data, MSE_loss):
    """Score a reconstruction against its target.

    Args:
        output: Tensor produced by the model.
        noisy_data: Tensor the model was asked to reconstruct.
        MSE_loss: Callable loss (e.g. ``nn.MSELoss()``) applied to the pair.

    Returns:
        ``(PCC, mse)``: the Pearson correlation coefficient between the two
        flattened tensors, and the loss value as a tensor detached via ``.data``.
    """
    # Flatten both tensors to 1-D NumPy arrays on the CPU for SciPy.
    flat_output = output.cpu().detach().numpy().reshape(-1)
    flat_target = noisy_data.cpu().detach().numpy().reshape(-1)
    PCC, _pvalue = pearsonr(flat_output, flat_target)

    mse = MSE_loss(output, noisy_data).data
    return PCC, mse
def predict(matrix, device="cpu", num_epochs=20):
    """Train (or load) a LinearAutoEncoder on ``matrix`` and reconstruct every row.

    If ``./model.pth`` exists its weights are loaded and training is skipped;
    the cached file is used as-is, so it must have been produced for a matrix
    with the same number of columns. Otherwise the model is trained for
    ``num_epochs`` epochs, one row per step, and the weights are saved to
    ``./model.pth``.

    Args:
        matrix: 2-D sparse matrix; it is wrapped in ``LinearDataset``, which
            calls ``.todense()`` on it.
        device: Torch device on which the model and the batches are placed.
        num_epochs: Number of training epochs when no cached model exists.

    Returns:
        ``(output, loss)``: the reconstruction of all rows in a single batch,
        in the same row order as ``matrix``, and the MSE between that
        reconstruction and the input.
    """
    shape = matrix.shape
    dataset = LinearDataset(matrix, Tensor)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=3)
    model = LinearAutoEncoder(shape[1], shape[1]).to(device)
    MSE_loss = nn.MSELoss()
    criterion = MSE_loss
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

    # Training
    if os.path.exists("./model.pth"):
        model.load_state_dict(torch.load("./model.pth", map_location=device))
    else:
        model.train()
        for epoch in range(num_epochs):
            print('epoch [{}/{}]'.format(epoch + 1, num_epochs))
            prog = Progbar(len(dataloader))
            for i, data in enumerate(dataloader):
                # The batch has to live on the same device as the model.
                noisy_data = data.to(device)
                # ===================forward=====================
                output = model(noisy_data)
                loss = criterion(output, noisy_data)
                pcc, mse = calculate_pcc_mse(output, noisy_data, MSE_loss)
                # ===================backward====================
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # =====================log=======================
                prog.update(i + 1, [("loss", loss.item()), ("MSE", mse), ("PCC", pcc)])
        torch.save(model.state_dict(), "./model.pth")

    # Prediction / evaluation
    model.eval()
    # shuffle=False keeps output row k aligned with input row k; batch_size
    # equal to the row count means the whole matrix arrives as one batch.
    dataloader2 = DataLoader(dataset, batch_size=shape[0], shuffle=False, num_workers=3)
    with torch.no_grad():  # inference only, no autograd graph needed
        noisy_data = next(iter(dataloader2)).to(device)
        # ===================forward=====================
        output = model(noisy_data)
        loss = criterion(output, noisy_data)
    return output, loss


In [109]:
# Train (or load cached weights for) the auto-encoder on the sparse matrix and
# write the reconstruction to CSV. save_output is defined in a later cell of
# this notebook, so that cell has to be executed before this one.
output, loss = predict(data)
save_output(output, loss)

In [59]:
# Shape of the reconstruction returned by predict.
output.size()

torch.Size([50, 1, 40])

In [107]:
def save_output(output, loss):
    """Write a reconstruction to ``./output_<loss>.csv``.

    Args:
        output: Tensor indexed as ``output[row][0][col]`` (e.g. 50 x 1 x 40).
        loss: Scalar tensor; ``loss.item()`` becomes part of the file name.

    Values below 0.1 are written as integer 0; all other values are written
    unchanged.
    """
    def _suppress_small(value):
        # Anything under the 0.1 cut-off is treated as "no relation".
        return 0 if value < 0.1 else value

    # One list per sample; sample[0] is that sample's single row of values.
    rows = [
        [_suppress_small(cell.item()) for cell in sample[0].data]
        for sample in output.data
    ]

    csv_path = "./output_" + str(loss.item()) + ".csv"
    pd.DataFrame(rows).to_csv(csv_path)

In [110]:
def save_matrix(matrix, path):
    """Write a 2-D matrix to ``path`` as CSV via a pandas DataFrame.

    Args:
        matrix: 2-D array-like (e.g. ``numpy.matrix`` or ``ndarray``).
        path: Destination CSV path.
    """
    # Convert to a plain ndarray first so each row iterates element by element.
    rows = [list(row) for row in np.asarray(matrix)]
    pd.DataFrame(rows).to_csv(path)

In [112]:
def save_mat(path = '../../project/matrices/R1-3.mat', output_path = "./R.csv", key = "R_matr"):
    """Export one sparse matrix from a MATLAB ``.mat`` file to CSV.

    Args:
        path: ``.mat`` file to read.
        output_path: Destination CSV path.
        key: Name of the variable to export. Defaults to ``"R_matr"``, the
            variable the default file contains; the previous hard-coded
            ``"R_attr"`` does not exist in that file and raised ``KeyError``.

    Raises:
        KeyError: If the file contains no variable named ``key``.
    """
    # The stored value is sparse; densify it before handing it to save_matrix.
    Rmatrix = sio.loadmat(path)[key].todense()
    save_matrix(Rmatrix, output_path)

In [None]:
# Export the relation matrix to CSV at save_mat's default output path ("./R.csv").
save_mat('../../project/matrices/R1-3.mat')