In [4]:
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
# objectA - objectC 关系矩阵
R = sio.loadmat('../../project/main/matrices/R1-3.mat')
R

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Fri Jan 25 19:42:45 2019',
 '__version__': '1.0',
 '__globals__': [],
 'R_matr': <50x40 sparse matrix of type '<class 'numpy.float64'>'
 	with 209 stored elements in Compressed Sparse Column format>}

In [6]:
data = R['R_matr']
data

<50x40 sparse matrix of type '<class 'numpy.float64'>'
	with 209 stored elements in Compressed Sparse Column format>

In [7]:
def getNumpyDataFromMatFile(filename = '../../project/main/matrices/R1-3.mat'):
    return sio.loadmat(filename)['R_matr']
getNumpyDataFromMatFile()

<50x40 sparse matrix of type '<class 'numpy.float64'>'
	with 209 stored elements in Compressed Sparse Column format>

In [8]:
from sklearn.metrics import matthews_corrcoef

In [9]:
matrix = data.todense()

In [10]:
for i in matrix:
    print(i)

[[0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.       0.       0.912053 0.
  0.       0.       0.114144 0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.711648 0.       0.       0.      ]]
[[0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.       0.       0.       0.11607
  0.       0.       0.       0.       0.       0.       0.       0.
  0.90975  0.       0.199099 0.       0.       0.528818 0.       0.
  0.978274 0.806647 0.       0.       0.154204 0.984133 0.       0.      ]]
[[0.       0.       0.       0.       0.       0.       0.426262 0.
  0.151101 0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.       0.       0.       0.871504 0.       0.
  0.       0.       0.       0.       0.       0.       0.       0.
  0.       0.       0.     

# AE

In [11]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import Tensor

In [12]:
class LinearAutoEncoder(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearAutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 32),
            nn.ReLU(True),
            nn.Linear(32, 24),
            nn.ReLU(True),
            nn.Linear(24, 16),
            nn.ReLU(True))
        self.decoder = nn.Sequential(
            nn.Linear(16, 24),
            nn.ReLU(True),
            nn.Linear(24, 32),
            nn.ReLU(True),
            nn.Linear(32, output_size),
            nn.Sigmoid())

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [13]:
from torch.utils.data import DataLoader, Dataset

In [14]:
class LinearDataset(Dataset):
    '''
    每一个 Item 是 (1, input_size) 的向量，元素范围[0, 1]
    '''

    def __init__(self, data, transform=None):
        self.matrix = data.todense()
        self.transform = transform

    def __len__(self):
        return len(self.matrix)

    def __getitem__(self, index):
        it = matrix[index]

        if self.transform is not None:
            it = self.transform(np.asarray(it))
            
        return it

In [15]:
data.shape

(50, 40)

In [16]:
from Progbar import Progbar
from scipy.stats.stats import pearsonr
import os

In [17]:
def calculate_pcc_mse(output, noisy_data, MSE_loss):
    mse = MSE_loss(output, noisy_data).data
    np1 = output.cpu().detach().numpy().reshape(-1)
    np2 = noisy_data.cpu().detach().numpy().reshape(-1)
    PCC, _ = pearsonr(np1, np2)

    return PCC, mse
def predict(matrix, device="cpu", num_epochs=20):
    shape = matrix.shape
    dataset = LinearDataset(matrix, Tensor)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=3)
    model = LinearAutoEncoder(shape[1], shape[1]).to(device)
    MSE_loss = nn.MSELoss()
    BCE_Loss = nn.BCELoss()
    criterion = MSE_loss
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

    # 训练
    if os.path.exists("./model.pth"):
        model.load_state_dict(torch.load("./model.pth", "cpu"))
    else:
        model.train()
        for epoch in range(num_epochs):
            print('epoch [{}/{}]'.format(epoch + 1, num_epochs))
            prog = Progbar(len(dataloader))
            for i, data in enumerate(dataloader):
                noisy_data = data
                # ===================forward=====================
                output = model(noisy_data)
                loss = criterion(output, noisy_data)
                pcc, mse = calculate_pcc_mse(output, noisy_data, MSE_loss)
                # ===================backward====================
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # =====================log=======================
                prog.update(i + 1, [("loss", loss.item()), ("MSE", mse), ("PCC", pcc)])
        torch.save(model.state_dict(), "./model.pth")

    # 预测、评价
    model.eval()
    dataloader2 = DataLoader(dataset, batch_size=shape[0], shuffle=True, num_workers=3)
    for data in dataloader2:
        noisy_data = data
        # ===================forward=====================
        output = model(noisy_data)
        loss = criterion(output, noisy_data)
        # =====================log and save==============
        return output, loss
        break  # 只有一个 batch, 一次全拿出来了，不会有第二个


In [18]:
def save_output(output, loss):
    output_data_to_save = []
    for i in output.data:
        it_list = []
        for j in i[0].data:
            it = j.item()
            if it < 0.1:
                it_list.append(int(0))
            else:
                it_list.append(it)
        output_data_to_save.append(it_list)
    # output_data_to_save 50x40

    df_output = pd.DataFrame(output_data_to_save)
    df_output.to_csv("./output_"+str(loss.item())+".csv")

In [19]:
output, loss = predict(data)
save_output(output, loss)

In [20]:
output.size()

torch.Size([50, 1, 40])

In [21]:
loss

tensor(0.0287, grad_fn=<MseLossBackward>)

## 看他重构的矩阵长什么样

他的是压缩成csv，需要把0补上去

In [22]:
def load_R(matPath):
    matrix = sio.loadmat(matPath)["R_matr"].todense()
    matrix_data = []
    for i in np.asarray(matrix):
        matrix_data.append([j for j in i])
    return matrix_data

def save_matrix(matrix, path):
    matrix_data = []
    for i in np.asarray(matrix):
        matrix_data.append([j for j in i])
    df_matrix = pd.DataFrame(matrix_data)
    df_matrix.to_csv(path)

In [23]:
def save_mat(path = '../../project/main/matrices/R1-3.mat', output_path = "./R.csv"):
    Rmatrix = sio.loadmat(path)["R_matr"].todense()
    save_matrix(Rmatrix, output_path)

In [24]:
save_mat('../../project/main/matrices/R1-3.mat')

In [25]:
def load_Object(matPath='../../project/main/objects/objectA.mat'):
    matrix = sio.loadmat(matPath)["vett"]
    new_matrix = []
    for i in matrix:
        new_matrix.append(i[0][0])
    return new_matrix

def save_Object_mat_to_csv(matPath='../../project/main/objects/objectA.mat', output_path="./objectA.csv"):
    new_matrix = load_Object(matPath)
    df_matrix = pd.DataFrame(new_matrix)
    df_matrix.to_csv(output_path)

save_Object_mat_to_csv(matPath='../../project/main/objects/objectA.mat', output_path="./objectA.csv")
save_Object_mat_to_csv(matPath='../../project/main/objects/objectB.mat', output_path="./objectB.csv")
save_Object_mat_to_csv(matPath='../../project/main/objects/objectC.mat', output_path="./objectC.csv")

In [26]:
matrix = sio.loadmat('../../project/main/objects/objectA.mat')["vett"]

In [27]:
new_matrix = []
for i in matrix:
    new_matrix.append(i[0][0])
new_matrix

['0k2cq',
 '0tsou',
 '2zt0y',
 '306fp',
 '36mab',
 '4718r',
 '4snwv',
 '4x99y',
 '6062q',
 '6xiit',
 '75581',
 '7nrit',
 '7tb58',
 '95saa',
 '97vve',
 '9fwxw',
 '9h24q',
 '9n6v4',
 '9njqe',
 'a9qsd',
 'aewa7',
 'b4bxs',
 'brhrx',
 'd2tzi',
 'edf28',
 'h2aeg',
 'hktsr',
 'i5rpt',
 'ickpq',
 'ijhu4',
 'm2k0x',
 'npuga',
 'o1ojm',
 'o6p8o',
 'o7hgm',
 'otwp4',
 'rcql6',
 'rpwiz',
 't0nen',
 't2ukq',
 't3l4h',
 'tx96a',
 'u0fbt',
 'u5ghe',
 'wcnoy',
 'wpok6',
 'wvk2n',
 'xqhhy',
 'y2i17',
 'ydda7']

In [28]:
def R_csv_to_complete_csv(path):
    Rcsv = pd.read_csv(path)


In [29]:
def to_complete_csv(path = '../../project/main/output_s/new_found_relations.csv', output_path="./new_found_relations.csv"):
    pd_idx = load_Object(matPath='../../project/main/objects/objectA.mat')
    pd_col = load_Object(matPath='../../project/main/objects/objectC.mat')
    pd_complete = pd.DataFrame(data=[[0.0]*50]*50, index = pd_idx, columns=pd_idx)
    with open(path, "r") as F:
        for i in F.readlines():
            ap = i.split(',')
            key_x = ap[0]
            key_y = ap[1]
            value = float(ap[2])
            pd_complete[key_x][key_y] = value
    pd_complete.to_csv(output_path)
    return pd_complete
pd_complete = to_complete_csv()
pd_complete

Unnamed: 0,0k2cq,0tsou,2zt0y,306fp,36mab,4718r,4snwv,4x99y,6062q,6xiit,...,t3l4h,tx96a,u0fbt,u5ghe,wcnoy,wpok6,wvk2n,xqhhy,y2i17,ydda7
0k2cq,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0tsou,0.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2zt0y,0.0,0.4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
306fp,0.2,0.4,0.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36mab,0.2,0.4,0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4718r,0.6,0.2,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4snwv,0.5,0.2,0.0,0.4,0.1,0.1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4x99y,0.0,0.3,0.3,0.3,0.3,0.1,0.3,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6062q,0.5,0.5,0.0,0.2,0.3,0.7,0.2,0.1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6xiit,0.1,0.3,0.2,0.0,0.2,0.4,0.3,0.4,0.5,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
data = []
npdata = pd_complete.to_numpy()
for i in range(len(npdata)):
    for j in range(len(npdata[i])):
        data.append([pd_complete.index[i], pd_complete.columns[j], 10*npdata[i][j]])

In [31]:
data

[['0k2cq', '0k2cq', 10.0],
 ['0k2cq', '0tsou', 0.0],
 ['0k2cq', '2zt0y', 0.0],
 ['0k2cq', '306fp', 0.0],
 ['0k2cq', '36mab', 0.0],
 ['0k2cq', '4718r', 0.0],
 ['0k2cq', '4snwv', 0.0],
 ['0k2cq', '4x99y', 0.0],
 ['0k2cq', '6062q', 0.0],
 ['0k2cq', '6xiit', 0.0],
 ['0k2cq', '75581', 0.0],
 ['0k2cq', '7nrit', 0.0],
 ['0k2cq', '7tb58', 0.0],
 ['0k2cq', '95saa', 0.0],
 ['0k2cq', '97vve', 0.0],
 ['0k2cq', '9fwxw', 0.0],
 ['0k2cq', '9h24q', 0.0],
 ['0k2cq', '9n6v4', 0.0],
 ['0k2cq', '9njqe', 0.0],
 ['0k2cq', 'a9qsd', 0.0],
 ['0k2cq', 'aewa7', 0.0],
 ['0k2cq', 'b4bxs', 0.0],
 ['0k2cq', 'brhrx', 0.0],
 ['0k2cq', 'd2tzi', 0.0],
 ['0k2cq', 'edf28', 0.0],
 ['0k2cq', 'h2aeg', 0.0],
 ['0k2cq', 'hktsr', 0.0],
 ['0k2cq', 'i5rpt', 0.0],
 ['0k2cq', 'ickpq', 0.0],
 ['0k2cq', 'ijhu4', 0.0],
 ['0k2cq', 'm2k0x', 0.0],
 ['0k2cq', 'npuga', 0.0],
 ['0k2cq', 'o1ojm', 0.0],
 ['0k2cq', 'o6p8o', 0.0],
 ['0k2cq', 'o7hgm', 0.0],
 ['0k2cq', 'otwp4', 0.0],
 ['0k2cq', 'rcql6', 0.0],
 ['0k2cq', 'rpwiz', 0.0],
 ['0k2cq', 

In [32]:
from pyecharts import options as opts
from pyecharts.charts import Bar3D

def bar3d_base() -> Bar3D:
    #data = [(i, j, random.randint(0, 12)) for i in range(6) for j in range(24)]
    c = (
        Bar3D()
        .add(
            "",
            data,
            xaxis3d_opts=opts.Axis3DOpts(type_="category"),
            yaxis3d_opts=opts.Axis3DOpts(type_="category"),
            zaxis3d_opts=opts.Axis3DOpts(type_="value"),
        )
        .set_global_opts(
            visualmap_opts=opts.VisualMapOpts(max_=10),
            title_opts=opts.TitleOpts(title="Bar3D-基本示例"),
        )
    )
    return c
c = bar3d_base()
c.render_notebook()

In [33]:
path = '../../project/main/output_s/new_found_relations.csv'
output_path="./new_found_relations.csv"
pd_idx = load_Object(matPath='../../project/main/objects/objectA.mat')
pd_col = load_Object(matPath='../../project/main/objects/objectA.mat')
pd_complete = pd.DataFrame(data=[[0]*50]*50, index = pd_idx, columns=pd_col)
with open(path, "r") as F:
    for i in F.readlines():
        ap = i.split(',')
        
        key_x = ap[0]
        key_y = ap[1]
        value = float(ap[2])
        print(value)
        pd_complete[key_x][key_y] = value
pd_complete

1.0
0.1
0.2
0.2
0.6
0.5
0.5
0.1
0.1
0.1
0.2
0.5
0.1
0.1
0.3
0.3
0.2
0.8
0.3
0.2
0.1
0.3
0.4
0.2
0.4
0.1
0.3
0.1
0.1
0.3
0.1
0.3
0.3
0.1
0.2
0.6
0.3
1.0
0.4
0.4
0.4
0.2
0.2
0.3
0.5
0.3
0.4
0.2
0.3
0.5
0.5
0.2
0.2
0.1
0.1
0.2
0.3
0.4
0.3
0.3
0.2
0.2
0.3
0.2
0.5
0.2
0.4
0.3
0.3
0.2
0.1
0.4
0.4
0.1
0.3
0.3
0.4
0.3
0.4
0.1
0.1
0.4
1.0
0.2
0.3
0.3
0.2
0.3
0.5
0.1
0.2
0.5
0.4
0.7
0.4
0.2
0.2
0.2
0.1
0.3
0.6
0.4
0.1
0.1
0.3
0.3
0.4
0.4
0.2
0.3
0.6
0.3
0.3
0.3
0.3
0.1
0.2
0.6
0.5
0.3
0.3
0.1
0.2
1.0
0.4
0.3
0.2
0.3
0.2
0.2
0.2
0.3
0.1
0.2
0.3
0.2
0.1
0.1
0.1
0.2
0.3
0.3
0.2
0.3
0.2
0.2
0.2
0.5
0.3
0.2
0.4
0.2
0.1
0.3
0.3
0.1
0.2
0.4
0.3
0.1
0.1
0.4
0.2
0.3
0.4
1.0
0.5
0.1
0.3
0.3
0.2
0.3
0.3
0.3
0.4
0.4
0.1
0.2
0.5
0.2
0.6
0.1
0.3
0.1
0.1
0.3
0.1
0.4
0.1
0.1
0.3
0.4
0.2
0.4
0.5
0.3
0.1
0.6
0.3
0.3
0.1
0.2
0.2
0.6
0.2
0.2
0.5
0.4
0.4
0.3
1.0
0.1
0.1
0.7
0.4
0.2
0.2
0.4
0.3
0.1
0.1
0.2
0.4
0.3
0.1
0.5
0.4
0.2
0.2
0.2
0.3
0.1
0.1
0.4
0.1
0.1
0.2
0.2
0.5
0.2
0.1
0.2
0.3
0.1
0.3
0.6
0.3
0.1
0.6
0.3


Unnamed: 0,0k2cq,0tsou,2zt0y,306fp,36mab,4718r,4snwv,4x99y,6062q,6xiit,...,t3l4h,tx96a,u0fbt,u5ghe,wcnoy,wpok6,wvk2n,xqhhy,y2i17,ydda7
0k2cq,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0tsou,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2zt0y,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
306fp,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36mab,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4718r,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4snwv,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4x99y,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6062q,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6xiit,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [34]:
pd_idx = load_Object(matPath='../../project/main/objects/objectA.mat')
pd_col = load_Object(matPath='../../project/main/objects/objectC.mat')
pd_complete = pd.DataFrame(data=[[0]*40]*50, index = pd_idx, columns=pd_col)
pd_complete

Unnamed: 0,3011c,35dzg,3dwuj,3uj2g,4tyje,5p48n,68sz2,6x9fs,8u8bg,9beiu,...,skwpt,sxmxm,tkb4v,umlay,v1mgu,v6ssj,w2pbs,w7uy9,wfqyk,y0sxb
0k2cq,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0tsou,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2zt0y,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
306fp,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36mab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4718r,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4snwv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4x99y,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6062q,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6xiit,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
pd_complete['3011c']['xqhhy'] = 999
pd_complete

Unnamed: 0,3011c,35dzg,3dwuj,3uj2g,4tyje,5p48n,68sz2,6x9fs,8u8bg,9beiu,...,skwpt,sxmxm,tkb4v,umlay,v1mgu,v6ssj,w2pbs,w7uy9,wfqyk,y0sxb
0k2cq,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0tsou,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2zt0y,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
306fp,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36mab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4718r,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4snwv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4x99y,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6062q,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6xiit,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
pd_complete.columns

Index(['3011c', '35dzg', '3dwuj', '3uj2g', '4tyje', '5p48n', '68sz2', '6x9fs',
       '8u8bg', '9beiu', 'a0dw2', 'a7ax2', 'ak369', 'b35hv', 'b512p', 'cfic4',
       'dvzui', 'dyfta', 'e3nin', 'enze2', 'fwtg1', 'g0hd6', 'g0l04', 'g1ll0',
       'hdwab', 'hea7z', 'hgl4s', 'jaoa4', 'pixsv', 'rpl9j', 'skwpt', 'sxmxm',
       'tkb4v', 'umlay', 'v1mgu', 'v6ssj', 'w2pbs', 'w7uy9', 'wfqyk', 'y0sxb'],
      dtype='object')

In [37]:
float('0.1')

0.1