In [36]:
import torch
from denoising_diffusion_pytorch import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sdmetrics.reports.single_table import QualityReport

## Load real data and reshape it into a DataFrame

In [2]:
# Dataset constants. Only num_testsub is referenced by the code visible in
# this notebook; the remaining values are kept for reference.
num_testsub = 32  # number of subject files read by loadfiles_normalized()
num_classes = 4
num_channel = 40
num_datapoints = 8064
num_trials = 40
sampling_rate = 128 # 128Hz as given in the data

def loadfiles_normalized():
    """Load every subject's normalized array from disk into a dict.

    Reads one ``.npy`` file per subject, for subject numbers 1 through
    ``num_testsub``, and stores each loaded array under the key ``"subNN"``,
    where ``NN`` is the subject number zero-padded to two digits.

    Returns:
        dict: maps ``"sub01"``, ``"sub02"``, ... to the arrays returned by
        ``np.load``.
    """
    print("Loading files into data_dict .................")
    data_dict = {}
    for subject_number in range(1, num_testsub + 1):
        # Two-digit, zero-padded subject id: 1 -> "01", 12 -> "12".
        name = '%02d' % subject_number
        fname = 'data/data_prepared/data_norm_bhat/noramlized_datasub' + name + '.npy'
        data_dict["sub%s" % name] = np.load(fname)
    print("Loaded!!!!!")
    return data_dict

In [3]:
real_data = loadfiles_normalized()

Loading files into data_dict .................
Loaded!!!!!


In [4]:
real_data['sub01'].shape

(40, 40, 99)

In [5]:
# One entry per subject: collapse each subject's array into a single flat
# vector of 40*40*99 values (this will become one DataFrame row per subject).
value_list = [
    np.reshape(subject_array, 40 * 40 * 99)
    for subject_array in real_data.values()
]

In [6]:
real_data_df = pd.DataFrame(value_list)

In [7]:
# Column labels for the flattened frames, one per (v, c, f) index triple.
# f varies fastest, then c, then v.
new_header = [
    "v: " + str(v) + " c: " + str(c) + " f: " + str(f)
    for v in range(40)
    for c in range(40)
    for f in range(99)
]
len(new_header)

158400

In [8]:
real_data_df.columns = new_header

In [9]:
real_data_df

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,...,v: 39 c: 39 f: 89,v: 39 c: 39 f: 90,v: 39 c: 39 f: 91,v: 39 c: 39 f: 92,v: 39 c: 39 f: 93,v: 39 c: 39 f: 94,v: 39 c: 39 f: 95,v: 39 c: 39 f: 96,v: 39 c: 39 f: 97,v: 39 c: 39 f: 98
0,0.296331,0.515293,0.050096,0.621925,0.006347,0.004862,0.190134,0.243162,0.321699,0.366093,...,0.453795,0.894851,0.871873,0.7074849,1.0,0.350297,0.162679,0.401734,0.460576,0.300305
1,0.404434,0.18999,0.419752,0.59884,0.398338,0.211852,0.412063,0.105966,0.078021,0.030269,...,0.066972,0.558482,0.579885,0.08195843,0.879585,0.049699,0.009878,0.048077,0.799947,0.549632
2,0.828072,0.819458,0.54279,0.401994,0.314889,0.231296,0.590298,0.255228,0.659133,0.675434,...,0.121549,0.471396,0.502491,0.3466367,0.779081,0.423684,0.207962,0.446184,0.627415,0.334354
3,0.214855,0.385924,0.178468,0.717478,0.095788,0.022844,0.230425,0.658926,0.44702,0.640799,...,0.054549,0.902025,0.74119,1.0,0.96205,1.0,1.0,0.891247,0.688877,0.165492
4,0.266291,0.221836,0.674663,0.198803,0.87441,0.828868,0.869217,0.049139,0.537011,0.378504,...,0.101748,0.135571,0.193325,0.02465668,0.30646,0.250856,0.09413,0.253456,0.252769,0.464888
5,0.359303,0.477812,0.402078,0.87534,0.258409,0.170151,0.248626,0.6525,0.200815,0.54007,...,0.150267,0.660187,0.682345,0.5446782,0.71237,0.375002,0.19648,0.543417,0.586925,0.335171
6,0.376767,0.259169,0.170159,0.890803,0.211211,0.159174,0.129994,0.637034,0.078304,0.326829,...,0.03177,0.277559,0.28264,0.08722051,0.670414,0.165739,0.057048,0.179458,0.502468,0.328131
7,0.644092,0.331068,0.24193,0.747807,0.388606,0.230884,0.247166,0.655118,0.226427,0.94394,...,0.149307,0.353966,0.43383,0.0,0.69037,0.178668,0.047491,0.192399,0.319983,0.040583
8,0.374363,0.633659,0.123018,0.319657,0.293157,0.240697,0.406652,0.0,0.51711,0.503185,...,0.315218,0.722555,0.663177,0.6162926,0.999857,0.599101,0.376251,0.583051,0.33635,0.104397
9,0.617795,0.499037,0.205373,0.773152,0.00113,0.000708,0.118027,0.38854,0.777014,0.561976,...,0.148524,0.881979,0.878573,0.2936247,0.996479,0.021801,0.001267,0.026144,0.001705,0.417478


## Get Diffusion Model data

In [10]:
# Load the saved tensor of generated samples and convert it to a NumPy array.
# NOTE(review): torch.load relies on pickle-based deserialization; only load
# files that come from a trusted source.
psd_samples_path = "data/psd_427"
new_data = torch.load(psd_samples_path).numpy()
new_data.shape

(427, 3, 40, 104)

In [11]:
# Collect the first 1280 generated samples, keeping only the first 99 entries
# of the last axis, into an array of shape (1280, 40, 99).
#
# new_data is indexed as [i][j] -> (40, 104) sample. Collapsing the two
# leading axes with a C-order reshape enumerates samples in the order
# (i=0, j=0), (i=0, j=1), ..., i.e. exactly the order a nested i/j loop
# with a running counter would visit them.
samples = new_data.reshape(-1, *new_data.shape[2:])
n_filled = min(len(samples), 1280)

# Slots that cannot be filled (fewer than 1280 samples available) stay NaN so
# a later np.isnan(...).any() check can detect an incomplete fill.
X_argument = np.full((1280, 40, 99), np.nan)
X_argument[:n_filled] = samples[:n_filled, :, :99]

In [12]:
X_argument.shape

(1280, 40, 99)

In [13]:
np.isnan(X_argument).any()

False

In [14]:
# Group the 1280 generated samples into 32 consecutive chunks of 40 and
# flatten each chunk into one vector of 40*40*99 values.
list_of_generated = [
    np.reshape(X_argument[40 * group:40 * (group + 1), :, :], 40 * 40 * 99)
    for group in range(32)
]

In [15]:
fake_data_df = pd.DataFrame(list_of_generated)

In [16]:
fake_data_df.columns = new_header

In [17]:
fake_data_df

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,...,v: 39 c: 39 f: 89,v: 39 c: 39 f: 90,v: 39 c: 39 f: 91,v: 39 c: 39 f: 92,v: 39 c: 39 f: 93,v: 39 c: 39 f: 94,v: 39 c: 39 f: 95,v: 39 c: 39 f: 96,v: 39 c: 39 f: 97,v: 39 c: 39 f: 98
0,0.976643,0.993014,1.0,0.997484,1.0,0.987067,0.938026,0.677717,0.619775,0.982678,...,0.893511,0.414522,0.079002,1.0,0.126366,0.964194,0.970027,0.009656,0.982477,0.721222
1,0.001613,0.190766,0.477001,0.020785,0.557303,0.061821,0.0,0.238534,0.0,0.0,...,0.0,0.192452,0.069972,0.338241,0.971242,0.317579,0.314694,0.673617,0.001365,0.03177
2,0.397337,0.609739,0.693979,1.0,1.0,0.920468,1.0,0.931353,0.997428,1.0,...,0.767139,0.990858,0.648488,0.993892,0.923633,0.030495,0.940007,1.0,0.15722,0.413028
3,0.746036,0.0,0.617758,0.797527,0.896598,0.979618,0.85978,0.794991,0.988442,0.068678,...,0.864476,1.0,0.960162,1.0,0.461711,0.710442,0.862543,1.0,0.771428,0.249327
4,0.859731,0.081609,0.0,0.932499,0.959159,0.020763,0.051214,0.0,0.365595,0.345883,...,0.231651,0.019552,0.682779,0.209267,0.149815,0.0,0.0,0.863967,0.014278,0.352759
5,0.0,0.993174,1.0,0.997066,0.127932,0.473215,0.895248,0.455474,0.753693,0.449576,...,0.971273,0.847837,0.872194,0.21846,0.632193,0.23365,0.214556,0.274014,0.215202,0.440135
6,0.181472,1.0,1.0,1.0,1.0,0.849485,0.794019,1.0,0.567044,0.974418,...,0.619782,1.0,0.020894,0.0,0.98831,0.966576,0.108696,0.085896,0.77454,0.227413
7,0.416304,0.0,0.0,0.0,0.083621,0.0,0.069789,0.923192,0.196575,0.92767,...,0.612776,0.955384,0.0,0.50536,0.75204,0.306898,0.892223,0.010861,0.0,0.108682
8,0.091203,1.0,0.909189,0.535938,0.0,0.715631,0.521513,0.027555,0.059156,0.119631,...,0.140833,0.028165,0.479289,0.993594,0.977019,0.920485,0.99045,0.381852,0.933166,0.296738
9,0.227096,1.0,1.0,0.938236,0.942852,0.977719,0.292398,1.0,0.966863,0.136163,...,0.925441,0.994452,1.0,0.992732,0.425303,0.925966,0.954055,0.906457,0.804986,0.885404


In [18]:
real_data_df.equals(fake_data_df)

False

------------

In [19]:
# Load the ten sample files (suffixes 0-9) and join them along the first axis.
arrays = [
    torch.load("data/sampled_40-104_43_" + str(shard))
    for shard in range(10)
]
generated_data = np.concatenate(arrays, axis=0)

In [20]:
generated_data.shape

(430, 3, 40, 104)

In [21]:
# Collect the first 1280 generated samples, keeping only the first 99 entries
# of the last axis, into an array of shape (1280, 40, 99).
#
# generated_data is indexed as [i][j] -> (40, 104) sample. Collapsing the two
# leading axes with a C-order reshape enumerates samples in the order
# (i=0, j=0), (i=0, j=1), ..., i.e. exactly the order a nested i/j loop
# with a running counter would visit them.
samples2 = generated_data.reshape(-1, *generated_data.shape[2:])
n_filled2 = min(len(samples2), 1280)

# Slots that cannot be filled (fewer than 1280 samples available) stay NaN so
# an incomplete fill remains detectable with np.isnan.
X_argument2 = np.full((1280, 40, 99), np.nan)
X_argument2[:n_filled2] = samples2[:n_filled2, :, :99]

In [22]:
X_argument2.shape

(1280, 40, 99)

In [31]:
# Group the 1280 samples of X_argument2 into 32 consecutive chunks of 40 and
# flatten each chunk into one vector of 40*40*99 values.
list_of_generated = [
    np.reshape(X_argument2[40 * group:40 * (group + 1), :, :], 40 * 40 * 99)
    for group in range(32)
]

In [32]:
fake_data_df2 = pd.DataFrame(list_of_generated)
fake_data_df2.columns = new_header

In [33]:
fake_data_df2

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,...,v: 39 c: 39 f: 89,v: 39 c: 39 f: 90,v: 39 c: 39 f: 91,v: 39 c: 39 f: 92,v: 39 c: 39 f: 93,v: 39 c: 39 f: 94,v: 39 c: 39 f: 95,v: 39 c: 39 f: 96,v: 39 c: 39 f: 97,v: 39 c: 39 f: 98
0,0.622382,0.457843,0.26896,1.0,1.0,0.446993,1.0,0.911681,0.850652,1.0,...,0.930889,0.989919,0.213231,0.99127,0.47119,0.971964,0.547555,0.096531,1.0,0.058844
1,0.487144,0.001816,0.0,0.238453,0.115433,0.032708,0.064115,0.0,0.938576,0.0,...,0.001582,0.654127,0.616023,0.144119,0.008735,0.291191,0.974992,0.61838,0.0,0.147036
2,0.368904,0.345423,0.561265,0.980966,0.045494,0.41114,0.12523,0.818877,0.995227,0.126643,...,0.841424,0.966301,0.002273,0.864831,1.0,0.825442,0.029381,0.998315,0.539677,0.413593
3,1.0,0.980487,1.0,0.015152,0.991043,0.699763,0.0,0.114241,1.0,0.98666,...,0.199073,1.0,0.164827,0.791461,0.898058,0.648606,0.164694,0.925163,0.03736,0.025757
4,0.006004,0.990656,0.383899,0.017197,0.0,0.578482,0.980892,1.0,0.613245,0.084558,...,0.238323,1.0,0.579722,0.88204,0.0,0.0,0.0,0.007224,1.0,0.524067
5,0.232527,0.584095,0.981801,1.0,0.620302,0.300734,0.935965,0.882727,1.0,0.847748,...,0.993632,0.974096,0.154455,0.919331,0.99569,1.0,0.0,0.985674,0.0,0.46187
6,0.68387,0.158584,0.0,0.436096,0.807924,0.950968,0.912343,1.0,1.0,0.62481,...,0.937566,0.956111,0.988859,1.0,0.872794,0.909544,0.836191,0.566572,0.031331,0.83992
7,0.019441,0.0,0.137818,0.07497,0.748437,0.0,0.0,0.737156,0.377064,0.0,...,1.0,0.0,0.147468,0.0,0.565908,0.083889,0.444011,0.007231,0.325348,0.129607
8,0.854078,0.0,0.61214,0.007871,0.029439,0.577676,0.444307,0.98663,1.0,1.0,...,1.0,0.983591,0.984637,0.056106,0.984771,0.972072,1.0,0.040404,0.953094,1.0
9,0.997049,1.0,1.0,0.707863,0.999555,0.655886,1.0,1.0,1.0,0.786285,...,0.997549,1.0,0.033445,1.0,0.959769,0.84257,1.0,0.638864,0.992411,1.0


In [34]:
fake_data_df.equals(fake_data_df2)

False

-----------

### Quality Report

In [46]:
# Give every frame the same 'id' column holding 1..32, one value per row.
for frame in (real_data_df, fake_data_df, fake_data_df2):
    frame['id'] = list(range(1, 33))

In [60]:
# Build the per-column metadata entries as one text fragment. For every column
# label "v: <i> c: <j> f: <k>" the fragment contains
#     "<label>": { "type": "numerical", "subtype": "float" },
# All entries are generated lazily and joined once: appending to an
# ever-growing string 158,400 times can re-copy the accumulated text on each
# iteration, which makes the cell needlessly slow.
#
# NOTE(review): the result is only a fragment (no enclosing braces, and a
# trailing comma after the last entry). An earlier draft of this cell prefixed
# it with '{ "primary_key": "id", "fields": {' — presumably the fragment is
# wrapped like that before being parsed as JSON; confirm.
FIELD_SPEC = ''': {
            "type": "numerical",
            "subtype": "float"
            },'''
text = "".join(
    '"' + "v: " + str(i) + " c: " + str(j) + " f: " + str(k) + '"' + FIELD_SPEC
    for i in range(40)
    for j in range(40)
    for k in range(99)
)

In [61]:
# Save the generated field entries to a plain-text file.
# NOTE(review): a later cell reads 'my_document.json', which no visible cell
# writes — presumably this .txt output is edited by hand into that JSON file;
# confirm.
with open("my_document.txt", "w") as out_file:
    out_file.write(text)

In [73]:
import json

# Read the metadata dictionary from disk.
# NOTE(review): no visible cell creates 'my_document.json'; on a fresh
# checkout this cell fails unless that file is supplied separately — confirm
# how it is produced.
with open('my_document.json') as metadata_file:
    my_dict = json.load(metadata_file)

In [45]:
real_data_df

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,...,v: 39 c: 39 f: 90,v: 39 c: 39 f: 91,v: 39 c: 39 f: 92,v: 39 c: 39 f: 93,v: 39 c: 39 f: 94,v: 39 c: 39 f: 95,v: 39 c: 39 f: 96,v: 39 c: 39 f: 97,v: 39 c: 39 f: 98,id
0,0.296331,0.515293,0.050096,0.621925,0.006347,0.004862,0.190134,0.243162,0.321699,0.366093,...,0.894851,0.871873,0.7074849,1.0,0.350297,0.162679,0.401734,0.460576,0.300305,1
1,0.404434,0.18999,0.419752,0.59884,0.398338,0.211852,0.412063,0.105966,0.078021,0.030269,...,0.558482,0.579885,0.08195843,0.879585,0.049699,0.009878,0.048077,0.799947,0.549632,2
2,0.828072,0.819458,0.54279,0.401994,0.314889,0.231296,0.590298,0.255228,0.659133,0.675434,...,0.471396,0.502491,0.3466367,0.779081,0.423684,0.207962,0.446184,0.627415,0.334354,3
3,0.214855,0.385924,0.178468,0.717478,0.095788,0.022844,0.230425,0.658926,0.44702,0.640799,...,0.902025,0.74119,1.0,0.96205,1.0,1.0,0.891247,0.688877,0.165492,4
4,0.266291,0.221836,0.674663,0.198803,0.87441,0.828868,0.869217,0.049139,0.537011,0.378504,...,0.135571,0.193325,0.02465668,0.30646,0.250856,0.09413,0.253456,0.252769,0.464888,5
5,0.359303,0.477812,0.402078,0.87534,0.258409,0.170151,0.248626,0.6525,0.200815,0.54007,...,0.660187,0.682345,0.5446782,0.71237,0.375002,0.19648,0.543417,0.586925,0.335171,6
6,0.376767,0.259169,0.170159,0.890803,0.211211,0.159174,0.129994,0.637034,0.078304,0.326829,...,0.277559,0.28264,0.08722051,0.670414,0.165739,0.057048,0.179458,0.502468,0.328131,7
7,0.644092,0.331068,0.24193,0.747807,0.388606,0.230884,0.247166,0.655118,0.226427,0.94394,...,0.353966,0.43383,0.0,0.69037,0.178668,0.047491,0.192399,0.319983,0.040583,8
8,0.374363,0.633659,0.123018,0.319657,0.293157,0.240697,0.406652,0.0,0.51711,0.503185,...,0.722555,0.663177,0.6162926,0.999857,0.599101,0.376251,0.583051,0.33635,0.104397,9
9,0.617795,0.499037,0.205373,0.773152,0.00113,0.000708,0.118027,0.38854,0.777014,0.561976,...,0.881979,0.878573,0.2936247,0.996479,0.021801,0.001267,0.026144,0.001705,0.417478,10


In [None]:
# Generate an SDMetrics quality report comparing the real frame with the first
# synthetic frame (fake_data_df), using the metadata loaded into my_dict.
# NOTE(review): the recorded progress output stops at step 2 of 4 after
# roughly 5m45s — confirm the report completes with this many columns.
report = QualityReport()
report.generate(real_data_df, fake_data_df, my_dict)

Creating report:  50%|█████████████▌             | 2/4 [05:45<04:45, 142.55s/it]

In [None]:
report.get_details(property_name='Column Shapes')

In [None]:
report.get_visualization(property_name='Column Shapes')