In [1]:
import torch
from denoising_diffusion_pytorch import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sdmetrics.reports.single_table import QualityReport

## Load the real data and reshape it into a DataFrame

In [2]:
# Dataset dimensions. Only `num_testsub` is read by the code in this cell;
# the remaining constants are kept for reference.
num_testsub = 32
num_classes = 4
num_channel = 40
num_datapoints = 8064
num_trials = 40
sampling_rate = 128 # 128Hz as given in the data

def loadfiles_normalized(data_dir='data/data_prepared/data_norm_bhat', num_subjects=None):
    """Load the normalized per-subject ``.npy`` arrays into a dict.

    Parameters
    ----------
    data_dir : str
        Directory containing the ``noramlized_datasubXX.npy`` files. The
        spelling "noramlized" matches the file names on disk and must not
        be "corrected" here.
    num_subjects : int or None
        How many subjects to load, numbered from 1. ``None`` (the default)
        falls back to the module-level ``num_testsub``.

    Returns
    -------
    dict
        Maps ``"subXX"`` (subject number, zero-padded to two digits) to
        whatever ``np.load`` returns for that subject's file.
    """
    if num_subjects is None:
        num_subjects = num_testsub

    data_dict = {}
    print("Loading files into data_dict .................")
    for subject_number in range(1, num_subjects + 1):
        # Two-digit, zero-padded id: 1 -> "01", 10 -> "10".
        name = f"{subject_number:02d}"
        fname = f"{data_dir}/noramlized_datasub{name}.npy"
        data_dict[f"sub{name}"] = np.load(fname)
    print("Loaded!!!!!")
    return data_dict

In [3]:
# Load every subject's normalized array into a dict keyed "sub01", "sub02", ...
real_data = loadfiles_normalized()

Loading files into data_dict .................
Loaded!!!!!


In [4]:
# Sanity check: shape of a single subject's array.
real_data['sub01'].shape

(40, 40, 99)

In [5]:
# Flatten each subject's array into one 1-D vector of 40*40*99 values,
# keeping the dict's iteration order.
value_list = [
    np.reshape(subject_array, (40*40*99))
    for subject_array in real_data.values()
]

In [6]:
# One row per subject, one column per flattened value.
real_data_df = pd.DataFrame(value_list)

In [7]:
# Column labels "v: <i> c: <j> f: <k>" for a (40, 40, 99) array flattened in
# row-major order: the last index (f) varies fastest, the first (v) slowest.
new_header = [
    "v: " + str(v_idx) + " c: " + str(c_idx) + " f: " + str(f_idx)
    for v_idx in range(40)
    for c_idx in range(40)
    for f_idx in range(99)
]
len(new_header)

158400

In [8]:
# Replace the default integer column labels with the descriptive ones.
real_data_df.columns = new_header

In [9]:
# Preview the real-data frame (the saved output is truncated to the first and last columns).
real_data_df

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,...,v: 39 c: 39 f: 89,v: 39 c: 39 f: 90,v: 39 c: 39 f: 91,v: 39 c: 39 f: 92,v: 39 c: 39 f: 93,v: 39 c: 39 f: 94,v: 39 c: 39 f: 95,v: 39 c: 39 f: 96,v: 39 c: 39 f: 97,v: 39 c: 39 f: 98
0,0.296331,0.515293,0.050096,0.621925,0.006347,0.004862,0.190134,0.243162,0.321699,0.366093,...,0.453795,0.894851,0.871873,0.7074849,1.0,0.350297,0.162679,0.401734,0.460576,0.300305
1,0.404434,0.18999,0.419752,0.59884,0.398338,0.211852,0.412063,0.105966,0.078021,0.030269,...,0.066972,0.558482,0.579885,0.08195843,0.879585,0.049699,0.009878,0.048077,0.799947,0.549632
2,0.828072,0.819458,0.54279,0.401994,0.314889,0.231296,0.590298,0.255228,0.659133,0.675434,...,0.121549,0.471396,0.502491,0.3466367,0.779081,0.423684,0.207962,0.446184,0.627415,0.334354
3,0.214855,0.385924,0.178468,0.717478,0.095788,0.022844,0.230425,0.658926,0.44702,0.640799,...,0.054549,0.902025,0.74119,1.0,0.96205,1.0,1.0,0.891247,0.688877,0.165492
4,0.266291,0.221836,0.674663,0.198803,0.87441,0.828868,0.869217,0.049139,0.537011,0.378504,...,0.101748,0.135571,0.193325,0.02465668,0.30646,0.250856,0.09413,0.253456,0.252769,0.464888
5,0.359303,0.477812,0.402078,0.87534,0.258409,0.170151,0.248626,0.6525,0.200815,0.54007,...,0.150267,0.660187,0.682345,0.5446782,0.71237,0.375002,0.19648,0.543417,0.586925,0.335171
6,0.376767,0.259169,0.170159,0.890803,0.211211,0.159174,0.129994,0.637034,0.078304,0.326829,...,0.03177,0.277559,0.28264,0.08722051,0.670414,0.165739,0.057048,0.179458,0.502468,0.328131
7,0.644092,0.331068,0.24193,0.747807,0.388606,0.230884,0.247166,0.655118,0.226427,0.94394,...,0.149307,0.353966,0.43383,0.0,0.69037,0.178668,0.047491,0.192399,0.319983,0.040583
8,0.374363,0.633659,0.123018,0.319657,0.293157,0.240697,0.406652,0.0,0.51711,0.503185,...,0.315218,0.722555,0.663177,0.6162926,0.999857,0.599101,0.376251,0.583051,0.33635,0.104397
9,0.617795,0.499037,0.205373,0.773152,0.00113,0.000708,0.118027,0.38854,0.777014,0.561976,...,0.148524,0.881979,0.878573,0.2936247,0.996479,0.021801,0.001267,0.026144,0.001705,0.417478


In [40]:
# Work on the first 100 columns only. `.copy()` makes this an independent
# frame, so adding the `id` column later does not write into a slice of
# `real_data_df` (which would raise pandas' SettingWithCopyWarning).
real_data_df_short = real_data_df.iloc[:,:100].copy()

## Load the data generated by the diffusion model

In [12]:
# Read the 10 saved sample files (suffix 0..9) onto the CPU, then join them
# along axis 0 into a single array.
arrays = [
    torch.load(
        "data/sampled/0xx/1_sample/sampled_03_128_" + str(file_idx),
        map_location=torch.device('cpu'),
    )
    for file_idx in range(10)
]
generated_data = np.concatenate(arrays, axis=0)
generated_data.shape

(1280, 1, 40, 104)

In [13]:
# Start from an all-NaN buffer so that any sample that is not copied over
# can be detected afterwards with np.isnan.
X_argument2 = np.full((1280,40,99), np.nan)

# For each generated sample take index 0 on its leading axis and keep only
# the first 99 entries of the last axis.
for sample_idx, sample in enumerate(generated_data):
    X_argument2[sample_idx] = sample[0][:, :99]

In [14]:
# Sanity check: shape of the cropped sample buffer.
X_argument2.shape

(1280, 40, 99)

In [15]:
# Any NaN left here would mean a slot of the pre-filled buffer was never overwritten.
np.isnan(X_argument2).any()

False

In [16]:
# Split the samples into 32 consecutive chunks of 40 and flatten each chunk
# into one vector of 40*40*99 values, the same row length as the real data.
# NOTE(review): chunks are formed purely by sample order.
list_of_generated = [
    np.reshape(X_argument2[40*chunk:40*(chunk+1),:,:], (40*40*99))
    for chunk in range(32)
]

In [17]:
# One row per chunk of generated samples, one column per flattened value.
fake_data_df = pd.DataFrame(list_of_generated)

In [18]:
# Use the same column labels as the real-data frame so the two can be compared column by column.
fake_data_df.columns = new_header

In [19]:
# Preview the generated-data frame (the saved output is truncated to the first and last columns).
fake_data_df

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,...,v: 39 c: 39 f: 89,v: 39 c: 39 f: 90,v: 39 c: 39 f: 91,v: 39 c: 39 f: 92,v: 39 c: 39 f: 93,v: 39 c: 39 f: 94,v: 39 c: 39 f: 95,v: 39 c: 39 f: 96,v: 39 c: 39 f: 97,v: 39 c: 39 f: 98
0,0.733852,0.343186,0.088459,0.753658,0.362293,0.30486,0.174437,0.262279,0.111394,0.592281,...,0.037418,0.643337,0.59477,0.029524,0.890385,0.110495,0.018553,0.103027,0.388782,0.212364
1,0.024799,0.01125,0.004615,0.00978,0.013897,0.413496,0.420211,0.674088,0.34719,0.483569,...,0.007046,0.074937,0.667999,0.46308,0.560561,0.541321,0.13789,0.555195,0.116401,0.016055
2,0.424602,0.253171,0.36883,0.544676,0.240357,0.139282,0.445463,0.165211,0.495048,0.411101,...,0.027368,0.100802,0.762613,0.670729,0.698838,0.724405,0.338863,0.414681,0.213862,0.053068
3,0.477172,0.590274,0.367661,0.510473,0.375712,0.35618,0.518964,0.530685,0.232827,0.53045,...,0.101439,0.800492,0.840869,0.435627,0.916294,0.211123,0.063157,0.237072,0.434425,0.283218
4,0.575826,0.529352,0.327633,0.795009,0.221314,0.142876,0.214495,0.343678,0.367757,0.551462,...,0.282768,0.726583,0.783767,0.238968,0.876613,0.221222,0.061891,0.309779,0.219576,0.192812
5,0.162003,0.642895,0.389946,0.550602,0.494039,0.379164,0.473417,0.784579,0.195569,0.164895,...,0.067274,0.258463,0.557187,0.320441,0.457299,0.443282,0.58406,0.668902,0.291084,0.081102
6,0.311722,0.150597,0.139811,0.755625,0.168347,0.110324,0.216445,0.314248,0.425659,0.060236,...,0.094414,0.269831,0.599096,0.390099,0.499528,0.492199,0.648004,0.711658,0.172476,0.026784
7,0.5864,0.367159,0.128309,0.915739,0.327844,0.3217,0.102924,0.323957,0.094707,0.544067,...,0.138339,0.457057,0.562508,0.080119,0.519214,0.290063,0.135363,0.360029,0.378953,0.168473
8,0.694918,0.705028,0.703607,0.446976,0.521182,0.372679,0.637911,0.702408,0.51523,0.561622,...,0.148788,0.279209,0.836245,0.717514,0.765234,0.756011,0.647154,0.701792,0.193318,0.036311
9,0.00572,0.0,0.00237,0.004701,0.004462,0.051563,0.564516,0.205992,0.400711,0.463676,...,0.006146,0.07173,0.68341,0.497962,0.703197,0.686806,0.360726,0.446419,0.09747,0.020065


In [21]:
# Guard against accidentally comparing the real data with itself: the two frames must differ.
real_data_df.equals(fake_data_df)

False

In [41]:
# Work on the first 100 columns only. `.copy()` makes this an independent
# frame, so adding the `id` column later does not write into a slice of
# `fake_data_df` (which would raise pandas' SettingWithCopyWarning).
fake_data_df_short = fake_data_df.iloc[:,:100].copy()
fake_data_df_short

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,...,v: 0 c: 0 f: 90,v: 0 c: 0 f: 91,v: 0 c: 0 f: 92,v: 0 c: 0 f: 93,v: 0 c: 0 f: 94,v: 0 c: 0 f: 95,v: 0 c: 0 f: 96,v: 0 c: 0 f: 97,v: 0 c: 0 f: 98,v: 0 c: 1 f: 0
0,0.733852,0.343186,0.088459,0.753658,0.362293,0.30486,0.174437,0.262279,0.111394,0.592281,...,0.595691,0.263992,0.363264,0.79562,0.621172,0.60869,0.296779,0.486603,0.130183,0.557489
1,0.024799,0.01125,0.004615,0.00978,0.013897,0.413496,0.420211,0.674088,0.34719,0.483569,...,0.342058,0.511486,0.516415,0.382934,0.571781,0.396968,0.655751,0.422823,0.162858,0.0
2,0.424602,0.253171,0.36883,0.544676,0.240357,0.139282,0.445463,0.165211,0.495048,0.411101,...,0.511482,0.45175,0.182059,0.622425,0.273469,0.244304,0.293578,0.099673,0.218134,0.447487
3,0.477172,0.590274,0.367661,0.510473,0.375712,0.35618,0.518964,0.530685,0.232827,0.53045,...,0.649599,0.465366,0.568765,0.47367,0.86174,0.784621,0.586531,0.576573,0.108337,0.530914
4,0.575826,0.529352,0.327633,0.795009,0.221314,0.142876,0.214495,0.343678,0.367757,0.551462,...,0.617135,0.549236,0.077022,0.954478,0.059268,0.045746,0.043821,0.6357,0.145318,0.542271
5,0.162003,0.642895,0.389946,0.550602,0.494039,0.379164,0.473417,0.784579,0.195569,0.164895,...,0.32343,0.248542,0.485836,0.515591,0.578164,0.527362,0.739379,0.751813,0.297468,0.167679
6,0.311722,0.150597,0.139811,0.755625,0.168347,0.110324,0.216445,0.314248,0.425659,0.060236,...,0.323463,0.180612,0.034355,0.79649,0.267478,0.192136,0.10938,0.452775,0.105138,0.527951
7,0.5864,0.367159,0.128309,0.915739,0.327844,0.3217,0.102924,0.323957,0.094707,0.544067,...,0.745731,0.843156,0.051924,0.971905,0.147568,0.059633,0.042217,0.552886,0.020041,0.688868
8,0.694918,0.705028,0.703607,0.446976,0.521182,0.372679,0.637911,0.702408,0.51523,0.561622,...,0.539498,0.455147,0.452065,0.689433,0.167065,0.115494,0.369498,0.529481,0.622066,0.344883
9,0.00572,0.0,0.00237,0.004701,0.004462,0.051563,0.564516,0.205992,0.400711,0.463676,...,0.241913,0.755533,0.312341,0.470498,0.501496,0.222526,0.498134,0.305021,0.17134,0.0


-----------

### Quality Report

In [42]:
# Add a 1-based integer `id` column to both frames (presumably the primary
# key expected by the report metadata; confirm against the JSON file).
# The ids are derived from each frame's length instead of a hand-typed 1..32
# list, and `assign` returns a new frame rather than writing into the
# `.iloc` slice, so re-running this cell is safe.
real_data_df_short = real_data_df_short.assign(
    id=list(range(1, len(real_data_df_short) + 1))
)
fake_data_df_short = fake_data_df_short.assign(
    id=list(range(1, len(fake_data_df_short) + 1))
)

In [60]:
# Build one metadata entry per column label, in the same order as the column
# header ("v: <i> c: <j> f: <k>", with k varying fastest). Each entry marks
# the column as a numerical float.
#
# Every entry ends with a trailing comma, and no enclosing
# `{ "primary_key": "id", "fields": { ... } }` wrapper is emitted, so `text`
# is a fragment rather than a complete JSON document.
#
# The pieces are joined once: rebuilding the string with `text = text + ...`
# re-copies the ever-growing result on each of the 40*40*99 iterations.
entry_suffix = (
    '": {\n'
    '            "type": "numerical",\n'
    '            "subtype": "float"\n'
    '            },'
)
text = "".join(
    '"v: ' + str(i) + ' c: ' + str(j) + ' f: ' + str(k) + entry_suffix
    for i in range(40)
    for j in range(40)
    for k in range(99)
)

In [61]:
# Write the generated metadata fragment to disk (overwrites any existing file).
with open("my_document.txt", "w") as f:
    f.write(text)

In [43]:
import json

# Load the metadata dict that is passed to the quality report below.
# NOTE(review): no visible cell writes 'my_document_short.json' (the notebook
# only writes "my_document.txt"); presumably this file was prepared by hand
# from that fragment. Confirm where it comes from.
with open('my_document_short.json', 'r') as f:
    my_dict = json.load(f)

In [32]:
# Inspect the generated-data frame that goes into the report.
# NOTE(review): the saved output of this cell shows 10 feature columns plus `id`,
# while the slice keeps 100 columns. It appears to come from an earlier run; re-execute to refresh.
fake_data_df_short

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,id
0,0.733852,0.343186,0.088459,0.753658,0.362293,0.30486,0.174437,0.262279,0.111394,0.592281,1
1,0.024799,0.01125,0.004615,0.00978,0.013897,0.413496,0.420211,0.674088,0.34719,0.483569,2
2,0.424602,0.253171,0.36883,0.544676,0.240357,0.139282,0.445463,0.165211,0.495048,0.411101,3
3,0.477172,0.590274,0.367661,0.510473,0.375712,0.35618,0.518964,0.530685,0.232827,0.53045,4
4,0.575826,0.529352,0.327633,0.795009,0.221314,0.142876,0.214495,0.343678,0.367757,0.551462,5
5,0.162003,0.642895,0.389946,0.550602,0.494039,0.379164,0.473417,0.784579,0.195569,0.164895,6
6,0.311722,0.150597,0.139811,0.755625,0.168347,0.110324,0.216445,0.314248,0.425659,0.060236,7
7,0.5864,0.367159,0.128309,0.915739,0.327844,0.3217,0.102924,0.323957,0.094707,0.544067,8
8,0.694918,0.705028,0.703607,0.446976,0.521182,0.372679,0.637911,0.702408,0.51523,0.561622,9
9,0.00572,0.0,0.00237,0.004701,0.004462,0.051563,0.564516,0.205992,0.400711,0.463676,10


In [30]:
# Inspect the real-data frame that goes into the report.
# NOTE(review): the saved output of this cell shows 10 feature columns plus `id`,
# while the slice keeps 100 columns. It appears to come from an earlier run; re-execute to refresh.
real_data_df_short

Unnamed: 0,v: 0 c: 0 f: 0,v: 0 c: 0 f: 1,v: 0 c: 0 f: 2,v: 0 c: 0 f: 3,v: 0 c: 0 f: 4,v: 0 c: 0 f: 5,v: 0 c: 0 f: 6,v: 0 c: 0 f: 7,v: 0 c: 0 f: 8,v: 0 c: 0 f: 9,id
0,0.296331,0.515293,0.050096,0.621925,0.006347,0.004862,0.190134,0.243162,0.321699,0.366093,1
1,0.404434,0.18999,0.419752,0.59884,0.398338,0.211852,0.412063,0.105966,0.078021,0.030269,2
2,0.828072,0.819458,0.54279,0.401994,0.314889,0.231296,0.590298,0.255228,0.659133,0.675434,3
3,0.214855,0.385924,0.178468,0.717478,0.095788,0.022844,0.230425,0.658926,0.44702,0.640799,4
4,0.266291,0.221836,0.674663,0.198803,0.87441,0.828868,0.869217,0.049139,0.537011,0.378504,5
5,0.359303,0.477812,0.402078,0.87534,0.258409,0.170151,0.248626,0.6525,0.200815,0.54007,6
6,0.376767,0.259169,0.170159,0.890803,0.211211,0.159174,0.129994,0.637034,0.078304,0.326829,7
7,0.644092,0.331068,0.24193,0.747807,0.388606,0.230884,0.247166,0.655118,0.226427,0.94394,8
8,0.374363,0.633659,0.123018,0.319657,0.293157,0.240697,0.406652,0.0,0.51711,0.503185,9
9,0.617795,0.499037,0.205373,0.773152,0.00113,0.000708,0.118027,0.38854,0.777014,0.561976,10


In [44]:
# Build the SDMetrics quality report: real data first, generated data second,
# then the metadata dict describing the columns.
report = QualityReport()
report.generate(real_data_df_short, fake_data_df_short, my_dict)

Creating report: 100%|████████████████████████████| 4/4 [00:24<00:00,  6.10s/it]



Overall Quality Score: 77.08%

Properties:
Column Shapes: 68.25%
Column Pair Trends: 85.91%


In [45]:
# Per-column metric and score behind the "Column Shapes" property.
report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,v: 0 c: 0 f: 0,KSComplement,0.40625
1,v: 0 c: 0 f: 1,KSComplement,0.46875
2,v: 0 c: 0 f: 2,KSComplement,0.50000
3,v: 0 c: 0 f: 3,KSComplement,0.43750
4,v: 0 c: 0 f: 4,KSComplement,0.53125
...,...,...,...
95,v: 0 c: 0 f: 95,KSComplement,0.75000
96,v: 0 c: 0 f: 96,KSComplement,0.62500
97,v: 0 c: 0 f: 97,KSComplement,0.68750
98,v: 0 c: 0 f: 98,KSComplement,0.75000


In [46]:
# Visualize the "Column Shapes" property of the report.
report.get_visualization(property_name='Column Shapes')

In [48]:
from sdmetrics.reports.utils import get_column_plot

# Plot the real and the generated values of a single column against each
# other; the chosen column lies within the first 100 columns kept above.
fig = get_column_plot(
    real_data=real_data_df_short,
    synthetic_data=fake_data_df_short,
    metadata=my_dict,
    column_name='v: 0 c: 0 f: 85',
)

fig.show()

In [None]:
# Save the report object built by `QualityReport()` above. That object is
# named `report`; `my_report` only exists after the load on the last line,
# so it cannot be the receiver of `save` on a fresh kernel.
report.save(filepath='demo_data_quality_report.pkl')

# load it at any point in the future
# NOTE(review): the .pkl extension suggests a pickle file; only load reports
# that come from a trusted source.
my_report = QualityReport.load(filepath='demo_data_quality_report.pkl')