In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import os, sys, glob, inspect

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
import epri_mc_lib as mc
from importlib import reload
reload(mc)


<module 'epri_mc_lib' from '/home/marie-anne/code/Oct20_EPRI/Task1/NB/epri_mc_lib.py'>

## Generate synthetic data

This notebook generates synthetic data from sample measurements to give a measure of the uncertainty of the data. When drawing the new samples the features are assumed to be independent. A new sample for a given condition is created by drawing a value for each feature from a normal distribution with the mean and standard deviation of that feature for that condition. 1000 synthetic samples are created for each condition.

Output file is saved in Data/Merged_data/ALL_TUBE_PIPE_simulated.csv

In [2]:
data_path = "../../Data/Merged_data"
merged_data = pd.read_csv(os.path.join(data_path, 'ALL_TUBE_PIPE_merge_1.csv'), 
                 index_col=0)


### Normality test :  D'Agostino & Pearson test

In [14]:
test_data = merged_data[mc.data_generation_values]

In [35]:
test_data.describe()

Unnamed: 0,TEP_mean_uV_C,Absorption_avg_500,backscatter_avg,Absorption_avg_50,A,B,p,Absorption_avg_100,Absorption_avg_200,mean_CF,mean_perm,mean_MBN
count,24.0,16.0,24.0,24.0,24.0,24.0,24.0,24.0,16.0,24.0,24.0,24.0
mean,12.444946,0.000356,0.022125,0.001851,2.139557e-07,0.000878,0.205742,0.000847,0.000528,1.757909,77.130824,0.511191
std,0.882849,0.000108,0.005491,0.000614,5.191123e-07,0.000598,0.111202,0.000182,0.000166,1.35994,29.73329,0.166885
min,10.763517,0.00023,0.0134,0.000933,2.01e-13,0.000249,0.0595,0.000571,0.000349,0.63275,32.909091,0.299
25%,12.114038,0.000273,0.018125,0.001373,4.26e-13,0.000322,0.0852,0.000709,0.0004,0.974969,61.265909,0.403406
50%,12.926379,0.00034,0.0226,0.001747,1.1521e-10,0.000785,0.2135,0.00081,0.000482,1.110227,68.345455,0.504216
75%,12.985953,0.000415,0.0257,0.002148,9.2825e-08,0.001032,0.26475,0.000983,0.000657,1.946477,103.323864,0.596591
max,13.169977,0.000571,0.0359,0.003377,2.07e-06,0.00244,0.495,0.001287,0.000874,4.741727,133.4,0.904727


In [37]:
norm = pd.DataFrame(stats.normaltest(test_data)).transpose()
norm.set_index(test_data.columns.values, inplace=True)
norm.columns = ['stats', 'p-value']
norm['passed'] = np.where(norm.)
norm.drop(['Absorption_avg_500', 'Absorption_avg_200'])

Unnamed: 0,stats,p-value
TEP_mean_uV_C,5.478747,0.0646108
backscatter_avg,1.262753,0.5318593
Absorption_avg_50,4.140409,0.12616
A,33.293372,5.894353e-08
B,6.720351,0.03472916
p,2.150105,0.3412799
Absorption_avg_100,1.219037,0.5436124
mean_CF,10.824257,0.004462132
mean_perm,1.701602,0.4270728
mean_MBN,4.213113,0.1216562


Every pvalue under 0.05 is not normally distributed, this might be due to our small number of sample. So we can consider that most of our data are normally distributed

In [36]:
num_samples_generated = 1000

For now we're throwing out the pipe data and only working with tubes.

In [37]:
tube, pipe, tube_wo_blind, tube_blind = mc.get_subsample_df(merged_data)

In [39]:
final_generated_data = pd.DataFrame(columns = ["Condition"] + mc.data_generation_values)

for index, row in tube.iterrows():
    generated_samples = pd.DataFrame(num_samples_generated * [index])
    generated_samples.columns = ["Condition"]
    for value_col, std_col in zip(mc.data_generation_values, mc.data_generation_stds):
        generated_samples[value_col] = pd.DataFrame(np.random.normal(tube.loc[index,value_col], 
              tube.loc[index,std_col], num_samples_generated))
    final_generated_data = final_generated_data.append(generated_samples, ignore_index = True)

print(final_generated_data)

      Condition  TEP_mean_uV_C  Absorption_avg_500  backscatter_avg  \
0          T_B1      12.887224            0.000252         0.020317   
1          T_B1      12.904167            0.000194         0.011580   
2          T_B1      12.848012            0.000247         0.027738   
3          T_B1      12.968064            0.000210         0.019859   
4          T_B1      12.927068            0.000233         0.013641   
...         ...            ...                 ...              ...   
15995   T_HAZ_T      12.948355            0.000329         0.020244   
15996   T_HAZ_T      12.940969            0.000244         0.022487   
15997   T_HAZ_T      13.038192            0.000599         0.029502   
15998   T_HAZ_T      13.044431            0.000271         0.020341   
15999   T_HAZ_T      12.895002            0.000371         0.020657   

       Absorption_avg_50             A         B         p  \
0               0.001931  1.202073e-07  0.001092  0.221759   
1               0.00112

In [40]:
final_generated_data.to_csv(os.path.join(data_path, 'ALL_TUBE_PIPE_simulated.csv'), index=False)