Generate data from random Gaussian sampling

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import os, sys, glob, inspect

# Work out the directory this code is running from, then put its parent at the
# front of sys.path so that modules located one level up can be imported.
_running_file = inspect.getfile(inspect.currentframe())
currentdir = os.path.dirname(os.path.abspath(_running_file))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
# Project-local helper module (presumably found through the parentdir entry
# added to sys.path just above — confirm where epri_mc_lib lives).
import epri_mc_lib as mc
from importlib import reload
# Re-execute the already-imported module so that edits made to epri_mc_lib on
# disk are picked up without restarting the kernel.
reload(mc)


## Generate synthetic data

This notebook generates synthetic data from sample measurements to give a measure of the uncertainty of the data. When drawing the new samples the features are assumed to be independent. A new sample for a given condition is created by drawing a value for each feature from a normal distribution with the mean and standard deviation of that feature for that condition. 1000 synthetic samples are created for each condition.

Output file is saved in Data/Merged_data/ALL_TUBE_PIPE_simulated.csv

In [None]:
# Folder that holds the merged measurement tables, relative to this notebook.
data_path = "../../Data/Merged_data"

# Load the merged table; the first CSV column becomes the row index.
merged_csv_path = os.path.join(data_path, 'ALL_TUBE_PIPE_merge_1.csv')
merged_data = pd.read_csv(merged_csv_path, index_col=0)


### Normality test: D'Agostino & Pearson test

In [None]:
# Keep only the columns listed in mc.data_generation_values; these are the
# columns the normality test below is run on.
test_data = merged_data[mc.data_generation_values]

In [None]:
# Display pandas' summary statistics for the selected columns as a sanity
# check before running the normality test.
test_data.describe()

In [None]:
# D'Agostino & Pearson normality test, evaluated separately for every column
# of test_data (scipy tests along axis 0 by default).
test_statistic, p_value = stats.normaltest(test_data)

# One row per feature, holding the test statistic and its p-value.
norm = pd.DataFrame(
    {'stats': test_statistic, 'p-value': p_value},
    index=test_data.columns.values,
)

# A feature "passes" when normality is not rejected at the 5% level.
norm['passed'] = np.where(norm['p-value'] >= 0.05, 'Yes', 'No')

# Show the table without the two absorption features; drop() returns a copy,
# so `norm` itself still contains those rows.
norm.drop(['Absorption_avg_500', 'Absorption_avg_200'])

Every feature with a p-value under 0.05 fails the test, i.e. it is not considered normally distributed. This might be due to our small number of samples, so we can consider that most of our data are normally distributed.

In [None]:
# Number of synthetic samples drawn for each condition (each row of the tube
# table) by the generation loop further down.
num_samples_generated = 1000

For now we're throwing out the pipe data and only working with tubes.

In [None]:
# Split the merged table into four sub-tables with the project helper; only
# `tube` is used by the generation cell below.
# NOTE(review): the meaning of each subset (tube / pipe / with or without
# blind samples) is inferred from the variable names — confirm against
# epri_mc_lib.get_subsample_df.
tube, pipe, tube_wo_blind, tube_blind = mc.get_subsample_df(merged_data)

In [None]:
# Column layout of the output table: the condition label followed by one
# column per generated feature.
output_columns = ["Condition"] + mc.data_generation_values

# Build one frame of synthetic samples per condition (row of `tube`) and
# concatenate them once at the end. DataFrame.append was removed in pandas 2.0
# and also re-copied the whole accumulated table on every iteration.
# NOTE: draws come from NumPy's global random state and no seed is set in the
# visible cells, so every run produces different samples.
per_condition_frames = []
for index in tube.index:
    sample_columns = {"Condition": num_samples_generated * [index]}
    for value_col, std_col in zip(mc.data_generation_values, mc.data_generation_stds):
        # Features are sampled independently from a normal distribution whose
        # centre is the value column and whose spread is the matching std
        # column of this condition.
        sample_columns[value_col] = np.random.normal(tube.loc[index, value_col],
                                                     tube.loc[index, std_col],
                                                     num_samples_generated)
    generated_samples = pd.DataFrame(sample_columns)
    per_condition_frames.append(generated_samples)

if per_condition_frames:
    final_generated_data = pd.concat(per_condition_frames, ignore_index=True)
    # Enforce the declared column set and order (a feature without a matching
    # std column stays in the table as an all-NaN column, as before).
    final_generated_data = final_generated_data.reindex(columns=output_columns)
else:
    # No conditions to sample from: keep an empty table with the right header.
    final_generated_data = pd.DataFrame(columns=output_columns)

print(final_generated_data)

In [None]:
# Write the synthetic samples next to the merged input data; the row index is
# not written to the file.
simulated_csv_path = os.path.join(data_path, 'ALL_TUBE_PIPE_simulated.csv')
final_generated_data.to_csv(simulated_csv_path, index=False)