In [7]:
import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import h5py
from power_spectrum_utils import power_spectrum_np
from wgan_utils import define_test, check_coords


from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [8]:
# Reference statistics used by the feature extraction below.
# NOTE(review): presumably the global mean / std / max of the z=5.0 field -
# confirm where these numbers were computed.
mean_5 = 14280.155   # passed to power_spectrum_np and used as the 'below_mean' threshold
std_5 = 89914.586
max_5 = 47_676_240

In [25]:
def get_samples(file, s_sample, nsamples, test_coords, cube_size=2048):
    """Draw random cubic sub-volumes whose coordinates ``check_coords`` accepts.

    Parameters
    ----------
    file : 3-D array-like, or any placeholder
        Dataset to slice: anything exposing a 3-element ``shape`` and
        supporting ``[x0:x1, y0:y1, z0:z1]`` indexing. Any other value (for
        example the dummy ``0`` used by older calls) falls back to the
        module-level dataset ``f``, which must then already be open.
    s_sample : int
        Edge length of each cubic sample.
    nsamples : int
        Number of samples to draw.
    test_coords
        Forwarded unchanged as the first argument of ``check_coords``; per the
        original note, accepted samples do not intersect these coordinates.
    cube_size : int, optional
        Edge length of the full cube that start positions are drawn from.
        Defaults to 2048, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        The samples stacked along a new leading axis, i.e.
        ``(nsamples, s_sample, s_sample, s_sample)`` when every slice lies
        inside the dataset.
    """
    # Prefer the dataset supplied by the caller; legacy callers pass a dummy
    # value and rely on the global `f` opened elsewhere in the notebook.
    shape = getattr(file, 'shape', None)
    cube = file if shape is not None and len(shape) == 3 else f

    # Largest admissible start index along each axis (randint is inclusive).
    max_start = cube_size - s_sample

    # Phase 1: rejection-sample coordinate boxes until `nsamples` are accepted.
    sample_list = []
    for _ in range(nsamples):
        while True:
            x = random.randint(0, max_start)
            y = random.randint(0, max_start)
            z = random.randint(0, max_start)
            sample_coords = {'x': [x, x + s_sample],
                             'y': [y, y + s_sample],
                             'z': [z, z + s_sample]}
            # Equality with False (not truthiness) is kept deliberately: the
            # return type of check_coords is not visible from this file.
            rejected = check_coords(test_coords, sample_coords) == False  # noqa: E712
            if not rejected:
                break
        sample_list.append(sample_coords)

    # Phase 2: read every accepted box from the cube as a NumPy array.
    # Modify here for 2D or 3D
    sample_array = [
        np.array(cube[c['x'][0]:c['x'][1],
                      c['y'][0]:c['y'][1],
                      c['z'][0]:c['z'][1]])
        for c in sample_list
    ]

    return np.array(sample_array)

def build_df(n_samples):
    """Build a standardized table of summary statistics for random sub-cubes.

    For each of ``n_samples`` iterations one sub-cube is fetched with
    ``get_samples`` and reduced to eight features: the cube's mean, standard
    deviation, median and maximum; the mean, standard deviation and
    first-minus-last value of the ``Pk`` array returned by
    ``power_spectrum_np``; and the fraction of voxels below ``mean_5``.

    Relies on notebook globals that must be defined before the call:
    ``s_sample`` (sample edge length), ``tc`` (test coordinates), ``mean_5``
    and the open dataset ``f``.

    Parameters
    ----------
    n_samples : int
        Number of sub-cubes to draw, i.e. number of rows in the result.

    Returns
    -------
    pandas.DataFrame
        One row per sample, with every column standardized by
        ``StandardScaler`` (fitted on this same table).
    """
    col_list = ['mean', 'std', 'median', 'max', 'pk_mean', 'pk_std', 'pk_range', 'below_mean']
    no_pixels = s_sample**3

    # Collect plain records and build the frame once, instead of assigning
    # row by row into a pre-allocated object-dtype frame.
    records = []
    for _ in range(n_samples):
        # Fetch a single sub-cube per iteration.
        arr = get_samples(f, s_sample, 1, tc)[0]

        # Only the spectrum values are used; the first return value is ignored.
        _, Pk = power_spectrum_np(arr, mean_5, s_sample)

        records.append({
            'mean': arr.mean(),
            'std': arr.std(),
            'median': np.median(arr),
            'max': np.amax(arr),
            'pk_mean': Pk.mean(),
            'pk_std': Pk.std(),
            'pk_range': Pk[0] - Pk[-1],
            'below_mean': (arr < mean_5).sum() / no_pixels,
        })

    features = pd.DataFrame(records, columns=col_list, dtype=float)

    # Standardize each column (zero mean, unit variance).
    scaled = StandardScaler().fit_transform(features)  # MinMaxScaler() is the alternative

    return pd.DataFrame(scaled, columns=col_list)

In [10]:
# Run configuration; these globals are read by build_df / get_samples.
redshift = '5.0'    # inserted into the input file name below
s_sample = 64       # edge length of each sampled sub-cube
random.seed(1)      # seeds Python's `random`, which get_samples draws from
tc = define_test(s_test=5, s_train=64)
datapath = '../../../../../'

# Keep the file handle under its own name so it can be closed explicitly with
# h5_file.close() once sampling is finished. `f` is the dataset get_samples
# reads from on demand, so the file has to stay open until then.
h5_file = h5py.File(datapath + 'fields_z=' + redshift + '.hdf5', 'r')
f = h5_file['delta_HI']

In [28]:
# Build the standardized feature table for 20000 random sub-cubes,
# write it to disk, then preview the first rows.
data = build_df(n_samples=20000)
data.to_csv(path_or_buf='GMM_df.csv')
data.head()

Unnamed: 0,mean,std,median,max,pk_mean,pk_std,pk_range,below_mean
0,-0.190491,-0.20253,0.118919,-0.318092,-0.155263,-0.156562,-0.152106,-0.028214
1,-0.448558,-0.465971,-1.510861,-0.400546,-0.167124,-0.165004,-0.160199,0.681612
2,-0.201181,-0.180256,-0.264498,-0.256943,-0.146284,-0.142732,-0.137076,0.122095
3,-0.356033,-0.340689,-0.482486,-0.353158,-0.163552,-0.163041,-0.158336,0.461006
4,-0.505492,-0.613394,-1.607464,-0.519299,-0.169537,-0.166862,-0.161786,0.847923


In [29]:
# Check the table's dimensions: (number of rows, number of columns).
data.shape

(20000, 8)