# LHS is not available on TS machines and will be performed elsewhere

## PORO/PERMX pairs may be sampled more than once

In [8]:
import numpy as np
import pandas as pd
from scipy.stats import qmc
from pathlib import Path
import sys

# --------------------------
# Setup for sampling
# --------------------------
params = ['E_GPa', 'PR', 'SH_MPa/km', 'Sh_MPa/km', 'Sv_MPa/km', 'SH_azi_deg']
l_bounds = [15, 0.2, 25 * 0.9, 14.6 * 0.9, 22.7 * 0.9, 290]
u_bounds = [25, 0.4, 25 * 1.1, 14.6 * 1.1, 22.7 * 1.1, 310]
num_samples = 5  # Change as needed

# --------------------------
# Load PORO and PERMX files
# --------------------------
data_dir = Path('..') / 'data' / 'properties'
# Or use Path('data/properties') if running from repo root
if not data_dir.exists():
    print(f"Error: Directory '{data_dir}' does not exist.")
    sys.exit(1)

poro_files = sorted([f for f in data_dir.iterdir() if "PORO" in f.name])
permx_files = sorted([f for f in data_dir.iterdir() if "PERMX" in f.name])

if not poro_files or not permx_files:
    print("Error: PORO or PERMX files not found.")
    sys.exit(1)

if len(poro_files) != len(permx_files):
    raise ValueError(f"Number of PORO files ({len(poro_files)}) does not match number of PERMX files ({len(permx_files)})")

# --------------------------
# Latin Hypercube Sampling
# --------------------------
# Sample 6 parameters + 1 index for matched poro/permx
sampler = qmc.LatinHypercube(d=len(params) + 1)
sample = sampler.random(n=num_samples)

# Scale physical parameters
sample_params = qmc.scale(sample[:, :len(params)], l_bounds, u_bounds)
df_params = pd.DataFrame(np.round(sample_params, 2), columns=params)

# Scale indices
file_indices = (sample[:, -1] * len(poro_files)).astype(int)
file_indices = np.clip(file_indices, 0, len(poro_files) - 1)

# Get matched file paths
df_params["poro_file"] = [str(poro_files[i]) for i in file_indices]
df_params["permx_file"] = [str(permx_files[i]) for i in file_indices]

# --------------------------
# Calculate stress state parameters
# --------------------------
beta = 300 - 90  # Rotate from SH to x-axis
cos_2beta = np.cos(np.radians(2 * beta))
sin_2beta = np.sin(np.radians(2 * beta))

df_params['sigma_x'] = (df_params['SH_MPa/km'] + df_params['Sh_MPa/km']) / 2 + \
                       (df_params['SH_MPa/km'] - df_params['Sh_MPa/km']) / 2 * cos_2beta
df_params['sigma_y'] = (df_params['SH_MPa/km'] + df_params['Sh_MPa/km']) / 2 - \
                       (df_params['SH_MPa/km'] - df_params['Sh_MPa/km']) / 2 * cos_2beta
df_params['tau_xy'] = (df_params['SH_MPa/km'] - df_params['Sh_MPa/km']) / 2 * sin_2beta

# --------------------------
# Output
# --------------------------
# df_params.to_csv("sampled_parameters_and_files.csv", index=False)
df_params


Unnamed: 0,E_GPa,PR,SH_MPa/km,Sh_MPa/km,Sv_MPa/km,SH_azi_deg,poro_file,permx_file,sigma_x,sigma_y,tau_xy
0,19.05,0.35,26.28,15.94,21.95,294.93,../data/properties/JD_BASECASE_5_PORO.dat,../data/properties/JD_BASECASE_5_PERMX.dat,23.695,18.525,4.477351
1,21.99,0.38,22.74,14.61,23.74,307.95,../data/properties/JD_BASECASE_6_PORO.dat,../data/properties/JD_BASECASE_6_PERMX.dat,20.7075,16.6425,3.520393
2,18.45,0.23,23.65,13.48,22.6,300.88,../data/properties/JD_BASECASE_3_PORO.dat,../data/properties/JD_BASECASE_3_PERMX.dat,21.1075,16.0225,4.403739
3,24.3,0.25,25.49,15.38,24.82,302.42,../data/properties/JD_BASECASE_2_PORO.dat,../data/properties/JD_BASECASE_2_PERMX.dat,22.9625,17.9075,4.377758
4,16.21,0.31,26.97,13.89,21.14,291.41,../data/properties/JD_BASECASE_2_PORO.dat,../data/properties/JD_BASECASE_2_PERMX.dat,23.7,17.16,5.663806


## PORO/PERMX pairs are NOT sampled more than once

Make sure the number of samples does not exceed the number of available pairs.

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import qmc
from pathlib import Path
import sys

# --------------------------
# Setup for sampling
# --------------------------
params = ['E_GPa', 'PR', 'SH_MPa/km', 'Sh_MPa/km', 'Sv_MPa/km', 'SH_azi_deg']
l_bounds = [15, 0.2, 25 * 0.9, 14.6 * 0.9, 22.7 * 0.9, 290]
u_bounds = [25, 0.4, 25 * 1.1, 14.6 * 1.1, 22.7 * 1.1, 310]
num_samples = 5  # Change as needed

# --------------------------
# Load PORO and PERMX files
# --------------------------
data_dir = Path('..') / 'data' / 'properties'
# Or use Path('data/properties') if running from repo root
if not data_dir.exists():
    print(f"Error: Directory '{data_dir}' does not exist.")
    sys.exit(1)

poro_files = sorted([f for f in data_dir.iterdir() if "PORO" in f.name])
permx_files = sorted([f for f in data_dir.iterdir() if "PERMX" in f.name])

if not poro_files or not permx_files:
    print("Error: PORO or PERMX files not found.")
    sys.exit(1)

if len(poro_files) != len(permx_files):
    raise ValueError(f"Number of PORO files ({len(poro_files)}) does not match number of PERMX files ({len(permx_files)})")

num_pairs = len(poro_files)

if num_samples > num_pairs:
    raise ValueError(f"Cannot sample {num_samples} unique poro/permx pairs: only {num_pairs} available.")

# --------------------------
# Latin Hypercube Sampling for parameters
# --------------------------
sampler = qmc.LatinHypercube(d=len(params))
sample = sampler.random(n=num_samples)
sample_params = qmc.scale(sample, l_bounds, u_bounds)
df_params = pd.DataFrame(np.round(sample_params, 2), columns=params)

# --------------------------
# Random unique selection of poro/permx pairs
# --------------------------
unique_indices = np.random.choice(num_pairs, size=num_samples, replace=False)
df_params["poro_file"] = [str(poro_files[i]) for i in unique_indices]
df_params["permx_file"] = [str(permx_files[i]) for i in unique_indices]

# --------------------------
# Calculate stress state parameters
# --------------------------
beta = 300 - 90  # Rotate from SH to x-axis
cos_2beta = np.cos(np.radians(2 * beta))
sin_2beta = np.sin(np.radians(2 * beta))

df_params['sigma_x'] = (df_params['SH_MPa/km'] + df_params['Sh_MPa/km']) / 2 + \
                       (df_params['SH_MPa/km'] - df_params['Sh_MPa/km']) / 2 * cos_2beta
df_params['sigma_y'] = (df_params['SH_MPa/km'] + df_params['Sh_MPa/km']) / 2 - \
                       (df_params['SH_MPa/km'] - df_params['Sh_MPa/km']) / 2 * cos_2beta
df_params['tau_xy'] = (df_params['SH_MPa/km'] - df_params['Sh_MPa/km']) / 2 * sin_2beta

# --------------------------
# Output
# --------------------------
df_params
df_params.to_csv("sampled_parameters_and_files.csv", index=False)


Unnamed: 0,E_GPa,PR,SH_MPa/km,Sh_MPa/km,Sv_MPa/km,SH_azi_deg,poro_file,permx_file,sigma_x,sigma_y,tau_xy
0,18.13,0.38,24.41,14.09,20.72,292.58,../data/properties/JD_BASECASE_1_PORO.dat,../data/properties/JD_BASECASE_1_PERMX.dat,21.83,16.67,4.468691
1,21.18,0.28,24.74,15.06,22.06,294.08,../data/properties/JD_BASECASE_2_PORO.dat,../data/properties/JD_BASECASE_2_PERMX.dat,22.32,17.48,4.191563
2,20.02,0.22,22.74,13.46,22.44,298.77,../data/properties/JD_BASECASE_6_PORO.dat,../data/properties/JD_BASECASE_6_PERMX.dat,20.42,15.78,4.018358
3,15.61,0.27,26.54,15.97,24.91,308.28,../data/properties/JD_BASECASE_4_PORO.dat,../data/properties/JD_BASECASE_4_PERMX.dat,23.8975,18.6125,4.576944
4,24.22,0.36,26.34,14.66,23.23,304.5,../data/properties/JD_BASECASE_5_PORO.dat,../data/properties/JD_BASECASE_5_PERMX.dat,23.42,17.58,5.057588
