# Read simulations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

DATA_DIRECTORY = Path('../data/')
RESULT_DIRECTORY = Path('../data_prepared/')
assert DATA_DIRECTORY.exists()
assert RESULT_DIRECTORY.exists()

simulations = sorted([d for d in DATA_DIRECTORY.iterdir() if d.name != "raw"])
for s in simulations:
    print(f'Found Simulation: {s.name}')

Found Simulation: HIST_SG1_2_SG5_7
Found Simulation: model
Found Simulation: Вынгаяхинское
Found Simulation: Д3
Found Simulation: Царичанское13Р
Found Simulation: Царичанское7Р
Found Simulation: Ягодное


# Read dat files

In [3]:
import os

format = ".dat"

def iterate_files_recursively(directory):
    for root, dirs, files in os.walk(directory):
        for file_name in files:
            yield Path(os.path.join(root, file_name))

def find_common_prefix(filenames):
    if not filenames:
        return ""

    # Extract base filenames
    base_filenames = [Path(filename).name for filename in filenames]

    # Find the common prefix among the base filenames
    prefix = os.path.commonprefix(base_filenames)

    return prefix

def sort_filenames(filenames):
    if not filenames:
        return []

    # Convert Path objects to strings
    filenames = [str(filename) for filename in filenames]

    prefix = find_common_prefix(filenames)
    base_file = prefix + format  # Construct the base filename
    sorted_files = []

    # Check if any filename ends with base_file
    for filename in filenames:
        if filename.endswith(base_file):
            sorted_files.append(filename)
            filenames.remove(filename)
            break

    # Process numerical filenames
    num_files = []
    for filename in filenames:
        if filename.endswith(format) and filename != base_file:
            stem = Path(filename).stem
            try:
                num = int(stem.split('_')[-1])
                num_files.append((num, filename))
            except ValueError:
                pass  # Skip filenames with non-integer suffixes
        else:
            sorted_files.append(filename)  # Add non-numerical filenames directly

    num_files.sort()
    sorted_files.extend([filename for _, filename in num_files])

    return [Path(f) for f in sorted_files]

def read_dat_filenames(simulation: Path, verbose: bool = True):
    global format
    format = ".dat"

    dat_files = [f for f in iterate_files_recursively(simulation) if f.name.endswith(format)]
    dat_files = sort_filenames(dat_files)

    if len(dat_files) == 0:
        format = ".DATA"
        dat_files = [f for f in iterate_files_recursively(simulation) if f.name.endswith(format)]
        dat_files = sort_filenames(dat_files)

    if verbose:
        main_dat_file = dat_files[0]
        print(f'Main Simulation: {main_dat_file.name}')
        other_dat_files = [int(f.name.removesuffix(format).split('_')[-1]) for f in dat_files[1:]]
        assert other_dat_files == list(range(1, len(other_dat_files) + 1)), ".dat (or .DATA) files do not form a list of consecutive numbers. Probably you are dealing with the Вынгаяхинское folder. If so, delete all .dat files from data/Вынгаяхинское/300_kust_07052015/WELL"
        print(f'Other Simulations: [{other_dat_files[0]}...{other_dat_files[-1]}]')

    return dat_files


In [4]:
from dat_parser import extract_key_values

def read_dat_file(file: Path):
    return extract_key_values(file)


In [5]:
import pandas as pd

df_list = []

for simulation in simulations:
    dicts_list = []

    dat_files = read_dat_filenames(simulation)

    for filename in dat_files:
        dicts_list.append(read_dat_file(filename))

    # Convert list of dictionaries to DataFrame
    df_list.append(pd.DataFrame(dicts_list))


Main Simulation: 681_HIST_SG1_2_SG5_7.dat
Other Simulations: [1...1000]
Main Simulation: J1_HISTORY.DATA
Other Simulations: [1...1000]
Main Simulation: VinBP11_300_V_3.dat
Other Simulations: [1...1000]
Main Simulation: D3_aug2018.dat
Other Simulations: [1...1000]
Main Simulation: carich_DI.dat
Other Simulations: [1...1000]
Main Simulation: carich_DI.dat
Other Simulations: [1...1000]
Main Simulation: C1bb_07_2018.DATA
Other Simulations: [1...1000]


# Read result files

In [30]:
fname = "result.log"

def find_directories_with_result_logs(root_dir):
    result_log_directories = []
    for root, dirs, files in os.walk(root_dir):
        if "result.log" in files:
            result_log_directories.append(root)
    return [Path(f) for f in result_log_directories]

def iterate_files_recursively(directory):
    for root, dirs, files in os.walk(directory):
        for file_name in files:
            yield Path(os.path.join(root, file_name))

def find_common_prefix(filenames):
    if not filenames:
        return ""

    # Extract base filenames
    base_filenames = [Path(filename).name for filename in filenames]

    # Find the common prefix among the base filenames
    prefix = os.path.commonprefix(base_filenames)

    return prefix

def sort_filenames(filenames):
    if not filenames:
        return []

    # Convert Path objects to strings
    filenames = [str(filename) for filename in filenames]

    prefix = find_common_prefix(filenames)
    print(prefix)
    base_file = prefix + format  # Construct the base filename
    sorted_files = []

    # Check if any filename ends with base_file
    for filename in filenames:
        if filename.endswith(base_file):
            sorted_files.append(filename)
            filenames.remove(filename)
            break

    # Process numerical filenames
    num_files = []
    for filename in filenames:
        if filename.endswith(format) and filename != base_file:
            stem = Path(filename).stem
            try:
                num = int(stem.split('_')[-1])
                num_files.append((num, filename))
            except ValueError:
                pass  # Skip filenames with non-integer suffixes
        else:
            sorted_files.append(filename)  # Add non-numerical filenames directly

    num_files.sort()
    sorted_files.extend([filename for _, filename in num_files])

    return [Path(f) for f in sorted_files]

def read_dat_filenames(simulation: Path, verbose: bool = True):
    global format
    format = ""

    log_dirs = [f for f in find_directories_with_result_logs(simulation)]
    log_dirs = sort_filenames(log_dirs)
    # print(log_dirs)

    # if len(dat_files) == 0:
    #     format = ".DATA"
    #     dat_files = [f for f in iterate_files_recursively(simulation) if f.name.endswith(format)]
    #     dat_files = sort_filenames(dat_files)

    if verbose:
        main_dat_file = log_dirs[0]
        print(f'Main Simulation: {main_dat_file.name}')
        other_dat_files = [int(f.name.removesuffix(format).split('_')[-1]) for f in log_dirs[1:]]
        # assert other_dat_files == list(range(1, len(other_dat_files) + 1)), ".dat (or .DATA) files do not form a list of consecutive numbers. Probably you are dealing with the Вынгаяхинское folder. If so, delete all .dat files from data/Вынгаяхинское/300_kust_07052015/WELL"
        print(f'Other Simulations: {other_dat_files}')

    return list(zip([Path(os.path.join(root, "result.log")) for root in log_dirs], [0] + other_dat_files))


In [49]:
from table_parser import parse_table
from block_finder import find_last_block_with_min_lines, count_data_blocks_with_min_lines

# parse result.log file
def read_res_file(file: Path, number = -1):
    d = parse_table(find_last_block_with_min_lines(file))
    d["no."] = number
    d["num_blocks"] = count_data_blocks_with_min_lines(file)
    return d


In [50]:
parsed_logs = [pd.DataFrame([read_res_file(f[0],f[1]) for f in read_dat_filenames(sim)]) for sim in simulations]


681_HIST_SG1_2_SG5_7
Main Simulation: 681_HIST_SG1_2_SG5_7
Other Simulations: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206,

In [51]:
parsed_logs[0].drop_duplicates()

Unnamed: 0,Характеристики,Текущие запасы нефти в ОТЧ.РЕГ. тыс.ст.м3,Начальные запасы нефти в ОТЧ.РЕГ. тыс.ст.м3,Текущие подвижные запасы нефти в ОТЧ.РЕГ. (отн. воды) тыс.ст.м3,Начальные подвижные запасы нефти в ОТЧ.РЕГ. (отн. воды) тыс.ст.м3,Текущие подвижные запасы нефти в ОТЧ.РЕГ. (отн. газа) ст.м3,Начальные подвижные запасы нефти в ОТЧ.РЕГ. (отн. газа) ст.м3,Текущие запасы свободной нефти в ОТЧ.РЕГ. тыс.ст.м3,Начальные запасы свободной нефти в ОТЧ.РЕГ. тыс.ст.м3,Текущие запасы летучей нефти в ОТЧ.РЕГ. ст.м3,...,Ошибка материального баланса для компонента OIL кг,Начальная масса компонента GAS млн.кг,Текущая масса компонента GAS млн.кг,Добыча компонента GAS тыс.кг,Закачка компонента GAS кг,Поток компонента через границу региона GAS кг,Ошибка материального баланса для компонента GAS кг,no.,num_blocks,Ошибка материального баланса для компонента WATER кг
0,Месторождение,19893.31,20028.56,7180.33,7296.33,211787.60,221558.09,19893.31,20028.56,0.00,...,-31558.70,2968.90,2949.64,19268.25,0.00,0.00,-5967.58,0,502,
1,Месторождение,19893.26,20028.56,7180.33,7296.33,211778.26,221558.09,19893.26,20028.56,0.00,...,-22935.31,2968.90,2949.63,19274.10,0.00,0.00,-4096.11,1,502,39083.30
2,Месторождение,19893.26,20028.56,7180.33,7296.33,211778.26,221558.09,19893.26,20028.56,0.00,...,-22935.31,2968.90,2949.63,19274.10,0.00,0.00,-4096.11,2,502,39083.30
3,Месторождение,19893.26,20028.56,7180.33,7296.33,211778.26,221558.09,19893.26,20028.56,0.00,...,-22935.31,2968.90,2949.63,19274.10,0.00,0.00,-4096.11,3,502,39083.30
4,Месторождение,19893.26,20028.56,7180.33,7296.33,211778.26,221558.09,19893.26,20028.56,0.00,...,-22935.31,2968.90,2949.63,19274.10,0.00,0.00,-4096.11,4,502,39083.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,Месторождение,19893.75,20028.56,7180.43,7296.33,211803.59,221558.09,19893.75,20028.56,0.00,...,-228427.67,2968.90,2949.70,19242.20,0.00,0.00,-42828.87,996,502,
997,Месторождение,19893.75,20028.56,7180.43,7296.33,211803.59,221558.09,19893.75,20028.56,0.00,...,-228427.67,2968.90,2949.70,19242.20,0.00,0.00,-42828.87,997,502,
998,Месторождение,19893.75,20028.56,7180.43,7296.33,211803.59,221558.09,19893.75,20028.56,0.00,...,-228427.67,2968.90,2949.70,19242.20,0.00,0.00,-42828.87,998,502,
999,Месторождение,19893.75,20028.56,7180.43,7296.33,211803.59,221558.09,19893.75,20028.56,0.00,...,-228427.67,2968.90,2949.70,19242.20,0.00,0.00,-42828.87,999,502,


# Merge parsed tables

In [52]:
merged_tables = [pd.merge(df_list[i], parsed_logs[i], left_index=True, right_on='no.') for i in range(len(df_list))]


In [53]:
# Discard non-ubiquitous columns

res = set(merged_tables[0].columns.tolist())
for i in range(1,len(merged_tables)):
    res = res.intersection(merged_tables[i].columns.tolist())
res = list(res)
merged_tables = [merged_tables[i][res] for i in range(len(merged_tables))]


In [54]:
for i in range(len(merged_tables)):
    merged_tables[i].to_csv(os.path.join(RESULT_DIRECTORY, f"table_{i}.csv"))

In [55]:
merged_tables[i]

Unnamed: 0,MBERRCTRL,TOLVARNEWT,Ошибка материального баланса для компонента WATER кг,MBERRLIM,Ср. давление в резервуаре [wpv] бар абс.,Текущие запасы воды в ОТЧ.РЕГ. тыс.ст.м3,Начальная масса компонента WATER млн.кг,Текущие запасы летучей нефти в ОТЧ.РЕГ. ст.м3,Текущая масса компонента WATER млн.кг,CHECKP,...,TOLLIN,no.,Поровый объём (пл.усл.) тыс.пласт.м3,Начальные подвижные запасы воды в ОТЧ.РЕГ. тыс.ст.м3,Ошибка материального баланса для компонента OIL кг,num_blocks,Поток компонента через границу региона OIL кг,Текущие подвижные запасы воды в ОТЧ.РЕГ. тыс.ст.м3,Ср. насыщенность нефти,Ср. насыщенность воды
0,,,-28405.49,,138.98,4269.22,4953.67,0.00,4926.68,,...,,0,4998.86,3709.57,-1163.11,414,0.00,3693.04,0.129891,0.870109
1,1.000000e-07,0.00001,-46.91,0.0,138.93,4269.18,4953.67,0.00,4926.63,1.0,...,0.0001,1,4998.85,3709.57,-35.04,414,0.00,3693.00,0.129896,0.870104
2,1.000000e-07,0.00001,-198.42,0.0,138.93,4269.18,4953.67,0.00,4926.63,1.0,...,0.0001,2,4998.85,3709.57,-65.78,414,0.00,3693.01,0.129896,0.870104
3,1.000000e-06,0.00001,-46.91,0.0,138.93,4269.18,4953.67,0.00,4926.63,1.0,...,0.0001,3,4998.85,3709.57,-35.04,414,0.00,3693.00,0.129896,0.870104
4,1.000000e-06,0.00001,-198.42,0.0,138.93,4269.18,4953.67,0.00,4926.63,1.0,...,0.0001,4,4998.85,3709.57,-65.78,414,0.00,3693.01,0.129896,0.870104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,1.000000e-05,0.10000,-643428.68,2.0,139.98,4269.69,4953.67,0.00,4927.22,2.0,...,0.0050,996,4999.07,3709.57,33052.89,414,0.00,3693.46,0.129872,0.870128
997,1.000000e-04,0.10000,,2.0,144.03,4271.08,4953.67,0.00,4928.82,2.0,...,0.0050,997,4999.90,3709.57,-418082.78,414,0.00,3694.66,0.129899,0.870101
998,1.000000e-04,0.10000,,2.0,145.51,4271.55,4953.67,0.00,4929.37,2.0,...,0.0050,998,5000.20,3709.57,-358334.84,414,0.00,3695.06,0.129916,0.870084
999,1.000000e-03,0.10000,,2.0,151.22,4273.71,4953.67,0.00,4931.86,2.0,...,0.0050,999,5001.37,3709.57,,414,0.00,3696.98,0.129913,0.870087
