# Read simulations

In [89]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [90]:
from pathlib import Path

# Raw simulation folders are read from DATA_DIRECTORY; prepared tables are
# written to RESULT_DIRECTORY. Both must already exist.
DATA_DIRECTORY = Path('../data/')
RESULT_DIRECTORY = Path('../data_prepared/')
assert DATA_DIRECTORY.exists(), f"Data directory not found: {DATA_DIRECTORY}"
assert RESULT_DIRECTORY.exists(), f"Result directory not found: {RESULT_DIRECTORY}"

# Every sub-directory except "raw" is treated as one simulation. Plain files
# that happen to live in DATA_DIRECTORY are skipped: they contain no simulation
# files and would only make the later cells fail.
simulations = sorted(
    d for d in DATA_DIRECTORY.iterdir() if d.is_dir() and d.name != "raw"
)
for s in simulations:
    print(f'Found Simulation: {s.name}')

Found Simulation: HIST_SG1_2_SG5_7
Found Simulation: model
Found Simulation: Вынгаяхинское
Found Simulation: Д3
Found Simulation: Царичанское13Р
Found Simulation: Царичанское7Р
Found Simulation: Ягодное


# Read dat files

In [91]:
import os

# File extension currently being searched for.
# NOTE: this name shadows the built-in ``format()``. read_dat_filenames()
# rebinds it through ``global`` and sort_filenames() reads it, so it acts as
# shared mutable state between those functions.
format = ".dat"

def iterate_files_recursively(directory):
    """Lazily yield a ``Path`` for every file located anywhere below ``directory``.

    The tree is traversed with ``os.walk``; only files are yielded, never the
    directories themselves.
    """
    for current_dir, _subdirs, file_names in os.walk(directory):
        yield from (Path(current_dir, name) for name in file_names)

def find_common_prefix(filenames):
    """Return the longest leading string shared by the base names of ``filenames``.

    Only the final path component of each entry is compared (directories are
    ignored). An empty or missing input yields an empty string.
    """
    if filenames:
        # Compare file names only, so differing parent folders do not matter.
        return os.path.commonprefix([Path(entry).name for entry in filenames])
    return ""

def sort_filenames(filenames, suffix=None):
    """Order simulation files: the base file first, then numbered variants.

    The base file is the one whose name equals ``<common prefix><suffix>``.
    Numbered variants are entries whose name, with ``suffix`` removed, ends in
    ``_<integer>``; they are sorted by that integer.

    Args:
        filenames: Paths or strings to order. The input is not modified.
        suffix: Extension shared by the files (for example ``".dat"``). When
            omitted, the notebook-level global ``format`` is used, which keeps
            existing calls working.

    Returns:
        A list of ``Path`` objects: the base file (if any), then entries that
        do not end with ``suffix`` in their input order, then numbered variants
        in ascending order. Entries that end with ``suffix`` but carry no
        integer index are left out.
    """
    if not filenames:
        return []

    if suffix is None:
        # Backward-compatible default: callers set the module-level ``format``.
        suffix = format

    # Keep each full path together with its base name.
    entries = [(str(item), Path(str(item)).name) for item in filenames]
    prefix = os.path.commonprefix([name for _, name in entries])

    # Prefer the exact base name. When every name is identical (for example a
    # single file) the common prefix is the whole name, so accept that too;
    # otherwise a lone file would never be recognised and could be dropped.
    main_index = None
    for wanted in (prefix + suffix, prefix):
        main_index = next(
            (pos for pos, (_, name) in enumerate(entries) if name == wanted),
            None,
        )
        if main_index is not None:
            break

    ordered = []
    if main_index is not None:
        ordered.append(entries.pop(main_index)[0])

    numbered = []
    for full_path, name in entries:
        if not name.endswith(suffix):
            ordered.append(full_path)  # Not a simulation file: keep as-is
            continue
        index_token = name.removesuffix(suffix).split('_')[-1]
        try:
            numbered.append((int(index_token), full_path))
        except ValueError:
            continue  # Deliberately skip names without an integer index

    numbered.sort()
    ordered.extend(full_path for _, full_path in numbered)

    return [Path(p) for p in ordered]

def read_dat_filenames(simulation: Path, verbose: bool = True):
    """Collect the ``.dat`` (or, failing that, ``.DATA``) files of a simulation.

    The folder is searched recursively and the matches are ordered by
    ``sort_filenames``. ``.DATA`` is only tried when no ``.dat`` file remains
    after sorting.

    Args:
        simulation: Root folder of the simulation.
        verbose: When true, print a short summary and check that the numbered
            files form the consecutive sequence ``1..N``.

    Returns:
        The ordered list of ``Path`` objects; an empty list when nothing was
        found.

    Raises:
        AssertionError: In verbose mode, when the numbered files are not
            consecutive.
    """
    global format

    dat_files = []
    for extension in (".dat", ".DATA"):
        # sort_filenames() reads the module-level ``format``, so it has to be
        # rebound before each attempt.
        format = extension
        candidates = [f for f in iterate_files_recursively(simulation) if f.name.endswith(extension)]
        dat_files = sort_filenames(candidates)
        if dat_files:
            break

    if not dat_files:
        # Previously this crashed with an IndexError in verbose mode only.
        if verbose:
            print(f'No .dat or .DATA files found in {simulation}')
        return dat_files

    if verbose:
        main_dat_file = dat_files[0]
        print(f'Main Simulation: {main_dat_file.name}')
        other_dat_files = [int(f.name.removesuffix(format).split('_')[-1]) for f in dat_files[1:]]
        assert other_dat_files == list(range(1, len(other_dat_files) + 1)), ".dat (or .DATA) files do not form a list of consecutive numbers. Probably you are dealing with the Вынгаяхинское folder. If so, delete all .dat files from data/Вынгаяхинское/300_kust_07052015/WELL"
        if other_dat_files:
            print(f'Other Simulations: [{other_dat_files[0]}...{other_dat_files[-1]}]')
        else:
            # Only the main file exists; there is no range to report.
            print('Other Simulations: []')

    return dat_files


In [92]:
from dat_parser import extract_key_values

def read_dat_file(file: Path):
    """Parse one simulation input file into its key/value representation.

    Thin wrapper that delegates all work to ``dat_parser.extract_key_values``.
    The result is presumably a dict (the rows are later handed to
    ``pd.DataFrame``) - TODO confirm against ``dat_parser``.
    """
    key_values = extract_key_values(file)
    return key_values


In [93]:
import pandas as pd

# One DataFrame per simulation; each row holds the values parsed from one file.
df_list = []

for simulation in simulations:
    dat_files = read_dat_filenames(simulation)
    dicts_list = [read_dat_file(filename) for filename in dat_files]

    # Build the frame in one go from the collected records
    df_list.append(pd.DataFrame(dicts_list))


Main Simulation: 681_HIST_SG1_2_SG5_7.dat
Other Simulations: [1...1000]
Main Simulation: J1_HISTORY.DATA
Other Simulations: [1...1000]
Main Simulation: VinBP11_300_V_3.dat
Other Simulations: [1...1000]
Main Simulation: D3_aug2018.dat
Other Simulations: [1...1000]
Main Simulation: carich_DI.dat
Other Simulations: [1...1000]
Main Simulation: carich_DI.dat
Other Simulations: [1...1000]
Main Simulation: C1bb_07_2018.DATA
Other Simulations: [1...1000]


# Read result files

In [94]:
# Name of the log file that marks a directory as holding simulation results.
fname = "result.log"

def find_directories_with_result_logs(root_dir, log_name=None):
    """Return every directory at or below ``root_dir`` that contains a log file.

    Args:
        root_dir: Folder to search recursively (via ``os.walk``).
        log_name: File name to look for. Defaults to the module-level
            ``fname`` (``"result.log"``).

    Returns:
        A list of ``Path`` objects, one per matching directory, in ``os.walk``
        order.
    """
    if log_name is None:
        log_name = fname
    return [
        Path(root)
        for root, _dirs, files in os.walk(root_dir)
        if log_name in files
    ]

def iterate_files_recursively(directory):
    """Yield a ``Path`` for each file found in ``directory`` or any sub-folder.

    NOTE: this re-defines the function of the same name from the
    "Read dat files" cell with the same behaviour.
    """
    for folder, _, names in os.walk(directory):
        folder_path = Path(folder)
        for name in names:
            yield folder_path / name

def find_common_prefix(filenames):
    """Longest common leading string of the entries' base names.

    Returns ``""`` for an empty or missing input. Parent directories are not
    part of the comparison.

    NOTE: this re-defines the function of the same name from the
    "Read dat files" cell with the same behaviour.
    """
    if not filenames:
        return ""

    names_only = [Path(item).name for item in filenames]
    return os.path.commonprefix(names_only)

def sort_filenames(filenames, suffix=None):
    """Order entries: the base entry first, then numbered variants ascending.

    NOTE: this re-defines the function of the same name from the
    "Read dat files" cell. In this cell it is applied to directory paths, with
    the global ``format`` set to ``""``.

    The base entry is the one whose name equals ``<common prefix><suffix>``.
    Numbered variants are entries whose name, with ``suffix`` removed, ends in
    ``_<integer>``; they are sorted by that integer.

    Args:
        filenames: Paths or strings to order. The input is not modified.
        suffix: Trailing text shared by the names (may be empty). When omitted,
            the notebook-level global ``format`` is used, which keeps existing
            calls working.

    Returns:
        A list of ``Path`` objects: the base entry (if any), then entries not
        ending with ``suffix`` in their input order, then numbered variants in
        ascending order. Entries without an integer index are left out.
    """
    if not filenames:
        return []

    if suffix is None:
        # Backward-compatible default: callers set the module-level ``format``.
        suffix = format

    # Keep each full path together with its base name.
    entries = [(str(item), Path(str(item)).name) for item in filenames]
    prefix = os.path.commonprefix([name for _, name in entries])

    # Prefer the exact base name. When every name is identical (for example a
    # single entry) the common prefix is the whole name, so accept that too.
    main_index = None
    for wanted in (prefix + suffix, prefix):
        main_index = next(
            (pos for pos, (_, name) in enumerate(entries) if name == wanted),
            None,
        )
        if main_index is not None:
            break

    ordered = []
    if main_index is not None:
        ordered.append(entries.pop(main_index)[0])

    numbered = []
    for full_path, name in entries:
        if not name.endswith(suffix):
            ordered.append(full_path)  # Unrelated entry: keep as-is
            continue
        index_token = name.removesuffix(suffix).split('_')[-1]
        try:
            numbered.append((int(index_token), full_path))
        except ValueError:
            continue  # Deliberately skip names without an integer index

    numbered.sort()
    ordered.extend(full_path for _, full_path in numbered)

    return [Path(p) for p in ordered]

def read_dat_filenames(simulation: Path, verbose: bool = True):
    """Locate the result logs of a simulation and pair each with its run number.

    NOTE: this re-defines ``read_dat_filenames`` from the "Read dat files"
    cell with a different return type. After this cell has run, the earlier
    version is no longer reachable under this name.

    Args:
        simulation: Root folder of the simulation.
        verbose: When true, print the main run directory and the run numbers.

    Returns:
        A list of ``(log_path, run_number)`` tuples ordered as produced by
        ``sort_filenames``. The first directory gets run number 0; the others
        get the integer after the last ``_`` in their directory name. An empty
        list is returned when no log file is found.
    """
    global format
    # Directories have no extension; sort_filenames() reads this global.
    format = ""

    log_dirs = sort_filenames(find_directories_with_result_logs(simulation))
    if not log_dirs:
        if verbose:
            print(f'No {fname} files found in {simulation}')
        return []

    # Computed outside the verbose branch: the return value needs it either
    # way (it used to be undefined when verbose was False).
    other_dat_files = [int(d.name.split('_')[-1]) for d in log_dirs[1:]]

    if verbose:
        main_dat_file = log_dirs[0]
        print(f'Main Simulation: {main_dat_file.name}')
        print(f'Other Simulations: {other_dat_files}')

    log_paths = [Path(os.path.join(root, fname)) for root in log_dirs]
    return list(zip(log_paths, [0] + other_dat_files))


In [95]:
from big_vector_processor import parse_big_file
import numpy as np

# Per-simulation property files; position i belongs to simulations[i].
ntgs = []
poros = []
perms = []

# Candidate locations relative to a simulation folder. Order matters: the
# first existing candidate wins (e.g. "grid/NTG_7.GRDECL" is tried before
# "grid/NTG.GRDECL").
paths_ntg = ["J1_HISTORY_PROP_NTG.GRDECL",
             "INPUT/NTG.GRDECL",
             "300_kust_07052015/GEO/NTG.INC",
             "grid_aug2018/ntg.GRDECL",
             "grid/NTG_7.GRDECL",
             "grid/NTG.GRDECL",
             "Geol/NTG.GRDECL"]
paths_poro = ["J1_HISTORY_PROP_PORO.GRDECL",
              "INPUT/PORO.GRDECL",
              "300_kust_07052015/GEO/PORO.INC",
              "grid_aug2018/poro.GRDECL",
              "grid/Poro_7.GRDECL",
              "grid/Poro.GRDECL",
              "Geol/Poro.GRDECL"]
# NOTE(review): "zatychka.txt" is presumably a hand-made placeholder for
# simulations without a permeability file - confirm.
paths_perm = ["INPUT/PERMX.GRDECL",
              "J1_HISTORY_PROP_PERMX.GRDECL",
              "C1bb_07_2018_PERMX.grdecl",
              "grid_aug2018/permx.GRDECL",
              "grid/Perm_7.GRDECL",
              "grid/Perm.GRDECL",
              "zatychka.txt"]


def _find_property_file(simulation_dir, candidates, label):
    """Return the first candidate that exists as a file below ``simulation_dir``.

    Exactly one file per simulation has to be selected so that the result
    lists stay aligned with ``simulations``. A missing file therefore raises
    ``FileNotFoundError`` instead of silently shifting later entries.
    """
    for relative_path in candidates:
        file_path = Path(os.path.join(simulation_dir, relative_path))
        if file_path.is_file():
            print(f"Foung {label} file {file_path}")
            return file_path
    raise FileNotFoundError(
        f"No {label} file found for simulation {simulation_dir}; tried {candidates}"
    )


def _load_sorted_nonzero(path):
    """Parse ``path`` with ``parse_big_file`` and return its non-zero values, sorted ascending."""
    print(path)
    values = np.array(parse_big_file(path))
    return np.sort(values[values != 0])


for simulation_dir in simulations:
    ntgs.append(_find_property_file(simulation_dir, paths_ntg, "NTG"))
    poros.append(_find_property_file(simulation_dir, paths_poro, "PORO"))
    perms.append(_find_property_file(simulation_dir, paths_perm, "PERM"))

# Sorted non-zero values per simulation; read_res_file() indexes these by
# simulation position.
lists_ntg = [_load_sorted_nonzero(path) for path in ntgs]
lists_poro = [_load_sorted_nonzero(path) for path in poros]
lists_perm = [_load_sorted_nonzero(path) for path in perms]


Foung NTG file ../data/HIST_SG1_2_SG5_7/INPUT/NTG.GRDECL
Foung PORO file ../data/HIST_SG1_2_SG5_7/INPUT/PORO.GRDECL
Foung PORO file ../data/HIST_SG1_2_SG5_7/INPUT/PERMX.GRDECL
Foung NTG file ../data/model/J1_HISTORY_PROP_NTG.GRDECL
Foung PORO file ../data/model/J1_HISTORY_PROP_PORO.GRDECL
Foung PORO file ../data/model/J1_HISTORY_PROP_PERMX.GRDECL
Foung NTG file ../data/Вынгаяхинское/300_kust_07052015/GEO/NTG.INC
Foung PORO file ../data/Вынгаяхинское/300_kust_07052015/GEO/PORO.INC
Foung PORO file ../data/Вынгаяхинское/zatychka.txt
Foung NTG file ../data/Д3/grid_aug2018/ntg.GRDECL
Foung PORO file ../data/Д3/grid_aug2018/poro.GRDECL
Foung PORO file ../data/Д3/grid_aug2018/permx.GRDECL
Foung NTG file ../data/Царичанское13Р/grid/NTG.GRDECL
Foung PORO file ../data/Царичанское13Р/grid/Poro.GRDECL
Foung PORO file ../data/Царичанское13Р/grid/Perm.GRDECL
Foung NTG file ../data/Царичанское7Р/grid/NTG_7.GRDECL
Foung PORO file ../data/Царичанское7Р/grid/Poro_7.GRDECL
Foung PORO file ../data/Царичан

In [96]:
from log_parser import parse_model_statistics, calculate_date_difference, get_time
from table_parser import parse_table
from block_finder import find_last_block_with_min_lines

def _property_stats(prefix: str, values) -> dict:
    """Summary statistics of one sorted property array, keyed ``<prefix>_<stat>``.

    ``values`` must be sorted ascending and non-empty. ``fraction`` is the
    value at the 80% position divided by the value at the 20% position.

    NOTE(review): the ``*_variance`` key holds the standard deviation
    (``np.std``). The key name is kept because it becomes a column name in the
    exported tables - confirm which of the two is intended.
    """
    count = len(values)
    # round(count * 0.8) equals count for arrays of length 1 or 2; clamp it to
    # the last element so tiny arrays do not raise IndexError.
    upper = min(round(count * 0.8), count - 1)
    lower = round(count * 0.2)
    return {
        f"{prefix}_mean": np.mean(values),
        f"{prefix}_median": np.median(values),
        f"{prefix}_variance": np.std(values),
        f"{prefix}_fraction": values[upper] / values[lower],
    }


def read_res_file(file: Path, num: int, number = -1) -> dict:
    """Parse one ``result.log`` file into a flat record.

    Args:
        file: Path of the log file.
        num: Position of the simulation in ``simulations``; selects the entry
            of ``lists_ntg`` / ``lists_poro`` / ``lists_perm`` to summarise.
        number: Run number stored under the ``"no."`` key.

    Returns:
        The dict produced by ``parse_table`` for the last matching block of
        the log, extended with the run number, ``longitude``, the property
        statistics, ``total_time`` and the entries of
        ``parse_model_statistics``.
    """
    d = parse_table(find_last_block_with_min_lines(file))
    d["no."] = number
    d["longitude"] = calculate_date_difference(file)
    # The property arrays are module-level and only read here, so no
    # ``global`` declaration is needed.
    d.update(_property_stats("ntg", lists_ntg[num]))
    d.update(_property_stats("poro", lists_poro[num]))
    d.update(_property_stats("perm", lists_perm[num]))
    d["total_time"] = get_time(file)
    d.update(parse_model_statistics(file))
    return d


In [97]:
# One DataFrame per simulation, one row per result log. The simulation's
# position is passed on so read_res_file() can pick the matching property arrays.
parsed_logs = [
    pd.DataFrame([
        read_res_file(log_path, sim_index, run_number)
        for log_path, run_number in read_dat_filenames(sim_dir)
    ])
    for sim_index, sim_dir in enumerate(simulations)
]


681_HIST_SG1_2_SG5_7
Main Simulation: 681_HIST_SG1_2_SG5_7
Other Simulations: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206,

In [98]:
# Display the first simulation's parsed log table with duplicate rows dropped.
# The result is not assigned, so parsed_logs itself is left as-is.
parsed_logs[0].drop_duplicates()

Unnamed: 0,Характеристики,Текущие запасы нефти в ОТЧ.РЕГ. тыс.ст.м3,Начальные запасы нефти в ОТЧ.РЕГ. тыс.ст.м3,Текущие подвижные запасы нефти в ОТЧ.РЕГ. (отн. воды) тыс.ст.м3,Начальные подвижные запасы нефти в ОТЧ.РЕГ. (отн. воды) тыс.ст.м3,Текущие подвижные запасы нефти в ОТЧ.РЕГ. (отн. газа) ст.м3,Начальные подвижные запасы нефти в ОТЧ.РЕГ. (отн. газа) ст.м3,Текущие запасы свободной нефти в ОТЧ.РЕГ. тыс.ст.м3,Начальные запасы свободной нефти в ОТЧ.РЕГ. тыс.ст.м3,Текущие запасы летучей нефти в ОТЧ.РЕГ. ст.м3,...,Начальные запасы свободного газа в ОТЧ.РЕГ.,Начальные запасы растворённого газа в ОТЧ.РЕГ.,Начальные подвижные запасы газа в ОТЧ.РЕГ.,Начальные запасы углеводородов в ОТЧ.РЕГ.,Начальные извлекаемые запасы углеводородов в ОТЧ.РЕГ.,Поровый объём при P(ref),Поровый объём,Summary at time=0.0000000000000000e+00,Метод расчёта,Ошибка материального баланса для компонента WATER кг
0,Месторождение,19893.31,20028.56,7180.33,7296.33,211787.60,221558.09,19893.31,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.49, CPU=2.39, CPUUSE=20.8%, MEM=146...",полностью неявный.,
1,Месторождение,19893.26,20028.56,7180.33,7296.33,211778.26,221558.09,19893.26,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.26, CPU=2.31, CPUUSE=20.6%, MEM=146...",полностью неявный.,39083.30
2,Месторождение,19893.26,20028.56,7180.33,7296.33,211778.26,221558.09,19893.26,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.55, CPU=2.36, CPUUSE=20.4%, MEM=145...",полностью неявный.,39083.30
3,Месторождение,19893.26,20028.56,7180.33,7296.33,211778.26,221558.09,19893.26,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.12, CPU=2.22, CPUUSE=19.9%, MEM=146...",полностью неявный.,39083.30
4,Месторождение,19893.26,20028.56,7180.33,7296.33,211778.26,221558.09,19893.26,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.12, CPU=2.36, CPUUSE=21.2%, MEM=146...",полностью неявный.,39083.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,Месторождение,19893.75,20028.56,7180.43,7296.33,211803.59,221558.09,19893.75,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.39, CPU=2.24, CPUUSE=19.6%, MEM=148...",полностью неявный.,
997,Месторождение,19893.75,20028.56,7180.43,7296.33,211803.59,221558.09,19893.75,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.25, CPU=2.26, CPUUSE=20.1%, MEM=147...",полностью неявный.,
998,Месторождение,19893.75,20028.56,7180.43,7296.33,211803.59,221558.09,19893.75,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.03, CPU=2.27, CPUUSE=20.5%, MEM=147...",полностью неявный.,
999,Месторождение,19893.75,20028.56,7180.43,7296.33,211803.59,221558.09,19893.75,20028.56,0.00,...,-0.0000 ст.м3.,2701.0510 млн. ст.м3.,985.1469 млн. ст.м3.,28.0270 млн. пласт.м3.,10.2160 млн. пласт.м3.,379.3240 млн. пласт.м3.,379.4445 млн. пласт.м3.,"elapsed=11.22, CPU=2.28, CPUUSE=20.3%, MEM=148...",полностью неявный.,


# Merge parsed tables

In [99]:
# Join each simulation's input-file table with its log table: the row index of
# the input table is matched against the log table's 'no.' column.
merged_tables = [
    dat_table.merge(parsed_logs[position], left_index=True, right_on='no.')
    for position, dat_table in enumerate(df_list)
]


In [100]:
# Discard non-ubiquitous columns

# Keep only the columns present in every merged table. The first table's
# column order is preserved: going through a set would make the column order
# (and therefore the layout of the exported CSV files) vary between runs.
res = [
    column
    for column in dict.fromkeys(merged_tables[0].columns)
    if all(column in table.columns for table in merged_tables[1:])
]
# .copy() gives independent frames, so assigning to their columns in the
# clean-up cell below does not operate on a slice of the original tables.
merged_tables = [table[res].copy() for table in merged_tables]


In [132]:
# Removing some of the hanging dots

# Columns whose text values are cut at the first '.'.
# NOTE(review): everything after the first dot is discarded, not only a
# trailing one - confirm this is intended for "Начальная дата" and
# "Размер образа процесса".
dotted_columns = [
    "Число скважин",
    "Количество активных блоков",
    "Число инт. перфораций",
    "Число компонентов (включая воду)",
    "Количество шагов",
    "Макс. число участков перфорации на скважину",
    "Начальная дата",
    "Размер образа процесса",
]

for table in merged_tables:
    for column in dotted_columns:
        table[column] = table[column].str.split('.').str.get(0)

In [133]:
# Write every merged table to RESULT_DIRECTORY as table_<position>.csv, without the row index.
for table_index, table in enumerate(merged_tables):
    table.to_csv(RESULT_DIRECTORY / f"table_{table_index}.csv", index=False)