# Read simulations

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

# Simulations are read from DATA_DIRECTORY; the prepared tables are written to RESULT_DIRECTORY.
DATA_DIRECTORY = Path('../data/')
RESULT_DIRECTORY = Path('../data_prepared/')
assert DATA_DIRECTORY.exists(), f'Data directory not found: {DATA_DIRECTORY.resolve()}'
assert RESULT_DIRECTORY.exists(), f'Result directory not found: {RESULT_DIRECTORY.resolve()}'

# Every sub-directory except "raw" is treated as one simulation. Plain files
# sitting next to them are ignored: they cannot contain any simulation files
# and would otherwise make the later steps fail on an empty file list.
simulations = sorted(d for d in DATA_DIRECTORY.iterdir() if d.is_dir() and d.name != "raw")
for s in simulations:
    print(f'Found Simulation: {s.name}')

Found Simulation: HIST_SG1_2_SG5_7
Found Simulation: model
Found Simulation: Вынгаяхинское
Found Simulation: Д3
Found Simulation: Царичанское13Р
Found Simulation: Царичанское7Р
Found Simulation: Ягодное


# Read .dat / .DATA files

In [3]:
import os

# File extension currently being processed. read_dat_filenames() reassigns this
# module-level name (via `global format`) and sort_filenames() reads it.
# NOTE(review): the name shadows Python's built-in format().
format = ".dat"

def iterate_files_recursively(directory):
    """Lazily yield a ``Path`` for every file found at any depth below ``directory``.

    Only files are yielded, never the directories themselves. The order is
    whatever ``os.walk`` produces.
    """
    for folder, _subfolders, names in os.walk(directory):
        yield from (Path(os.path.join(folder, name)) for name in names)

def find_common_prefix(filenames):
    """Return the longest string that every base name in ``filenames`` starts with.

    Only the final path component of each entry is compared; parent
    directories are ignored. An empty or missing input gives ``""``.
    """
    if filenames:
        # Character-wise common prefix of the base names only
        return os.path.commonprefix([Path(item).name for item in filenames])
    return ""

def sort_filenames(filenames, suffix=None):
    """Order simulation files: main file first, then numbered variants ascending.

    The main file is the one whose base name equals ``<common prefix><suffix>``.
    Numbered variants are entries whose base name ends in ``_<integer><suffix>``.

    Args:
        filenames: paths (``str`` or ``Path``) to order.
        suffix: file extension to match, e.g. ``".dat"``. When omitted, the
            module-level ``format`` value is used.

    Returns:
        A list of ``Path`` objects: the main file (if present), then entries
        that do not end with ``suffix`` in their input order, then the numbered
        entries sorted by number. Entries that end with ``suffix`` but have no
        integer after the last ``_`` are left out.
    """
    if not filenames:
        return []

    if suffix is None:
        suffix = format  # module-level extension maintained by read_dat_filenames

    paths = [str(filename) for filename in filenames]

    prefix = find_common_prefix(paths)
    # With a single file the common prefix is the complete name, extension
    # included; appending the suffix again would never match anything.
    if suffix and prefix.endswith(suffix):
        base_file = prefix
    else:
        base_file = prefix + suffix

    # The main file is the first entry whose base name is exactly base_file
    main_index = next(
        (index for index, path in enumerate(paths) if Path(path).name == base_file),
        None,
    )
    sorted_files = [] if main_index is None else [paths[main_index]]

    numbered = []
    for index, path in enumerate(paths):
        if index == main_index:
            continue
        name = Path(path).name
        if not name.endswith(suffix):
            sorted_files.append(path)  # Keep entries with a different extension as they are
            continue
        try:
            number = int(name.removesuffix(suffix).split('_')[-1])
        except ValueError:
            continue  # Not a numbered variant: leave it out
        numbered.append((number, path))

    numbered.sort()
    sorted_files.extend(path for _, path in numbered)

    return [Path(path) for path in sorted_files]

def read_dat_filenames(simulation: Path, verbose: bool = True):
    """Collect the simulation files below ``simulation``, main file first.

    Files ending in ``.dat`` are looked for first. If that yields nothing, the
    search is repeated for ``.DATA``. The extension being tried is stored in
    the module-level ``format`` because ``sort_filenames`` reads it from there.

    Args:
        simulation: directory of one simulation.
        verbose: when true, print a summary and check that the numbered files
            form the sequence 1..N.

    Returns:
        The sorted list of ``Path`` objects as returned by ``sort_filenames``,
        or an empty list when no matching file exists.

    Raises:
        AssertionError: in verbose mode, when the numbered files are not
            consecutive starting from 1.
    """
    global format

    dat_files = []
    for candidate in (".dat", ".DATA"):
        format = candidate
        matching = [f for f in iterate_files_recursively(simulation) if f.name.endswith(format)]
        dat_files = sort_filenames(matching)
        if dat_files:
            break

    if not dat_files:
        # Nothing to report or validate; avoid indexing into an empty list
        if verbose:
            print(f'No .dat or .DATA files found in {simulation}')
        return dat_files

    if verbose:
        main_dat_file = dat_files[0]
        print(f'Main Simulation: {main_dat_file.name}')
        other_dat_files = [int(f.name.removesuffix(format).split('_')[-1]) for f in dat_files[1:]]
        assert other_dat_files == list(range(1, len(other_dat_files) + 1)), ".dat (or .DATA) files do not form a list of consecutive numbers. Probably you are dealing with the Вынгаяхинское folder. If so, delete all .dat files from data/Вынгаяхинское/300_kust_07052015/WELL"
        if other_dat_files:
            print(f'Other Simulations: [{other_dat_files[0]}...{other_dat_files[-1]}]')
        else:
            # Only the main file exists; there is no first/last number to show
            print('Other Simulations: []')

    return dat_files


In [4]:
from dat_parser import extract_key_values

def read_dat_file(file: Path):
    """Parse one simulation file with ``dat_parser.extract_key_values``.

    The parser's result is returned unchanged. It is presumably a dict of
    parameter name -> value, since the caller builds a DataFrame from a list
    of these results (TODO confirm against ``dat_parser``).
    """
    parsed = extract_key_values(file)
    return parsed


In [5]:
import pandas as pd

# One DataFrame per simulation; row i holds the values parsed from the i-th
# file in the order returned by read_dat_filenames (main file first).
df_list = []

for simulation in simulations:
    records = [read_dat_file(path) for path in read_dat_filenames(simulation)]
    df_list.append(pd.DataFrame(records))


Main Simulation: 681_HIST_SG1_2_SG5_7.dat
Other Simulations: [1...1000]
Main Simulation: J1_HISTORY.DATA
Other Simulations: [1...1000]
Main Simulation: VinBP11_300_V_3.dat
Other Simulations: [1...1000]
Main Simulation: D3_aug2018.dat
Other Simulations: [1...1000]
Main Simulation: carich_DI.dat
Other Simulations: [1...1000]
Main Simulation: carich_DI.dat
Other Simulations: [1...1000]
Main Simulation: C1bb_07_2018.DATA
Other Simulations: [1...1000]


# Read result files

In [6]:
# Name of the per-run log file.
# NOTE(review): nothing visible in this notebook reads `fname`; the helpers
# below spell out "result.log" literally.
fname = "result.log"

def find_directories_with_result_logs(root_dir, log_name="result.log"):
    """Return every directory at or below ``root_dir`` that directly contains ``log_name``.

    Args:
        root_dir: directory to search recursively.
        log_name: exact file name to look for; defaults to ``"result.log"``.

    Returns:
        A list of ``Path`` objects in ``os.walk`` order (empty if none match).
    """
    return [Path(folder) for folder, _subfolders, names in os.walk(root_dir) if log_name in names]

def iterate_files_recursively(directory):
    """Generate a ``Path`` for each file anywhere under ``directory``.

    NOTE(review): an earlier cell of this notebook already defines a function
    with this name and the same behavior; this definition replaces it once the
    cell runs.
    """
    walker = os.walk(directory)
    for folder, _subdirs, names in walker:
        # Join the folder with each file name, then wrap the result in a Path
        yield from map(Path, (os.path.join(folder, entry) for entry in names))

def find_common_prefix(filenames):
    """Longest shared leading string of the base names in ``filenames``.

    Parent directories are stripped before comparing. Returns ``""`` for an
    empty or missing input.

    NOTE(review): an earlier cell of this notebook already defines a function
    with this name and the same behavior.
    """
    if not filenames:
        return ""

    def base_name(entry):
        return Path(entry).name

    return os.path.commonprefix(list(map(base_name, filenames)))

def sort_filenames(filenames, suffix=None):
    """Order run entries: the main entry first, then numbered variants ascending.

    The main entry is the one whose base name equals ``<common prefix><suffix>``.
    Numbered variants are entries whose base name ends in ``_<integer><suffix>``.
    With an empty suffix this works on directory names as well as file names.

    Args:
        filenames: paths (``str`` or ``Path``) to order.
        suffix: extension to match, e.g. ``".dat"`` or ``""``. When omitted,
            the module-level ``format`` value is used.

    Returns:
        A list of ``Path`` objects: the main entry (if present), then entries
        that do not end with ``suffix`` in their input order, then the numbered
        entries sorted by number. Entries that end with ``suffix`` but have no
        integer after the last ``_`` are left out.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name; this definition replaces it once the cell runs.
    """
    if not filenames:
        return []

    if suffix is None:
        suffix = format  # module-level extension maintained by read_dat_filenames

    paths = [str(filename) for filename in filenames]

    prefix = find_common_prefix(paths)
    # With a single entry the common prefix is the complete name, extension
    # included; appending the suffix again would never match anything.
    if suffix and prefix.endswith(suffix):
        base_file = prefix
    else:
        base_file = prefix + suffix

    # The main entry is the first one whose base name is exactly base_file
    main_index = next(
        (index for index, path in enumerate(paths) if Path(path).name == base_file),
        None,
    )
    sorted_files = [] if main_index is None else [paths[main_index]]

    numbered = []
    for index, path in enumerate(paths):
        if index == main_index:
            continue
        name = Path(path).name
        if not name.endswith(suffix):
            sorted_files.append(path)  # Keep entries with a different extension as they are
            continue
        try:
            number = int(name.removesuffix(suffix).split('_')[-1])
        except ValueError:
            continue  # Not a numbered variant: leave it out
        numbered.append((number, path))

    numbered.sort()
    sorted_files.extend(path for _, path in numbered)

    return [Path(path) for path in sorted_files]

def read_dat_filenames(simulation: Path, verbose: bool = True):
    """List the ``result.log`` files of one simulation together with their run numbers.

    Run directories are those found by ``find_directories_with_result_logs``
    and ordered by ``sort_filenames``. The first directory is treated as the
    main run and gets number 0. Every other directory gets the integer after
    the last ``_`` in its name. No check is made that these numbers are
    consecutive.

    Args:
        simulation: directory of one simulation.
        verbose: when true, print the main run name and the other run numbers.

    Returns:
        A list of ``(path_to_result_log, run_number)`` tuples, or an empty
        list when no ``result.log`` exists below ``simulation``.

    NOTE(review): an earlier cell defines a different function with the same
    name (it lists .dat/.DATA files); this definition replaces it once the
    cell runs.
    """
    global format
    format = ""  # directory names carry no extension; sort_filenames reads this value

    log_dirs = sort_filenames(find_directories_with_result_logs(simulation))

    if not log_dirs:
        if verbose:
            print(f'No result.log files found in {simulation}')
        return []

    # Computed outside the verbose branch: the return value needs it either way
    other_dat_files = [int(f.name.removesuffix(format).split('_')[-1]) for f in log_dirs[1:]]

    if verbose:
        main_dat_file = log_dirs[0]
        print(f'Main Simulation: {main_dat_file.name}')
        print(f'Other Simulations: {other_dat_files}')

    return list(zip([Path(os.path.join(root, "result.log")) for root in log_dirs], [0] + other_dat_files))


In [7]:
from table_parser import parse_table
from block_finder import find_block_with_min_lines

# parse result.log file
def read_res_file(file: Path, number = -1):
    """Parse one result log and tag the parsed record with its run number.

    The block chosen by ``find_block_with_min_lines(file)`` is handed to
    ``parse_table``. The parsed object (presumably a dict - TODO confirm
    against ``table_parser``) gets ``number`` stored under the key ``"no."``
    and is returned.
    """
    block = find_block_with_min_lines(file)
    record = parse_table(block)
    record["no."] = number
    return record


In [8]:
# One DataFrame per simulation: each row is a parsed result.log tagged with its run number ("no.")
parsed_logs = [
    pd.DataFrame(
        [read_res_file(log_path, run_number) for log_path, run_number in read_dat_filenames(sim)]
    )
    for sim in simulations
]


681_HIST_SG1_2_SG5_7
Main Simulation: 681_HIST_SG1_2_SG5_7
Other Simulations: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206,

In [9]:
df_list[0]

Unnamed: 0,MBERRLIM,TOLLIN,TOLVARNEWT,CHECKP,MBERRCTRL,MINNEWTIT
0,,,,,,
1,0.0,0.0001,0.00001,1.0,1.000000e-07,
2,0.0,0.0001,0.00001,1.0,1.000000e-07,0.0
3,0.0,0.0001,0.00001,1.0,1.000000e-06,
4,0.0,0.0001,0.00001,1.0,1.000000e-06,0.0
...,...,...,...,...,...,...
996,2.0,0.0050,0.10000,2.0,1.000000e-05,0.0
997,2.0,0.0050,0.10000,2.0,1.000000e-04,
998,2.0,0.0050,0.10000,2.0,1.000000e-04,0.0
999,2.0,0.0050,0.10000,2.0,1.000000e-03,


# Merge parsed tables

In [10]:
# Join each simulation's parameter table to its result table: the row index of
# the parameter table is matched against the "no." column of the parsed logs.
merged_tables = [
    pd.merge(parameters, parsed_logs[position], left_index=True, right_on='no.')
    for position, parameters in enumerate(df_list)
]


In [11]:
# Discard non-ubiquitous columns

# Keep only the columns present in every merged table. The columns follow the
# order of the first table so that the result (and the CSVs written from it)
# is the same on every run; the order of a list built from a set of strings
# can change between interpreter runs.
if merged_tables:
    shared_columns = set(merged_tables[0].columns).intersection(
        *(table.columns for table in merged_tables[1:])
    )
    res = [column for column in merged_tables[0].columns if column in shared_columns]
else:
    res = []  # nothing to intersect
merged_tables = [table[res] for table in merged_tables]


In [12]:
# Write each merged table to RESULT_DIRECTORY as table_<position>.csv
for table_number, table in enumerate(merged_tables):
    target = os.path.join(RESULT_DIRECTORY, f"table_{table_number}.csv")
    table.to_csv(target)