In [None]:
####### packages #######
import pandas as pd
import re
from pathlib import Path
from io import StringIO
import numpy as np



####### read in test files #######

def read_in_file(file_path):
    """Load a UTF-8 text file and return its lines.

    Prints the file name before reading and the number of lines afterwards.

    Parameters
    ----------
    file_path : pathlib.Path
        File to read; its ``.name`` attribute is used in the progress message.

    Returns
    -------
    list of str
        The lines of the file, each keeping its trailing newline.
    """
    print(f"\nProcessing file: {file_path.name}")
    with open(file_path, "r", encoding="utf-8") as handle:
        # Iterating the handle yields the same lines as readlines().
        lines = list(handle)
    print(f"Read {len(lines)} lines from file")
    return lines



####### separate tests in file #######

# splits the file into per-test lists of lines, stored in a dict keyed by the test header (e.g. "TEST 2238")
def get_tests(file_contents):
    """Group the lines of a file by the test they belong to.

    Every line is upper-cased and stripped. A line containing ``TEST``
    followed by whitespace and four digits starts (or continues) that test;
    all following lines are assigned to it until a different test header is
    seen. Lines appearing before the first test header are discarded.

    Parameters
    ----------
    file_contents : iterable
        Lines of the file; each item is converted with ``str()``.

    Returns
    -------
    dict
        Maps a normalised header such as ``"TEST 2238"`` to the list of
        normalised lines of that test (header lines included), in file order.
    """
    header_pattern = re.compile(r"TEST\s+(\d{4})")
    tests = {}
    current_key = None  # no test header seen yet
    for raw_line in file_contents:
        line = str(raw_line).upper().strip()
        header = header_pattern.search(line)
        if header is not None:
            # Build the key from the digits only, so headers that differ just
            # in the amount of whitespace ("TEST 2238" vs "TEST  2238") land
            # in the same entry instead of splitting one test in two.
            current_key = f"TEST {header.group(1)}"
        if current_key is None:
            continue
        tests.setdefault(current_key, []).append(line)
    return tests



####### separate metadata from test data #######

def get_data(data):
    """Split the lines of one test into tabular data and metadata.

    The tabular region starts on the line after the first line containing
    ``"PAGE"`` (case-insensitive). It ends with the last ``"---"`` line, or
    the final line of ``data``, encountered while inside a page region; that
    closing line is part of the tabular region. Everything outside the region
    is returned as metadata.

    Parameters
    ----------
    data : list of str
        Lines belonging to a single test.

    Returns
    -------
    tuple
        ``(test_data_df, metadata)``: the tabular region parsed by
        ``pandas.read_csv`` with ``sep="|"``, and the remaining lines as a
        list. When there is no tabular content the DataFrame is empty and
        ``metadata`` holds every input line.
    """
    data_start = None
    data_end = None
    in_page = False
    last_index = len(data) - 1
    for index, raw_line in enumerate(data):
        line = str(raw_line).upper()
        if "PAGE" in line:
            if data_start is None:
                # Data begins on the line after the first page marker.
                data_start = index + 1
            in_page = True
        if in_page and (line == "---" or index == last_index):
            in_page = False
            data_end = index + 1

    if data_start is None:
        # No page marker at all: the whole test is metadata.
        test_data = []
        metadata = list(data)
    else:
        test_data = data[data_start:data_end]
        metadata = data[:data_start] + data[data_end:]

    # convert test_data to df
    try:
        test_data_df = pd.read_csv(StringIO("\n".join(test_data)), sep="|")
    except pd.errors.EmptyDataError:
        # read_csv refuses empty input; a test without table rows should not
        # abort processing of the whole file.
        test_data_df = pd.DataFrame()

    return test_data_df, metadata



####### metadata clean and output functions #######

# clean and output metadata as json
def output_json(input):
    """Split metadata lines into individual metadata blocks and print them.

    A line is cut at every run of three or more whitespace characters and at
    every semicolon, whichever comes first. The pieces are collected in
    order; the final piece of a line is kept only if it is not blank. Pieces
    are not stripped, so they may carry leading/trailing spaces or be empty.

    Parameters
    ----------
    input : iterable of str
        Metadata lines of one test.

    Returns
    -------
    list of str
        The metadata blocks (the same list that is printed).
    """
    # One pattern for both separators: the regex engine returns the leftmost
    # match, so the separator closest to the start of the line always wins.
    # (Previously a semicolon was preferred even when a whitespace gap came
    # first, which glued two blocks together.)
    separator = re.compile(r"\s{3,}|;")

    metadata = []
    for line in input:
        *blocks, tail = separator.split(line)
        metadata.extend(blocks)
        if tail.strip() != "":
            metadata.append(tail)
    # metadata = list of metadata blocks as str
    print(metadata)

    # TODO: metadata table checker
    # if line[0] == | then until not | at line[0] add to a table

    # TODO: determine output path
    # Path(OUTPUT_DIR_JSON / str(test_year)).mkdir(parents=True, exist_ok=True)

    # data_output_path = Path(OUTPUT_DIR_JSON) / str(test_year) /f"{Path(file_path).stem}.csv"
    # metadata_output_path = Path(OUTPUT_DIR_JSON) / str(test_year) / f"{Path(file_path).stem}.json"

    return metadata



####### test_data clean and output functions #######

# helper for file with 1 test split across multiple pages 
def merge_col_on_time(df):
    """Placeholder for merging columns of a test that spans several pages.

    Not implemented yet: the argument is handed back untouched.
    """
    # TODO: df.merge(<col_index>=<time>)
    return df

# clean and output test_data as csv
def output_csv(contents):
    """Serialise tabular test data to CSV text.

    Parameters
    ----------
    contents : pandas.DataFrame
        The test data to serialise (any object with a ``to_csv()`` method).

    Returns
    -------
    str
        The CSV text produced by ``contents.to_csv()`` with default options.
    """
    # Call the method; returning ``contents.to_csv`` itself would hand the
    # caller a bound method instead of the CSV text.
    return contents.to_csv()

In [79]:
# Input file and output locations.
# NOTE: despite its name, INPUT_DIR points at a single markdown file.
INPUT_DIR = Path("./multi-test.md")
OUTPUT_DIR_MD = Path("../data/test_output/split_md")
OUTPUT_DIR_CSV = Path("../data/test_output/csvs")
OUTPUT_DIR_JSON = Path("../data/test_output/jsons")

# Make sure the markdown and CSV output folders exist
for output_dir in (OUTPUT_DIR_MD, OUTPUT_DIR_CSV):
    output_dir.mkdir(parents=True, exist_ok=True)


file_contents = read_in_file(INPUT_DIR)

# get_tests -> dict mapping each test header to that test's lines
tests = get_tests(file_contents)
for test, test_lines in tests.items():
    test_data_df, metadata = get_data(test_lines)
    output_json(metadata)
    # output_csv(test_data_df)
    


Processing file: multi-test.md
Read 2199 lines from file
['NBS CONE CALORIMETER', 'VER. DATARED 87.0728', '27 AUG 1986', 'TEST 2238', '.123 IN SAMPLE H', '100 KW/M2', 'HOR', 'TEST 2238', 'IRRADIANCE = 100 KW/M2', 'ORIENTATION WAS HORIZONTAL', 'PRE-TEST COMMENTS:', 'AL FOIL PAN.', 'NO SPARK IGN.', 'HYDROCARBON ANALYZER SET AT 10000 PPM = 10 VOLTS.', 'CO ANALYZER SET AT 5000 PPM = 10 VOLTS.', 'MASS RATIO AVERAGE= 466', 'SOOT AVERAGE= .024907 G/G', 'TIME TO IGNITION= 16.5 S', 'INITIAL MASS= 33.2 G ', 'FINAL MASS= 0.0 G', 'MASS CONSUMED= 33.2 G - 100.0 %', 'PEAK Q-DOT= 1516 KW/M2 AT 65 S', 'PEAK M-DOT= 49.39 G/S-M2 AT 60 S', 'M-DOT= 48.06 G/S-M2 AT 65 S', 'TOTAL HEAT RELEASE= 100 MJ/M2', '| YIELD | TEST AVG. | AT PEAK TIME ( 65 S) | |', '|-------|-----------|------------------------|--|', '| CO | .0872 | .0800 | KG/KG |', '| CO2 | 2.149 | 2.247 | KG/KG |', '| H2O | .746 | .621 | KG/KG |', '| HCL | 0.0000 | 0.0000 | KG/KG |', "| H'CARBS | .0668 | .0571 | KG/KG |", '| HEAT OF COMB. | 29.9 |