# StarExec Data Extraction

#### Imports

In [1]:
import pandas as pd
import os
import cProfile
import re
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration: root folder of the downloaded StarExec outputs, the job to
# analyse, and the derived output folder of that job (with trailing slash).
starexec_outputs = "data/starexec_outputs"
job_index = 5197
problem_folder = f"{starexec_outputs}/Job{job_index}_output/"

### StarExec CSV output

In [2]:
# Read the job-level info CSV that lives directly inside the job's output folder
# and display it as the cell result.
df_se_csv = pd.read_csv(f"{problem_folder}Job{job_index}_info.csv")
df_se_csv
# Disabled plot of cpu time vs. wallclock time, ordered by wallclock time:
# df_se_csv.sort_values("wallclock time").reset_index(drop=True)[["cpu time","wallclock time"]].plot(style=".")
# plt.legend()

Unnamed: 0,pair id,benchmark,benchmark id,solver,solver id,configuration,configuration id,status,cpu time,wallclock time,memory usage,result,expected,SZSResult,SZSOutput,SZSStatus
0,131414576,jannis_gehring/TPTP v9.0.0/Problems/TOP/TOP008...,4193021,2.0.0,3773,PyRes_rd_002,400684,timeout (cpu),5.02,5.19192,329012.0,--,Satisfiable,UNK-Non,Non,UNK
1,131414577,jannis_gehring/TPTP v9.0.0/Problems/TOP/TOP008...,4193021,2.0.0,3773,PyRes_rd_003,400687,timeout (cpu),5.01,5.17461,327988.0,ResourceOut,Satisfiable,UNK-Non,Non,UNK
2,131414578,jannis_gehring/TPTP v9.0.0/Problems/TOP/TOP008...,4193021,2.0.0,3773,PyRes_rd_005,400685,timeout (cpu),5.03,5.20651,324916.0,ResourceOut,Satisfiable,UNK-Non,Non,UNK
3,131414579,jannis_gehring/TPTP v9.0.0/Problems/TOP/TOP008...,4193021,2.0.0,3773,PyRes_rd_008,400681,timeout (cpu),5.03,5.20119,326964.0,ResourceOut,Satisfiable,UNK-Non,Non,UNK
4,131414580,jannis_gehring/TPTP v9.0.0/Problems/TOP/TOP008...,4193021,2.0.0,3773,PyRes_rd_013,400683,timeout (cpu),5.01,5.17801,326964.0,ResourceOut,Satisfiable,UNK-Non,Non,UNK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33323,131447899,jannis_gehring/TPTP v9.0.0/Problems/MED/MED009...,4178902,2.0.0,3773,PyRes_rd_008,400681,timeout (cpu),5.02,5.20418,286628.0,ResourceOut,Theorem,UNK-Non,Non,UNK
33324,131447900,jannis_gehring/TPTP v9.0.0/Problems/MED/MED009...,4178902,2.0.0,3773,PyRes_rd_013,400683,timeout (cpu),5.03,5.33047,286808.0,ResourceOut,Theorem,UNK-Non,Non,UNK
33325,131447901,jannis_gehring/TPTP v9.0.0/Problems/MED/MED009...,4178902,2.0.0,3773,PyRes_rd_021,400688,timeout (cpu),5.02,5.19421,286432.0,ResourceOut,Theorem,UNK-Non,Non,UNK
33326,131447902,jannis_gehring/TPTP v9.0.0/Problems/MED/MED009...,4178902,2.0.0,3773,PyRes_rd_033,400686,timeout (cpu),5.01,5.23310,286660.0,ResourceOut,Theorem,UNK-Non,Non,UNK


### StarExec individual job files

In [None]:
# Build `df_se_indi`: one row per individual job output file found below
# `problem_folder`, with fields parsed out of the file text via regular expressions.
#   load_from_file -- if True, read the previously stored CSV instead of re-parsing.
#   store          -- if True, write `df_se_indi` to the CSV inside `problem_folder`.
load_from_file = False
store = True
if load_from_file:
    df_se_indi = pd.read_csv(problem_folder + f"df_se_indi_{str(job_index)}.csv", index_col=0)
else:
    print(f"Searching in {problem_folder}")

    # Get file texts: take the first file of every leaf directory (a directory
    # without sub-directories). Leaf directories that contain no file at all are
    # skipped, because `files[0]` would raise an IndexError for them.
    file_dir_names = [
        subdir + "/" + files[0]
        for (subdir, dirs, files) in os.walk(problem_folder + "jannis_gehring")
        if len(dirs) == 0 and len(files) > 0
    ]
    texts = []
    for full_file_name in tqdm(file_dir_names):
        if not os.path.isfile(full_file_name):
            # Keep a placeholder so `texts` stays aligned with `file_dir_names`.
            texts.append(None)
            continue
        # Bind the handle to its own name instead of rebinding the loop variable.
        with open(full_file_name) as job_file:
            texts.append(job_file.read())
    df_se_indi = pd.DataFrame({
        "file_dir_name": file_dir_names,
        "text": texts,
    })

    # Search through problems with regex.
    # Each spec is (new column name, regex whose whole match is the value, converter).
    # TODO (carried over from the original author): fix -- these regex patterns
    # reportedly do not allow for lines starting with 79.84/10.64
    new_column_specs = [
        ("problem",                         r"(?<=Problem    : )(\S+)(?= : )",               str),
        ("python_version",                  r"(?<=Python )(\S+)",                            str),
        ("cpu_limit",                       r"(?<=% CPULimit   : )\S+",                      int),
        ("pyres_version",                   r"(?<=% Version:  )(\S+)",                       str),
        ("rel_distance",                    r"(?<=# rel_distance: )\S+",                     int),
        ("graph_construction_time",         r"(?<=# graph_construction_time: )(\S+)",        float),
        ("neighbourhood_computation_time",  r"(?<=# neighbourhood_computation_time: )(\S+)", float),
        ("szs_status",                      r"(?<=% SZS status )(\S+)",                      str),
        ("initial_clauses",                 r"(?<=% Initial clauses    : )(\S+)",            int),
        ("processed_clauses",               r"(?<=% Processed clauses  : )(\S+)",            int),
        ("factors_computed",                r"(?<=% Factors computed   : )(\S+)",            int),
        ("resolvents_computed",             r"(?<=% Resolvents computed: )(\S+)",            int),
        ("tautologies_deleted",             r"(?<=% Tautologies deleted: )(\S+)",            int),
        ("forward_subsumed",                r"(?<=% Forward subsumed   : )(\S+)",            int),
        ("backward_subsumed",               r"(?<=% Backward subsumed  : )(\S+)",            int),
        ("user_time",                       r"(?<=% User time          : )(\S+)",            float),
        ("system_time",                     r"(?<=% System time        : )(\S+)",            float),
        ("total_time",                      r"(?<=% Total time         : )(\S+)",            float),
    ]

    def _extract_field(text, compiled_pattern, col_type):
        """Return the first match of `compiled_pattern` in `text`, converted with `col_type`.

        Returns None when `text` is not a string (file could not be read) or when
        the pattern does not occur in it.
        """
        if not isinstance(text, str):
            return None
        match = compiled_pattern.search(text)
        if match is None:
            return None
        return col_type(match[0])

    for new_column_name, pattern, col_type in tqdm(new_column_specs, desc="Searching for regex patterns"):
        # Compile once per column rather than rebuilding the pattern string per row.
        compiled_pattern = re.compile(pattern)
        df_se_indi[new_column_name] = df_se_indi["text"].apply(
            _extract_field, args=(compiled_pattern, col_type)
        )
    # The raw file contents are only needed for extraction; drop them afterwards.
    df_se_indi = df_se_indi.drop(["text"], axis="columns")

if store:
    df_se_indi.to_csv(problem_folder + f"df_se_indi_{str(job_index)}.csv")

df_se_indi

Searching in data/starexec_outputs/Job5197_output/


 62%|██████▏   | 20517/33328 [00:37<30:16,  7.05it/s]  

In [17]:

# Scratch cell: parse a local XML file with pandas and display the result.
# NOTE(review): the file looks like a StarExec job XML download -- confirm.
# The disabled line below writes a small dummy frame to "hello.xml":
# pd.DataFrame([["foo","bar","foo","foo","foo","foo","bar","fbr"],["foo","bar","foo","foo","foo","foo","bar","fbr"],["foo","bar","foo","foo","foo","foo","bar","fbr"]], columns=["Hello", "Hello1",  "Hello1", "hello",  "Hello1",  "Hello1",  "Hello1",  "Hello1"]).to_xml("hello.xml")

pd.read_xml(path_or_buffer="test_xml_download.xml")

Unnamed: 0,name,JobAttributes,JobPair
0,test_xml_download,\n,
