In [None]:
import os
import sys
import subprocess
import signac
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statistics import linear_regression
from scipy.stats import linregress
from pymser import pymser

In [None]:
def plot_res_pymser(job, eq_col, results, name, box_name, eq_fig = False):
    """Plot a property series and its histogram with the pymser result overlaid.

    Left panel: the raw series, the equilibrated portion (from ``results['t0']``
    onward), and the equilibrated average with a +/- uncertainty band.
    Right panel (shared y axis): horizontal histograms of the raw and the
    equilibrated data, with the same average line and band.
    The figure is displayed with ``plt.show()`` and then closed.

    Parameters
    ----------
    job : object
        Not used by the active code; only the disabled save step refers to it.
    eq_col : array-like
        1-D series of the monitored property, one value per recorded step.
    results : mapping
        Must provide the keys ``'t0'``, ``'equilibrated'``, ``'average'`` and
        ``'uncertainty'`` (the values read below).
    name : str
        Label for the y axis.
    box_name : str
        Only used in the file name of the disabled save step.
    eq_fig : bool, optional
        Currently unused; kept so existing callers keep working.
    """
    eq_col = np.asarray(eq_col)
    n_steps = len(eq_col)
    steps = range(n_steps)
    average = results['average']
    lower = average - results['uncertainty']
    upper = average + results['uncertainty']

    fig, [ax1, ax2] = plt.subplots(1, 2, gridspec_kw={'width_ratios': [2, 1]}, sharey=True)

    ax1.set_ylabel(name, color="black", fontsize=14, fontweight='bold')
    ax1.set_xlabel("GEMC step", fontsize=14, fontweight='bold')

    ax1.plot(steps,
            eq_col,
            label = 'Raw data',
            color='blue')

    ax1.plot(steps[results['t0']:],
            results['equilibrated'],
            label = 'Equilibrated data',
            color='red')

    ax1.plot([0, n_steps],
            [average, average],
            color='green', zorder=4,
            label='Equilibrated average')

    ax1.fill_between(steps,
                    lower,
                    upper,
                    color='lightgreen', alpha=0.3, zorder=4)

    # Custom ticks only make sense for a positive maximum: a maximum of zero
    # gives np.arange a zero step (error) and a negative one gives no ticks.
    # In those cases (or for empty/NaN data) keep matplotlib's default ticks.
    if eq_col.size and eq_col.max() > 0:
        col_max = eq_col.max()
        ax1.set_yticks(np.arange(0, col_max*1.1, col_max/10))
    ax1.set_xlim(-n_steps*0.02, n_steps*1.02)
    ax1.tick_params(axis="y", labelcolor="black")

    ax1.grid(alpha=0.3)
    ax1.legend()

    ax2.hist(eq_col,
            orientation=u'horizontal',
            bins=30,
            edgecolor='blue',
            lw=1.5,
            facecolor='white',
            zorder=3)

    ax2.hist(results['equilibrated'],
            orientation=u'horizontal',
            bins=3,
            edgecolor='red',
            lw=1.5,
            facecolor='white',
            zorder=3)

    # Upper count limit chosen by matplotlib after both histograms are drawn
    ymax = int(ax2.get_xlim()[-1])

    ax2.plot([0, ymax],
            [average, average],
            color='green', zorder=4, label='Equilibrated average')

    # Span the band over the full [0, ymax] width (range(ymax) stopped at
    # ymax - 1 and was empty when ymax was 0).
    ax2.fill_between([0, ymax],
                    lower,
                    upper,
                    color='lightgreen', alpha=0.3, zorder=4)

    ax2.set_xlim(0, ymax)

    ax2.grid(alpha=0.5, zorder=1)

    fig.set_size_inches(9,5)
    fig.set_dpi(100)
    fig.tight_layout()
    # Saving is disabled. To persist the figure, call this before plt.show():
    #     fig.savefig(job.fn('MSER_eq_' + box_name + '.png'), dpi=300, facecolor='white')
    plt.show()
    # Release the figure so looping over many jobs does not accumulate open figures
    plt.close(fig)

In [None]:
def check_equil_converge(job, eq_data_dict, prod_tol):
    """Decide whether every monitored series has equilibrated according to pymser.

    For each entry of ``eq_data_dict`` the series is passed to
    ``pymser.equilibrate`` (first with the ADF test, then without it if that
    call raises). A series counts as equilibrated when at least ``prod_tol``
    points remain after ``results['t0']`` and the ADF statistic is below the
    1% critical value (the ADF criterion is skipped when the test could not
    be completed). When all series pass, each one is plotted with
    ``plot_res_pymser``.

    Parameters
    ----------
    job : object
        Forwarded to ``plot_res_pymser``; ``job.id`` is used in the error message.
    eq_data_dict : dict
        Maps a key of the form ``"<box name>_<property index>"`` to a dict
        whose ``"data"`` entry is the 1-D series to test.
    prod_tol : int
        Minimum number of points that must remain after ``t0``.

    Returns
    -------
    bool
        True only if there is at least one series and all of them pass.

    Raises
    ------
    Exception
        Any error raised while processing is re-raised with the job id
        prepended, chained to the original error.
    """
    equil_matrix = []
    res_matrix = []
    prop_cols = [5]
    prop_names = ["Number of Moles"]
    try:
        # Run pymser on every series (one per box/property)
        for eq_entry in eq_data_dict.values():
            eq_col = eq_entry["data"]
            batch_size = max(1, int(len(eq_col) * 0.0005))

            # Try with ADF test enabled, fall back without it if it fails.
            # Only ordinary errors trigger the fallback; KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            try:
                results = pymser.equilibrate(eq_col, LLM=False, batch_size=batch_size, ADF_test=True, uncertainty='uSD', print_results=False)
                adf_test_failed = results["critical_values"]["1%"] <= results["adf"]
            except Exception:
                results = pymser.equilibrate(eq_col, LLM=False, batch_size=batch_size, ADF_test=False, uncertainty='uSD', print_results=False)
                results["adf"], results["critical_values"], adf_test_failed = None, None, False

            equilibrium = len(eq_col) - results['t0'] >= prod_tol
            equil_matrix.append(equilibrium and not adf_test_failed)
            res_matrix.append(results)

        # No data means equilibration cannot be claimed (all([]) would be True)
        all_equilibrated = bool(equil_matrix) and all(equil_matrix)

        for i, (key_name, eq_entry) in enumerate(eq_data_dict.items()):
            results = res_matrix[i]
            box_name = key_name.rsplit("_", 1)[0]
            col_vals = eq_entry["data"]

            # Plot every series, but only when all of them passed
            if all_equilibrated:
                plot_res_pymser(job, col_vals, results, prop_names[i % len(prop_cols)], box_name)

            # Build a human-readable outcome; printing is disabled below
            prod_cycles = len(col_vals) - results['t0']
            if equil_matrix[i]:
                statement = f"       > Success! Found {prod_cycles} production cycles."
            else:
                statement = f"       > {box_name} Box Failure! "
                if results["adf"] is None:
                    # The ADF test raised and the fallback without it was used
                    statement += f"ADF test failed to complete! "
                elif results['adf'] >= results['critical_values']['1%']:
                    # Same comparison (including equality) as the failure test above
                    adf, one_pct = results['adf'], results['critical_values']['1%']
                    statement += f"ADF value: {adf}, 99% confidence value: {one_pct}! "
                if prod_cycles < prod_tol:
                    statement += f"Only {prod_cycles} production cycles found."

            # print(statement)

    except Exception as e:
        # This will cause an error in the GEMC operation which lets us know that the job failed
        raise Exception(f"Error processing job {job.id}: {e}") from e

    return all_equilibrated

In [None]:
def run_gemc(job):
    """Replay the equilibration-extension loop on a job's existing output.

    Loads column ``prop_cols`` (1-based) of ``gemc.eq.out.box1.prp`` and
    ``gemc.eq.out.box2.prp``, then repeatedly calls ``check_equil_converge``.
    After each failed check the last quarter of the same files is appended to
    the accumulated series to imitate a restart. On success
    ``"Running production"`` is printed. After ``max_checks`` failed checks an
    exception is raised internally and reported as a complete failure.

    Ordinary exceptions are caught and reported with ``print``;
    ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller.

    Parameters
    ----------
    job : object
        Must provide ``sp.nsteps_eq``, ``sp.mol_name``, ``sp.T``, ``fn(name)``
        and act as a context manager (these are the members used below).

    Returns
    -------
    None
    """
    # Checks allowed: the initial data plus 12 extensions of 25% each, i.e.
    # at most 4x the original number of equilibration steps.
    max_checks = 13
    use_crit = False
    try:
        # Initialize counter and number of eq_steps
        count = 1
        # Running total of equilibration steps (not written to the job document here)
        total_eq_steps = job.sp.nsteps_eq
        prod_tol = int(job.sp.nsteps_eq/4)
        with job:

            prop_cols = [5] #Use number of moles to decide equilibrium
            # Load initial eq data from both boxes
            df_box1 = np.genfromtxt(job.fn("gemc.eq.out.box1.prp"))
            df_box2 = np.genfromtxt(job.fn("gemc.eq.out.box2.prp"))

            # Collect the monitored column(s) of both boxes, keyed by box and property
            eq_data_dict = {}
            for b, box in enumerate([df_box1, df_box2]):
                box_name = "Liquid" if b == 0 else "Vapor"
                for prop_index in prop_cols:
                    eq_col = box[:, prop_index - 1]
                    key = f"{box_name}_{prop_index}"
                    # Path where the column could be saved (saving is disabled)
                    eq_col_file = job.fn(f"{box_name}_eq_col_{prop_index}.csv")
                    # np.savetxt(eq_col_file, eq_col, delimiter=",")
                    eq_data_dict[key] = {"data": eq_col, "file": eq_col_file}

            # Tolerance is a quarter of the initial series length; `key` is the
            # last key built above. It is not updated as the series grows.
            prod_tol_eq = int(eq_data_dict[key]["data"].size/4)

            while count <= max_checks:
                # Check if equilibration is reached via the pymser algorithms
                is_equil = check_equil_converge(job, eq_data_dict, prod_tol_eq)
                # If equilibration is reached, break the loop and start production
                if is_equil:
                    break

                # Increase the total number of eq steps by 25% of the original value
                total_eq_steps += int(prod_tol)
                # If we've exceeded the maximum number of checks, raise an exception.
                # The handler below then reports complete GEMC failure.
                if count == max_checks:
                    # job.doc.equil_fail = True
                    use_crit = True
                    raise Exception(f"GEMC equilibration failed to converge after {job.sp.nsteps_eq*4} steps")

                # Otherwise continue equilibration. No simulation is run here:
                # the last quarter of the existing output stands in for restart data.
                df_box1r = np.genfromtxt(job.fn("gemc.eq.out.box1.prp"))[int(3*len(df_box1)/4):]
                df_box2r = np.genfromtxt(job.fn("gemc.eq.out.box2.prp"))[int(3*len(df_box2)/4):]

                # Append the restart data to the accumulated series of each box/property
                for b, box in enumerate([df_box1r, df_box2r]):
                    box_name = "Liquid" if b == 0 else "Vapor"
                    for prop_index in prop_cols:
                        key = f"{box_name}_{prop_index}"
                        eq_col_restart = box[:, prop_index - 1]
                        eq_col_data = eq_data_dict[key]["data"]
                        all_eq_data = np.concatenate((eq_col_data, eq_col_restart))
                        # np.savetxt(eq_data_dict[key]["file"], all_eq_data, delimiter=",")
                        eq_data_dict[key]["data"] = all_eq_data
                # Increase the counter
                count += 1

            # Run production
            print("Running production")
    except Exception as err:
        # Only ordinary errors are handled here, so Ctrl-C still stops the notebook
        if use_crit:
            # Equilibration never converged: report complete failure
            # job.doc.gemc_failed = True
            print("GEMC failed with critical and experimental starting conditions and the molecule is " + job.sp.mol_name + " at temperature " + str(job.sp.T))
        else:
            # Otherwise, a retry with critical conditions would be attempted
            print("Reatart w/ Critical Conditions")
        # Show what actually went wrong instead of hiding it
        print("Reason:", repr(err))


In [None]:
import glob
def process_old_job(jobs):
    """Re-check the equilibration output of existing jobs and report the outcome.

    For every job, column ``prop_cols`` (1-based) of ``gemc.eq.out.box1.prp``
    and ``gemc.eq.out.box2.prp`` is tested with ``check_equil_converge``.
    If the job passes, the names that its ``gemc.eq.rst.001*`` files would get
    after renaming to ``prod*`` are printed. Otherwise a message is printed.
    No file is renamed or deleted and the job document is not changed; those
    steps are disabled.

    Ordinary exceptions are reported per job and processing continues with
    the next job; ``KeyboardInterrupt`` and ``SystemExit`` propagate.

    Parameters
    ----------
    jobs : iterable
        Jobs providing ``id``, ``sp.atom_type``, ``sp.T``, ``fn(name)`` and
        context-manager support (the members used below).

    Returns
    -------
    None
    """
    for job in jobs:
        print("ID", job.id, "AT", job.sp.atom_type, "T", job.sp.T)
        prop_cols = [5] #Use number of moles to decide equilibrium
        try:
            with job:
                # Load initial eq data from both boxes
                df_box1 = np.genfromtxt(job.fn("gemc.eq.out.box1.prp"))
                df_box2 = np.genfromtxt(job.fn("gemc.eq.out.box2.prp"))
                print(df_box1.shape, df_box2.shape)

                # Collect the monitored column(s) of both boxes, keyed by box and property
                eq_data_dict = {}
                for b, box in enumerate([df_box1, df_box2]):
                    box_name = "Liquid" if b == 0 else "Vapor"
                    for prop_index in prop_cols:
                        eq_col = box[:, prop_index - 1]
                        key = f"{box_name}_{prop_index}"
                        # Path where the column could be saved (saving is disabled)
                        eq_col_file = job.fn(f"{box_name}_eq_col_{prop_index}.csv")
                        # np.savetxt(eq_col_file, eq_col, delimiter=",")
                        eq_data_dict[key] = {"data": eq_col, "file": eq_col_file}

                # Tolerance is a quarter of the series length of the last key built above
                prod_tol_eq = int(eq_data_dict[key]["data"].size/4)
                is_equilibrated = check_equil_converge(job, eq_data_dict, prod_tol_eq)

                if is_equilibrated:
                    print("Equilibrated setting job doc nsteps")
                    # The pattern is relative, so it is resolved against the
                    # current working directory inside the `with job:` block.
                    pattern = "gemc.eq.rst.001*"
                    for filename in glob.glob(pattern):
                        # Name the file would get after replacing the prefix
                        new_name = filename.replace("gemc.eq.rst.001", "prod", 1)
                        print(new_name)
                        # os.rename(filename, new_name)
                    # job.doc.nsteps_eq = job.sp.nsteps_eq
                else:
                    # Already inside `with job:`, so no second context is needed
                    print("Deleting production data")
                    # if "nmols_vap" in job.doc:
                    #     del job.doc["nmols_vap"]
                    # for file_path in glob.glob(os.path.join(job.fn(""), "gemc.eq.rst.001*")):
                    #     print("Deleting", file_path)
                    #     os.remove(file_path)
        except Exception as err:
            # Report the cause instead of hiding it, then move on to the next job
            print("Error processing job", job.id, "-", repr(err))


In [None]:
import glob
mol_name = "R50"
project = signac.get_project("opt_ff_ms")
# Every job in the project is processed. The narrower query that used to be
# assigned here was immediately overwritten, so it is kept only as a reference:
# jobs = project.find_jobs({"mol_name": mol_name, "T": 130, "atom_type": 1})
jobs = project.find_jobs()
# for job in jobs:
#     print(job.id)
process_old_job(jobs)

In [None]:
import glob
mol_name = "R170"
project = signac.get_project("opt_ff_ms")
# All jobs of this molecule are run. The narrower queries below were either
# overwritten before use or already disabled; they are kept only as a reference:
# jobs = project.find_jobs({"mol_name": mol_name, "T": 250, "atom_type": 8})
# jobs = list(project.find_jobs({"mol_name": mol_name, "T": 290, "atom_type": 6}))[0]
# jobs = list(project.find_jobs({"mol_name": mol_name, "T": 230, "atom_type": 1}))[0]
jobs = project.find_jobs({"mol_name": mol_name})
for job in jobs:
    # print("ID", job.id, "AT", job.sp.atom_type, "T", job.sp.T)
    run_gemc(job)