# Run the first 2 cells:

_Cell 1: from jupyterlab template_: run it

In [1]:
import sys
from pathlib import Path
import time
import numpy as np
from pprint import pprint as pp
import matplotlib as mpl
from matplotlib import pyplot as plt
plt.ion()
#plt.style.use('seaborn-v0_8-muted')
#from IPython.display import HTML, Markdown #, IFrame

# Display the value of every top-level expression in a cell (not only the
# last one), so several results can be shown without wrapping them in print():
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# autoreload extension: load it only if it is not already loaded, so that
# re-running this cell does not attempt to load it a second time.
from IPython import get_ipython

ipython = get_ipython()
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload imported modules before executing code, so that edits made
# to the local modules are picked up without restarting the kernel.
%autoreload 2

# -----------------------------------------
# TWO USEFUL FUNCTIONS:

def add_to_sys_path(this_path, up=False):
    """
    Make a folder importable by inserting it into sys.path at index 1.

    Parameters:
        this_path (str or Path): The reference folder.
        up (bool): If True, the parent of this_path (1 level up) is used
            instead of this_path itself.

    If the selected folder contains a "src" subfolder, that subfolder is
    the one inserted; otherwise the selected folder itself is inserted.
    Nothing happens if the resulting path is already in sys.path; when a
    path is inserted, a confirmation message is printed.
    """
    base_dir = Path(this_path).parent if up else Path(this_path)
    src_dir = base_dir / "src"
    target = str(src_dir if src_dir.exists() else base_dir)

    # Already importable: leave sys.path untouched and stay silent.
    if target in sys.path:
        return

    sys.path.insert(1, target)
    print('Path added to sys.path: {}'.format(target))

# Filtered dir() for method discovery:
def fdir(obj, start_with_str='_', exclude=True):
    """
    Return the names from dir(obj), filtered on a prefix.

    Parameters:
        obj: Any object accepted by dir().
        start_with_str (str): The prefix to test each name against.
        exclude (bool): If True, names starting with the prefix are left
            out; if False, only names starting with the prefix are kept.
    """
    selected = []
    for name in dir(obj):
        has_prefix = name.startswith(start_with_str)
        # Keep the name when its prefix status differs from `exclude`.
        if has_prefix != exclude:
            selected.append(name)
    return selected


_Cell 2: from jupyterlab template_: Uncomment (and amend it) to enable import of local modules.

In [2]:
# Make the project's local modules importable: with up=True, the parent of the
# current working directory is used, and its "src" subfolder (if it exists) is
# what gets inserted into sys.path, so that modules in ../src can be imported.
# CHANGE THIS IF NEEDED:

add_to_sys_path(Path.cwd(), up=True)

Path added to sys.path: /home/cat/projects/MCCE_Scikit/src


---
---

# MCCE - MS Sampling (using test data in ../tests/data/)
---
# Workflow for producing a collection of pdbs from sampled microstates
#### 5 steps to pdbs!

### 1. Necessary imports
```
from pathlib import Path
import numpy as np
import time         # only needed if you want to time a process

import base
import mcce_io as io
import ms_sampling as sampling
```
### 2. Path definition
 * mcce_output_path: path to the folder where an MCCE simulation was run. It must contain step2_out.pdb, head3.lst and the ms_out directory.

### 3. MS class instantiation:
 * The class needs values for pH and Eh in addition to the MCCE output path.
 * Call using variables:
```
pH, Eh = 5.0, 0.0   # can be 5, 0 (int) as well
ms = base.MS(mcce_output_path, pH, Eh)
```

### 4. Define arguments for sampling and pdb creation:
 - sample size
 - "sort by" key
 - output folder (optional: if not given, the output is given by ms.msout_file_dir)

#### Note: Rationale for using a folder created from the msout file, e.g. pH5eH0ms/:
The pdb file names only carry the selected MC run and the microstate index as identifiers; hence,  
a file must be opened to obtain its T, pH and Eh information, so keeping the files in a folder  
named after the msout file they come from is the simplest way to keep things tidy.

```
n_sample_size = 4
ms_sort_by = "energy"

# optional:
output_dir = some_folder_path  # defaults to ms.msout_file_dir if not given
```

### 5. Function call to create pdbs from sampled ms:
```
start_time = time.time()        # optional

sampling.pdbs_from_ms_samples(ms,
                              mcce_output_path,
                              n_sample_size,
                              ms_sort_by,
                              output_dir,
                              clear_pdbs_folder=True,  # default:True
                              list_files=True          # default:False
                            )

# next 2 lines: # optional
d = time.time() - start_time    
print(f"`ms_sampling.pdbs_from_ms_samples` with sample size={n_sample_size:,} took {d/60:.2f} mins or {d:.2f} seconds")
```

---
# Example using repo data

In [3]:
import base
import mcce_io as io
import ms_sampling as sampling

In [4]:
# MCCE output folder used in this example: "tests/data" under the parent of
# the current working directory. The bare expression displays the resolved path.
mcce_output_path = Path.cwd().parent.joinpath("tests/data")
mcce_output_path

# List the folder content to check that the expected MCCE files are present:
!ls {mcce_output_path}

PosixPath('/home/cat/projects/MCCE_Scikit/tests/data')

head3.lst  ms_out  run.prm.record  step2_out.pdb


# base.MS class

In [5]:
# Show the class docstring, then the constructor docstring, one per line.
print(base.MS.__doc__, base.MS.__init__.__doc__, sep="\n")

Uses split ms_out files.
MS.init

        Parameters:
            mcce_output_path (str): A MCCE simulation output folder.
            pH (int or float): A pH point.
            Eh (int or float): A Eh point.
            selected_MC (int): The index of an MC run; one of `range(constants.MONTERUNS)`.
            overwrite_split_files (bool): whether to redo the splitting of msout_file.
        


In [7]:
# Create the MS instance for one (pH, Eh) point and report how long loading took.

pH, Eh = 5.0, 0.0

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the cell runs.
start_time = time.perf_counter()

ms = base.MS(mcce_output_path, pH, Eh)

d = time.perf_counter() - start_time
print(f"Loading of base.MS instance took {d/60:.2f} mins or {d:.2f} seconds")
print(ms)

Loading of base.MS instance took 0.14 mins or 8.54 seconds
MS("/home/cat/projects/MCCE_Scikit/tests/data", 5.0, 0.0, selected_MC=0, overwrite_split_files=False)


In [8]:
# Public vars in MS:  (uncomment to view)
#fdir(ms)

# Call to `ms_sampling.pdbs_from_ms_samples`

In [9]:
# Public names in the sampling module:  (uncomment to view)
#fdir(sampling)

In [10]:
# Create pdbs from sampled ms and report how long the call took.
# output_dir is not passed here, so the function's default output location is used.

n_sample_size = 4
ms_sort_by = "energy"

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the cell runs.
start_time = time.perf_counter()

sampling.pdbs_from_ms_samples(ms,
                              mcce_output_path,
                              n_sample_size,
                              ms_sort_by,
                              clear_pdbs_folder=True,  # default:True
                              list_files=True          # default:False
                            )

d = time.perf_counter() - start_time
print(f"`ms_sampling.pdbs_from_ms_samples` with sample size={n_sample_size:,} took {d/60:.2f} mins or {d:.2f} seconds")

Creating n=4 MCCE_PDB files in /home/cat/projects/MCCE_Scikit/tests/data/ms_out/pH5eH0ms from (n) microstates sorted by 'energy'.
 NOTE: the output pdb will be free of any water molecules in step2_out.pdb.
PDB files creation over.
Files in /home/cat/projects/MCCE_Scikit/tests/data/ms_out/pH5eH0ms/pdbs_from_ms:

	 /home/cat/projects/MCCE_Scikit/tests/data/ms_out/pH5eH0ms/pdbs_from_ms/mc0_ms230849.pdb
	 /home/cat/projects/MCCE_Scikit/tests/data/ms_out/pH5eH0ms/pdbs_from_ms/mc0_ms114217.pdb
	 /home/cat/projects/MCCE_Scikit/tests/data/ms_out/pH5eH0ms/pdbs_from_ms/mc0_ms1.pdb
	 /home/cat/projects/MCCE_Scikit/tests/data/ms_out/pH5eH0ms/pdbs_from_ms/mc0_ms349302.pdb
`ms_sampling.pdbs_from_ms_samples` with sample size=4 took 0.02 mins or 1.07 seconds


# Inspect a pdb head:

In [11]:
# Show the first 20 lines of one of the pdb files created above. The path is
# relative to the notebook's working directory and hard-codes the pH5eH0ms
# folder and the mc0_ms1.pdb file name; amend it if the sampling call changes.
!head -n 20 ../tests/data/ms_out/pH5eH0ms/pdbs_from_ms/mc0_ms1.pdb


REMARK 250
REMARK 250 EXPERIMENTAL DETAILS
REMARK 250   EXPERIMENT TYPE               : MCCE simulation
REMARK 250   DATE OF DATA COLLECTION       : 26-Oct-23
REMARK 250   REMARK: DATE OF DATA COLLECTION is the date this pdb was created.
REMARK 250 EXPERIMENTAL CONDITIONS
REMARK 250   TEMPERATURE                   : 298.15 (K)
REMARK 250   PH                            : 5.00
REMARK 250   EH                            : 0.00
REMARK 250   METHOD                        : MONTERUNS
REMARK 250   SELECTED MONTERUN             : 0
REMARK 250   SELECTED MICROSTATE INDEX     : 1
REMARK 250   SELECTED MICROSTATE ENERGY    : 202.89 (kcal/mol)
REMARK 250
ATOM      1  CA  NTR A0001_001   2.696   5.785  12.711   2.000       0.001      01O000M000 
ATOM      2  HA  NTR A0001_001   3.149   5.444  11.801   0.000       0.000      01O000M000 
ATOM      3  N   NTR A0001_001   2.812   4.829  13.856   1.500      -0.003      01O000M000 
ATOM      4  H   NTR A0001_001   2.419   3.912  13.784   1.000       0.