In [3]:
from openff.evaluator.datasets.thermoml import ThermoMLDataSet



****** PyMBAR will use 64-bit JAX! *******
* JAX is currently set to 32-bit bitsize *
* which is its default.                  *
*                                        *
* PyMBAR requires 64-bit mode and WILL   *
* enable JAX's 64-bit mode when called.  *
*                                        *
* This MAY cause problems with other     *
* Uses of JAX in the same code.          *
******************************************



In [4]:
# Build the starting data set from the ThermoML record published under this DOI.
data_set = ThermoMLDataSet.from_doi("10.1021/je1013476")

In [5]:
# Number of physical properties currently held in the data set.
len(data_set)

271

In [6]:
# Number of entries in the data set's `substances` collection.
len(data_set.substances)

251

In [7]:
# Show which property types are present before any filtering is applied.
print(data_set.property_types)

{'Density', 'EnthalpyOfMixing'}


In [8]:
from openff.evaluator.datasets.curation.components.filtering import (
    FilterByPropertyTypes,
    FilterByPropertyTypesSchema,
)

# Restrict the data set to density measurements only.
density_only_schema = FilterByPropertyTypesSchema(property_types=["Density"])
data_set = FilterByPropertyTypes.apply(data_set, density_only_schema)

print(data_set.property_types)

{'Density'}


In [9]:
from openff.evaluator.datasets.curation.components.filtering import (
    FilterByPressure,
    FilterByPressureSchema,
    FilterByTemperature,
    FilterByTemperatureSchema,
)

print(f"There were {len(data_set)} properties before filtering")

# Narrow acceptance windows around the state point of interest. The bounds are
# in the units shown by the data set's table columns (K and kPa).
temperature_window = FilterByTemperatureSchema(
    minimum_temperature=298.0, maximum_temperature=298.2
)
pressure_window = FilterByPressureSchema(
    minimum_pressure=101.224, maximum_pressure=101.426
)

# Temperature is filtered first, pressure second.
data_set = FilterByTemperature.apply(data_set, temperature_window)
data_set = FilterByPressure.apply(data_set, pressure_window)

print(f"There are now {len(data_set)} properties after filtering")

There were 209 properties before filtering
There are now 5 properties after filtering


In [10]:
# Convert to a DataFrame and preview only the columns that matter here.
pandas_data_set = data_set.to_pandas()

preview_columns = [
    "Temperature (K)",
    "Pressure (kPa)",
    "Component 1",
    "Density Value (g / ml)",
    "Source",
]
pandas_data_set[preview_columns].head()

Unnamed: 0,Temperature (K),Pressure (kPa),Component 1,Density Value (g / ml),Source
0,298.15,101.325,CCOC(=O)OCC,0.96915,10.1021/je1013476
1,298.15,101.325,CCO,0.78507,10.1021/je1013476
2,298.15,101.325,CCCO,0.79945,10.1021/je1013476
3,298.15,101.325,CCCCCO,0.81096,10.1021/je1013476
4,298.15,101.325,CCCCCCCCO,0.82174,10.1021/je1013476


In [11]:
## Next: add manually curated experimental data points to the filtered data set


In [12]:
from openff.units import unit

from openff.evaluator.thermodynamics import ThermodynamicState

# One state point is reused by every manually entered measurement below.
ambient_temperature = 298.15 * unit.kelvin
ambient_pressure = 1.0 * unit.atmosphere

thermodynamic_state = ThermodynamicState(
    temperature=ambient_temperature, pressure=ambient_pressure
)

In [13]:
from openff.evaluator.substances import Substance

def _terminal_alkene_smiles(n_carbons):
    """Return the SMILES of the linear 1-alkene with ``n_carbons`` carbon atoms.

    The string is an alkyl chain followed by a terminal ``=C``, e.g.
    ``_terminal_alkene_smiles(6) == "CCCCC=C"``. Building it from the carbon
    count keeps the chain length explicit instead of relying on a hand-typed
    run of "C" characters.
    """
    if n_carbons < 2:
        raise ValueError("a terminal alkene needs at least two carbon atoms")
    return "C" * (n_carbons - 1) + "=C"


# Linear alkanes: the SMILES is just one "C" per carbon atom.
hexane = Substance.from_components("C" * 6)
octane = Substance.from_components("C" * 8)
decane = Substance.from_components("C" * 10)
dodecane = Substance.from_components("C" * 12)
pentadecane = Substance.from_components("C" * 15)
hexadecane = Substance.from_components("C" * 16)

# Linear 1-alkenes: carbon counts are spelled out so each name matches its
# SMILES (hexene = 6, octene = 8, octadecene = 18 carbons).
one_hexene = Substance.from_components(_terminal_alkene_smiles(6))
one_octene = Substance.from_components(_terminal_alkene_smiles(8))
one_octadecene = Substance.from_components(_terminal_alkene_smiles(18))

In [14]:
from openff.evaluator.datasets import MeasurementSource

# Literature sources for the enthalpy of vaporization values.
source1 = MeasurementSource(doi="10.1021/je960089h")
source2 = MeasurementSource(doi="10.1063/1.555942")
source3 = MeasurementSource(doi="10.1021/je00014a009")
source4 = MeasurementSource(doi="10.1021/j150544a029")
source5 = MeasurementSource(doi="10.1021/jo00334a040")

# Literature sources for the density values.
source6 = MeasurementSource(doi="10.1016/j.molliq.2020.114366")
source7 = MeasurementSource(doi="10.1021/acs.jced.8b01135")
source8 = MeasurementSource(doi="10.1016/j.jct.2018.08.036") if False else MeasurementSource(doi="10.1021/acs.jced.3c00290")
source9 = MeasurementSource(doi="10.1016/j.jct.2018.08.036")
# There is no source10: that paper (10.1021/j150544a029) is already source4.

# A vendor name is not a DOI, so it is recorded as a free-text reference.
source11 = MeasurementSource(reference="Sigma Aldrich")

In [15]:
from openff.evaluator.datasets import PropertyPhase
from openff.evaluator.properties import EnthalpyOfVaporization
from openff.evaluator.properties import Density

def _hvap(substance, value, uncertainty, source):
    """Build an EnthalpyOfVaporization at the shared ``thermodynamic_state``.

    ``value`` and ``uncertainty`` are plain numbers in kJ/mol.
    """
    return EnthalpyOfVaporization(
        thermodynamic_state=thermodynamic_state,
        phase=PropertyPhase.Liquid | PropertyPhase.Gas,
        substance=substance,
        value=value * unit.kilojoule / unit.mole,
        uncertainty=uncertainty * unit.kilojoule / unit.mole,
        source=source,
    )


def _liquid_density(substance, value, uncertainty, source):
    """Build a liquid-phase Density at the shared ``thermodynamic_state``.

    ``value`` and ``uncertainty`` are plain numbers in g/mL.
    """
    return Density(
        thermodynamic_state=thermodynamic_state,
        phase=PropertyPhase.Liquid,
        substance=substance,
        value=value * unit.gram / unit.milliliter,
        uncertainty=uncertainty * unit.gram / unit.milliliter,
        source=source,
    )


# NOTE(review): several entries pass an uncertainty of 0 -- presumably the
# source reports none; confirm that an exact zero is what is intended.

# Enthalpy of vaporization: six alkanes plus one_hexene and one_octene
# (no enthalpy of vaporization entry is defined for one_octadecene).
hexane_hvap = _hvap(hexane, 31.52, 0.028, source2)
octane_hvap = _hvap(octane, 41.6, 0, source1)
decane_hvap = _hvap(decane, 51.4, 0, source3)
dodecane_hvap = _hvap(dodecane, 61.3, 0.3, source1)
pentadecane_hvap = _hvap(pentadecane, 76.4, 0.3, source1)
hexadecane_hvap = _hvap(hexadecane, 81.4, 0.3, source3)
one_hexene_hvap = _hvap(one_hexene, 30.6, 0, source4)
one_octene_hvap = _hvap(one_octene, 40.65, 0.3, source5)

# Liquid densities: four alkanes plus one_hexene and one_octadecene.
hexane_density = _liquid_density(hexane, 0.6555, 0.0003, source6)
dodecane_density = _liquid_density(dodecane, 0.7454, 0.00017, source7)
pentadecane_density = _liquid_density(pentadecane, 0.765, 0.00017, source8)
hexadecane_density = _liquid_density(hexadecane, 0.770, 0.00036, source9)
one_hexene_density = _liquid_density(one_hexene, 0.66845, 0, source4)
one_octadecene_density = _liquid_density(one_octadecene, 0.789, 0, source11)

In [16]:
# Append the hand-entered measurements to the filtered data set, enthalpies
# of vaporization first and densities second.
new_hvap_properties = [
    hexane_hvap,
    octane_hvap,
    decane_hvap,
    dodecane_hvap,
    pentadecane_hvap,
    hexadecane_hvap,
    one_hexene_hvap,
    one_octene_hvap,
]
new_density_properties = [
    hexane_density,
    dodecane_density,
    pentadecane_density,
    hexadecane_density,
    one_hexene_density,
    one_octadecene_density,
]
data_set.add_properties(*new_hvap_properties, *new_density_properties)

print(f"There are now {len(data_set)} properties after adding new data")

There are now 19 properties after adding new data


In [17]:
from openff.evaluator.datasets.curation.components.filtering import (
    FilterBySmiles,
    FilterBySmilesSchema,
)

# Components to exclude from the data set, matched by SMILES.
excluded_smiles = ["CCO", "CCCO", "CCCCCO", "CCOC(=O)OCC", "CCCCCCCCO"]

smiles_schema = FilterBySmilesSchema(smiles_to_exclude=excluded_smiles)
data_set = FilterBySmiles.apply(data_set, smiles_schema)

print(f"There are now {len(data_set)} properties after filtering")

There are now 14 properties after filtering


In [18]:
# Tabulate the curated data set. Ending the cell with the bare DataFrame lets
# the notebook render it as a table; print() falls back to wrapped plain text
# that splits the columns across several blocks.
pandas_data_set = data_set.to_pandas()
pandas_data_set

                                  Id  Temperature (K)  Pressure (kPa)  \
0   f175dba1597046d3bde2061eb032fb60           298.15         101.325   
1   a1ccc86c6a5f409e918cb41656357864           298.15         101.325   
2   43b732a446074e548fa55f597b7c7f63           298.15         101.325   
3   887e4a874dc543bdac9181f342af94ac           298.15         101.325   
4   aa88cd2807134134a5a27bdb48ae6bf2           298.15         101.325   
5   215277391f884801a3e4afee19e20fc7           298.15         101.325   
6   641ff1031e444629acd6fd097ffcb9b5           298.15         101.325   
7   e0f38f18c2f24f21be6b10deb095ebe5           298.15         101.325   
8   387c88e226c246f081d3133689c14fdc           298.15         101.325   
9   9b9fe9ba6f1c4a51a4812a904ac45023           298.15         101.325   
10  843d82a359514fb9907bf1c7753c64ca           298.15         101.325   
11  40d504c70bea4fcbb1bd1dbcc31fab5d           298.15         101.325   
12  2d4777b7e86d47f28d6b40e55352ac86           298.

In [None]:
# Write the curated data set to disk as formatted JSON.
output_path = "filtered_data_set_alkanes.json"
data_set.json(output_path, format=True)

In [20]:
from openff.toolkit.topology import Molecule, Topology

# Collect the distinct SMILES of every component across all substances.
all_smiles = set(
    entry.smiles
    for mixture in data_set.substances
    for entry in mixture.components
)