In [1]:
import thermopyl as th
from thermopyl import thermoml_lib
from thermopyl.utils import pandas_dataframe
import cirpy
import numpy as np
import pandas as pd
from sklearn.externals.joblib import Memory
from os import getcwd

In [2]:
# On-disk memoization store (joblib Memory) used to wrap the resolver below.
# The cache directory is an absolute path on the original author's machine.
mem = Memory(cachedir="/Users/guilhermematos/.thermoml/")
# resolve_cached(x, rtype) forwards to cirpy.resolve(x, rtype); the
# @mem.cache decorator stores the result so a repeated lookup with the same
# arguments does not have to call cirpy.resolve again.
# NOTE(review): joblib is understood to discard cached results when the
# decorated function's signature or source changes -- confirm before editing
# the function below.
@mem.cache
def resolve_cached(x, rtype):
    return cirpy.resolve(x, rtype)

In [3]:
# Build the working table with thermopyl's pandas_dataframe() helper; every
# later cell filters or extends `df`.
df = pandas_dataframe()

In [4]:
# Source files whose rows are excluded from the analysis.
bad_filenames = ["/Users/guilhermematos/.thermoml/j.fluid.2013.12.014.xml"]
from_bad_file = df.filename.isin(bad_filenames)
df = df[~from_bad_file]

In [5]:
# Property columns of interest for this analysis.
experiments = [
    "Activity coefficient",
    "(Relative) activity",
]

In [6]:
# Keep the rows that report a value for at least one property in
# `experiments`.
# One index of non-null rows per property ...
ind_list = [df[exp].dropna().index for exp in experiments]
# ... merged with an explicit loop. The previous `reduce(...)` call relied on
# `reduce` being a builtin, which is only true on Python 2; on Python 3 it is
# a NameError because nothing imports it.
ind = ind_list[0]
for other_index in ind_list[1:]:
    ind = ind.union(other_index)
df = df.loc[ind]

In [7]:
# Extract rows with two components
# `components` is a "__"-delimited string, so the number of pieces is the
# number of components in the row.
df = df.assign(n_components=df.components.apply(lambda x: len(x.split("__"))))
df = df[df.n_components == 2]
# Drop columns left entirely empty by the filter. The result is rebound
# instead of using inplace=True, which would mutate a filtered slice of the
# previous frame and trigger pandas' SettingWithCopyWarning.
df = df.dropna(axis=1, how='all')

In [8]:
# Separate components nominally: the first and second pieces of the
# "__"-delimited `components` string become their own columns.
_component_parts = df.components.apply(lambda x: x.split("__"))
df["component_0"] = _component_parts.apply(lambda parts: parts[0])
df["component_1"] = _component_parts.apply(lambda parts: parts[1])

In [9]:
# Find names: load the stored compound-name -> formula lookup and discard
# entries that have no value.
name_to_formula = pd.read_hdf(
    "/Users/guilhermematos/.thermoml/compound_name_to_formula.h5", 'data'
).dropna()

In [10]:
# Add formulas to the table: look each component name up in
# `name_to_formula`, once per component position.
for _suffix in ("0", "1"):
    df["formula_" + _suffix] = df["component_" + _suffix].apply(
        lambda chemical: name_to_formula[chemical]
    )

In [11]:
# Element symbols counted as heavy atoms, and the full set of accepted
# elements (hydrogen plus the heavy atoms).
heavy_atoms = ["C", "O", "N", "P", "S", "Cl", "F"]
desired_atoms = ["H"]
desired_atoms.extend(heavy_atoms)

In [12]:
# Add extra information: per-component atom counts derived from the formula.
#   n_atoms_*          all atoms in the formula
#   n_heavy_atoms_*    atoms whose element is in `heavy_atoms`
#   n_desired_atoms_*  atoms whose element is in `desired_atoms`
#   n_other_atoms_*    the remainder (n_atoms - n_desired_atoms)
for _suffix in ("0", "1"):
    _formulas = df["formula_" + _suffix]
    df["n_atoms_" + _suffix] = _formulas.apply(thermoml_lib.count_atoms)
    df["n_heavy_atoms_" + _suffix] = _formulas.apply(
        thermoml_lib.count_atoms_in_set, args=(heavy_atoms,)
    )
    df["n_desired_atoms_" + _suffix] = _formulas.apply(
        thermoml_lib.count_atoms_in_set, args=(desired_atoms,)
    )
    df["n_other_atoms_" + _suffix] = (
        df["n_atoms_" + _suffix] - df["n_desired_atoms_" + _suffix]
    )

In [13]:
# Remove systems that have atoms outside `desired_atoms`: both components
# must have zero "other" atoms.
only_desired_atoms = (df.n_other_atoms_0 == 0) & (df.n_other_atoms_1 == 0)
# Filter, then drop columns left entirely empty. The result is rebound
# instead of calling dropna(inplace=True) on the filtered slice, which
# triggers pandas' SettingWithCopyWarning.
df = df[only_desired_atoms].dropna(axis=1, how='all')

In [14]:
# Add SMILES string for each component and drop rows where the lookup
# produced nothing.
#
# The previous version filtered with `df.SMILES_x != None`, which pandas
# evaluates elementwise as True for every row (so it removed nothing), and
# then repeated the null filter twice more. A single dropna(subset=...) does
# the actual work, and rebinding avoids in-place mutation of a filtered slice.
#
# Component 0 is resolved and filtered first so component 1 is only looked up
# for the rows that survive.
df = df.assign(SMILES_0=df.component_0.apply(lambda x: resolve_cached(x, "smiles")))
df = df.dropna(subset=["SMILES_0"])

df = df.assign(SMILES_1=df.component_1.apply(lambda x: resolve_cached(x, "smiles")))
df = df.dropna(subset=["SMILES_1"])

In [15]:
def _first_identifier(name, rtype):
    """Resolve `name` to identifier type `rtype` and keep the first entry.

    Combines the cached resolver with thermoml_lib.get_first_entry, exactly
    as each column below needs.
    """
    return thermoml_lib.get_first_entry(resolve_cached(name, rtype))


# Add a CAS number and an identifier from the "stdinchikey" lookup for each
# component. The InChI_* column names are kept as they are because later
# cells read them.
df = df.assign(cas_0=df.component_0.apply(lambda x: _first_identifier(x, "cas")))
df = df.assign(InChI_0=df.component_0.apply(lambda x: _first_identifier(x, "stdinchikey")))

df = df.assign(cas_1=df.component_1.apply(lambda x: _first_identifier(x, "cas")))
df = df.assign(InChI_1=df.component_1.apply(lambda x: _first_identifier(x, "stdinchikey")))

# Keep only rows where both CAS numbers were found. The previous
# `df.cas_x != None` comparisons were elementwise True for every row and so
# filtered nothing; dropna(subset=...) is the filter that matters.
df = df.dropna(subset=["cas_0", "cas_1"])

In [16]:
# Extract rows with temperatures between 250 and 400 K (both bounds exclusive)
_temperature = df['Temperature, K']
df = df[(_temperature > 250.) & (_temperature < 400.)]

In [17]:
# Extract rows with pressure strictly between 100 kPa and 102000 kPa.
# (These bounds sit just outside 101.325 kPa and 101325 kPa.)
_pressure = df['Pressure, kPa']
df = df[(_pressure > 100.) & (_pressure < 102000.)]

In [18]:
# Strip rows not in liquid phase
_is_liquid = df['phase'] == 'Liquid'
df = df[_is_liquid]

In [19]:
# Cleanup: drop columns that are empty for every remaining row.
# The result is rebound rather than using inplace=True, because `df` is a
# filtered slice at this point and in-place mutation of a slice triggers
# pandas' SettingWithCopyWarning.
df = df.dropna(axis=1, how='all')

In [20]:
# Rename: reduce each archive path to its bare stem by removing the cache
# directory prefix and the ".xml" extension.
_ARCHIVE_PREFIX = '/Users/guilhermematos/.thermoml/'
_ARCHIVE_SUFFIX = '.xml'


def _strip_archive_affixes(path):
    """Return `path` without the archive directory prefix and ".xml" suffix.

    str.lstrip / str.rstrip treat their argument as a *set of characters*,
    not as a prefix/suffix, so the previous
    ``x.lstrip(prefix).rstrip('.xml')`` also removed leading characters of
    the stem that occur in the prefix (e.g. a leading "a" or "s") and
    trailing "x", "m", "l" or "." characters of the stem. Here the prefix
    and suffix are removed only when they match exactly.
    """
    if path.startswith(_ARCHIVE_PREFIX):
        path = path[len(_ARCHIVE_PREFIX):]
    if path.endswith(_ARCHIVE_SUFFIX):
        path = path[:-len(_ARCHIVE_SUFFIX)]
    return path


df = df.assign(filename=df["filename"].map(_strip_archive_affixes))

In [21]:
# More cleanup: each component must have at least 1 and at most 40 heavy
# atoms.
_size_ok_0 = (df.n_heavy_atoms_0 > 0) & (df.n_heavy_atoms_0 <= 40)
_size_ok_1 = (df.n_heavy_atoms_1 > 0) & (df.n_heavy_atoms_1 <= 40)
# Filter, then drop columns left entirely empty. The result is rebound
# instead of calling dropna(inplace=True) on the filtered slice, which
# triggers pandas' SettingWithCopyWarning.
df = df[_size_ok_0 & _size_ok_1].dropna(axis=1, how='all')

In [22]:
# Get data that contains mole fractions
_has_fraction = df["Mole fraction"].notnull()
# Index labels of the selected rows, kept under the original name.
ind_f = df.index[_has_fraction.values]
# A boolean mask is used instead of re-selecting with df.loc[ind_f], which
# returns a row once per occurrence of its label and so would duplicate rows
# if the index ever contained repeated labels. Columns that are empty within
# the subset are dropped by rebinding rather than with inplace=True on the
# selection.
df_fraction = df[_has_fraction].dropna(axis=1, how='all')

In [24]:
# Columns carried into the mole-fraction summary, in output order.
keys = ["filename","component_0","component_1","SMILES_0","SMILES_1","cas_0",
        "cas_1","InChI_0","InChI_1","Temperature, K","Pressure, kPa",
        "Activity coefficient","Activity coefficient_std",
        "Mole fraction"]

# Selecting by the label list gives the same frame as concatenating each
# column individually with `keys`, without spelling every name twice.
dfnew = df_fraction[keys].copy()


def _count_table(values, label):
    """Tabulate how often each distinct value of `values` occurs.

    Parameters
    ----------
    values : pandas.Series
        Column to tabulate.
    label : str
        Name of the output column that holds the distinct values.

    Returns
    -------
    pandas.DataFrame
        Columns ``[label, "Count"]``, in the order value_counts() yields.

    The frame is built explicitly because the column names produced by
    ``value_counts().reset_index()`` differ between pandas versions, which
    made the previous ``rename(columns=...)`` calls fragile.
    """
    counts = values.value_counts()
    return pd.DataFrame(
        {label: counts.index, "Count": counts.values},
        columns=[label, "Count"],
    )


def _compound_table(identifiers):
    """Count each identifier and resolve it to an IUPAC name and a SMILES.

    Returns a DataFrame with columns "InChI", "Count", "Component", "SMILES".
    """
    table = _count_table(identifiers, "InChI")
    table["Component"] = table.InChI.apply(lambda x: resolve_cached(x, "iupac_name"))
    table["SMILES"] = table.InChI.apply(lambda x: resolve_cached(x, "smiles"))
    return table


# Number of rows contributed by each source file.
a = _count_table(dfnew["filename"], "Filename")

# Per-compound row counts for each component position. Previously the rename
# mapped "InChI" -> "Count", but the counted series are named "InChI_0" /
# "InChI_1", so the count column was never renamed.
b0 = _compound_table(dfnew["InChI_0"])
b1 = _compound_table(dfnew["InChI_1"])

In [28]:
from os.path import join

# Save all data: the full filtered table and the mole-fraction subset are
# each written as CSV and as a pickle into the current working directory.
datapath = getcwd()

csvall, pickleall = "alldata.csv", "alldata.pickle"
csvfraction, picklefraction = "molefraction.csv", "molefraction.pickle"

for _frame, _csv_name, _pickle_name in (
    (df, csvall, pickleall),
    (df_fraction, csvfraction, picklefraction),
):
    _frame.to_csv(join(datapath, _csv_name))
    _frame.to_pickle(join(datapath, _pickle_name))