# Describe the Dataset and Publish it to Foundry
We are going to describe each column of the dataset and then specify how the data file should be read

In [1]:
from foundry import Foundry
import pandas as pd

## Create the Foundry Object
This is how we'll be connecting to the web service

In [2]:
# Foundry client object, constructed with no arguments; per the notebook text
# this is what will be used to connect to the web service.
f = Foundry()

## Make descriptions for each column
We want every column in the CSV to have a description, so that the whole file can be read

In [3]:
# Load the dataset CSV (path is relative to the notebook's working directory)
# so its column names can be checked against the descriptions written below.
data = pd.read_csv('QMC_AMIONS_NI_LE.csv')

In [4]:
# Display the column labels; each one needs a matching entry in `keys`.
data.columns

Index(['fragment', 'smiles', 'inchi', 'xyz', 'HF', 'DMC(HF)', 'DMC(HF)_err',
       'PBE', 'DMC(PBE)', 'DMC(PBE)_err', 'PBE0', 'DMC(PBE0)', 'DMC(PBE0)_err',
       'B3LYP', 'DMC(B3LYP)', 'DMC(B3LYP)_err'],
      dtype='object')

Start with the identifiers

In [5]:
# Identifier columns as (column name, description) pairs, in CSV order.
identifier_columns = [
    ('fragment', 'Fragment ID - to match with full dataset'),
    ('smiles', 'SMILES string of the molecule'),
    ('inchi', 'InChI string of the molecule'),
    ('xyz', 'XYZ-formatted string of the molecule structure'),
]

# Identifiers carry no units and are all marked as inputs.
keys = [
    {'key': column, 'units': '', 'type': 'input', 'description': text}
    for column, text in identifier_columns
]

Generate the description of the energy-related keys procedurally

In [6]:
# For every trial method the CSV has three columns: the method's own energy,
# the DMC energy obtained with that method's trial wavefunction, and the
# uncertainty of that DMC energy. Build their descriptions in CSV column order.
energy_keys = []
for trial_method in ['HF', 'PBE', 'PBE0', 'B3LYP']:
    # Neither phrase ends with a period: it is appended to longer descriptions,
    # and none of the other descriptions are terminated with one.
    if trial_method == 'HF':
        method_desc = 'the Hartree-Fock method'
    else:
        method_desc = f'Density-functional theory (DFT) with the {trial_method} exchange-correlation functional'
    qchem_desc = 'an all-electron computation using ' + method_desc
    energy_keys.extend([
        {'key': trial_method, 'units': 'Ha', 'type': 'target', 'description': 'Energy from ' + qchem_desc},
        {'key': f'DMC({trial_method})', 'units': 'Ha', 'type': 'target', 'description': 'Energy from a Diffusion Monte Carlo calculation using a trial wavefunction from ' + qchem_desc},
        {'key': f'DMC({trial_method})_err', 'units': 'Ha', 'type': 'target', 'description': f'Uncertainty associated with DMC({trial_method})'}
    ])

# Replace any energy entries left over from a previous run of this cell instead
# of appending blindly, so re-running the cell does not duplicate keys.
# Slice assignment keeps `keys` the same list object.
energy_names = {entry['key'] for entry in energy_keys}
keys[:] = [entry for entry in keys if entry['key'] not in energy_names] + energy_keys

Make sure we got all of the columns in the correct order

In [7]:
# Every CSV column must be described, and in the same order as in the file.
# On mismatch, show both lists so the offending column is easy to spot.
described_columns = [x['key'] for x in keys]
assert data.columns.tolist() == described_columns, (
    'Column descriptions do not match the CSV header.\n'
    f'  CSV columns:       {data.columns.tolist()}\n'
    f'  Described columns: {described_columns}'
)

In [16]:
# Dataset-level metadata: the column descriptions, a single 'train' split
# pointing at the CSV file, and the dataset type.
train_split = {'type': 'train', 'path': 'QMC_AMIONS_NI_LE.csv'}
dataset_metadata = dict(
    keys=keys,
    splits=[train_split],
    type='tabular',
)

## Define the rest of the metadata
Things needed to build the record

In [8]:
# Free-text summary of the dataset. The pieces are joined by implicit string
# concatenation, so each piece must end with its own separating space.
description = ('This dataset contains summary inputs and outputs generated for the Paper "Approaching QMC quality energetics throughout chemical space using scalable quantum machine learning" '
               'by B. Huang, O. Anatole von Lilienfeld, J. T. Krogel and A. Benali. '
               'The dataset includes energies for 1175 molecules calculated with varying methods, uncertainties associated with the DMC calculations, and molecular descriptions in XYZ, SMILES, and InChI format. '
               'Raw data for these calculations are available at https://doi.org/10.18126/hxlp-v732')

In [9]:
# Record title: the paper title with a "Foundry - " prefix.
title = ('Foundry - Approaching QMC quality energetics throughout chemical space '
         'using scalable quantum machine learning')

In [10]:
# Authors as 'Family name, Given name(s)', in the order listed in the description.
# The second author's given name is written out ("O. Anatole", as in the
# description text) to match the other entries.
authors = ['Huang, Bing', 'von Lilienfeld, O. Anatole', 'Krogel, Jaron T.', 'Benali, Anouar']

In [11]:
# NOTE(review): 'TBD' looks like a placeholder - TODO confirm and replace with
# the real short name before this record is published.
short_name = 'TBD'

In [12]:
# One inner list per author, in the same order as `authors`; an author with
# several affiliations gets one list element per institution rather than a
# single comma-joined string.
affiliations = [
    ['University of Vienna'],
    ['University of Toronto', 'Technische Universitat Berlin'],
    ['Oak Ridge National Laboratory'],
    ['Argonne National Laboratory'],
]

In [13]:
# DOI of the raw-data deposit that the description links to.
related_dois = [
    '10.18126/hxlp-v732',
]

In [14]:
# Publisher name recorded in the metadata.
publisher = 'Materials Data Facility'

In [15]:
# Publication year recorded in the metadata (stored as an int).
publication_year = 2021