# *Heterogeneous Catalyst Database*

## Import Data

In [59]:
import pandas as pd
import json

# Load the raw database export from disk.
# The context manager closes the file handle as soon as parsing finishes
# (the bare open() call previously left it open), and the explicit encoding
# keeps decoding independent of the platform's locale default.
with open('dataset.json', encoding='utf-8') as dataset_file:
    data = json.load(dataset_file)

In [75]:
# Flatten the nested records stored under the "results" key into one table;
# nested fields end up as dot-separated column names.
df = pd.json_normalize(data=data["results"])
df

Unnamed: 0,adsorption_measurement.adsorption_energy,adsorption_measurement.doi,adsorption_measurement.external_note,adsorption_measurement.internal_note,adsorption_measurement.is_most_stable_site,adsorption_measurement.method.software,adsorption_measurement.method.exchange_correlation,adsorption_measurement.method.basis_set,adsorption_measurement.method.potentials,adsorption_measurement.method.is_spin_polarization,...,adsorption_measurement.emn_user.affiliation,adsorption_measurement.emn_user.email,adsorption_measurement.approver,adsorption_measurement.adsorption_site,adsorption_measurement.bulk_surface_property_set.lattice_constant,adsorption_measurement.bulk_surface_property_set.cell_symmetry,adsorption_measurement.bulk_surface_property_set.secondary_bulk_class.name,adsorption_measurement.bulk_surface_property_set.first_layer_composition.name,adsorption_measurement.bulk_surface_property_set.second_layer_composition.name,adsorption_measurement.bulk_surface_property_set.facet
0,-4.29,10.1016/S0039-6028(01)01464-9,,,True,Dacapo,PW91,plane wave,ultrasoft pseudopotential,,...,NREL,Tuong.Bui@nrel.gov,,,,,,,,
1,-0.56,10.1016/S0039-6028(01)01464-9,,,True,Dacapo,PW91,plane wave,ultrasoft pseudopotential,,...,NREL,Tuong.Bui@nrel.gov,,,,,,,,
2,-3.14,10.1016/0039-6028(95)01141-2,,,True,CETEP,PW91,plane wave,norm-conserving pseudopotential,True,...,NREL,Matthew.Jankousky@nrel.gov,,,,,,,,
3,-0.5,10.1016/0039-6028(95)01141-2,,,True,CETEP,PW91,plane wave,norm-conserving pseudopotential,True,...,NREL,Tuong.Bui@nrel.gov,,,,,,,,
4,-1.67,10.1016/S0039-6028(99)00489-6,,,True,VASP,PW91,plane wave,ultrasoft pseudopotential,,...,NREL,Matthew.Jankousky@nrel.gov,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-1.72,10.1021/acscatal.8b00201,,The original Rh data for this paper is from an...,True,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,...,NREL,Matthew.Jankousky@nrel.gov,,,,,,,,
1996,1.66,10.1021/acscatal.8b00201,,The original Rh data for this paper is from an...,True,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,...,NREL,Matthew.Jankousky@nrel.gov,,,,,,,,
1997,-1.57,10.1021/acscatal.8b00201,,The original Rh data for this paper is from an...,True,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,...,NREL,Matthew.Jankousky@nrel.gov,,,,,,,,
1998,-2.51,10.1021/acscatal.8b00201,,The original Rh data for this paper is from an...,True,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,...,NREL,Matthew.Jankousky@nrel.gov,,,,,,,,


As shown above, flattening the nested records yields 51 columns.\
Some of these columns are clearly not relevant, so the data must be organized.

In [76]:
# Save the flattened table as CSV, leaving out the integer row index.
df.to_csv(path_or_buf="dataset.csv", index=False)

## Organize Data
Only columns that are chemically and physically relevant for predicting adsorption energies will be used.\
Out of the 51 columns, we must pick out the attributes that are relevant.\
Initially, the trivially irrelevant attributes, such as the DOI, the author, and attributes with no data, are dropped from the dataset.\
The obvious ones are the following:\
['adsorption_measurement.doi', 'adsorption_measurement.external_note', 'adsorption_measurement.internal_note', 'adsorption_measurement.bulk_surface_property_set.secondary_bulk_class', 'adsorption_measurement.bulk_surface_property_set.second_layer_composition', 'adsorption_measurement.bulk_surface_property_set.first_layer_composition', 'adsorption_measurement.emn_user.first_name', 'adsorption_measurement.emn_user.last_name', 'adsorption_measurement.emn_user.affiliation', 'adsorption_measurement.emn_user.email', 'adsorption_measurement.approver', 'adsorption_measurement.adsorption_site', 'adsorption_measurement.bulk_surface_property_set.lattice_constant', 'adsorption_measurement.bulk_surface_property_set.cell_symmetry', 'adsorption_measurement.bulk_surface_property_set.secondary_bulk_class.name', 'adsorption_measurement.bulk_surface_property_set.first_layer_composition.name', 'adsorption_measurement.bulk_surface_property_set.second_layer_composition.name', 'adsorption_measurement.bulk_surface_property_set.facet']

In [77]:
# First pass: columns the notebook text describes as trivially irrelevant
# (DOI, author details, attributes with no data).
obvious_drops = [
    # publication reference and free-text notes
    'adsorption_measurement.doi',
    'adsorption_measurement.external_note',
    'adsorption_measurement.internal_note',
    # bulk class / layer composition parent columns
    'adsorption_measurement.bulk_surface_property_set.secondary_bulk_class',
    'adsorption_measurement.bulk_surface_property_set.second_layer_composition',
    'adsorption_measurement.bulk_surface_property_set.first_layer_composition',
    # contributor details
    'adsorption_measurement.emn_user.first_name',
    'adsorption_measurement.emn_user.last_name',
    'adsorption_measurement.emn_user.affiliation',
    'adsorption_measurement.emn_user.email',
    # remaining columns, which appear blank in the preview of the flattened table
    'adsorption_measurement.approver',
    'adsorption_measurement.adsorption_site',
    'adsorption_measurement.bulk_surface_property_set.lattice_constant',
    'adsorption_measurement.bulk_surface_property_set.cell_symmetry',
    'adsorption_measurement.bulk_surface_property_set.secondary_bulk_class.name',
    'adsorption_measurement.bulk_surface_property_set.first_layer_composition.name',
    'adsorption_measurement.bulk_surface_property_set.second_layer_composition.name',
    'adsorption_measurement.bulk_surface_property_set.facet',
]

# drop() returns a new frame, so `df` itself is left untouched.
df_obvious_drops = df.drop(labels=obvious_drops, axis="columns")
df_obvious_drops

Unnamed: 0,adsorption_measurement.adsorption_energy,adsorption_measurement.is_most_stable_site,adsorption_measurement.method.software,adsorption_measurement.method.exchange_correlation,adsorption_measurement.method.basis_set,adsorption_measurement.method.potentials,adsorption_measurement.method.is_spin_polarization,adsorption_measurement.method.is_zero_point_energy,adsorption_measurement.method.is_fixed_substrate,adsorption_measurement.adsorbate_species.formula,...,adsorption_measurement.bulk_surface_property_set.lattice_constant.lattice_constant_1,adsorption_measurement.bulk_surface_property_set.lattice_constant.lattice_constant_2,adsorption_measurement.bulk_surface_property_set.lattice_constant.lattice_constant_3,adsorption_measurement.bulk_surface_property_set.facet.name,adsorption_measurement.bulk_surface_property_set.cell_symmetry.name,adsorption_measurement.adsorbate_fraction.fraction,adsorption_measurement.adsorbate_fraction.numeric,adsorption_measurement.adsorbate_fraction.unit,adsorption_measurement.adsorption_site.site_name,adsorption_measurement.adsorption_reference_species_set
0,-4.29,True,Dacapo,PW91,plane wave,ultrasoft pseudopotential,,,False,O,...,3.660,,,(111),2x2,1/4,0.25,Cu,fcc,"[{'reference_coefficient': 1, 'species': {'for..."
1,-0.56,True,Dacapo,PW91,plane wave,ultrasoft pseudopotential,,,False,O2,...,3.660,,,(111),2x2,1/4,0.25,Cu,b-h-b,"[{'reference_coefficient': 1, 'species': {'for..."
2,-3.14,True,CETEP,PW91,plane wave,norm-conserving pseudopotential,True,,,O,...,4.190,,,(110),3x2,1/6,0.1667,Ag,fourfold hollow,"[{'reference_coefficient': 1, 'species': {'for..."
3,-0.5,True,CETEP,PW91,plane wave,norm-conserving pseudopotential,True,,,O2,...,4.190,,,(110),3x2,1/6,0.1667,Ag,fourfold hollow (1-10),"[{'reference_coefficient': 1, 'species': {'for..."
4,-1.67,True,VASP,PW91,plane wave,ultrasoft pseudopotential,,,,O2,...,3.532,,,(111),4x2,1/4,0.25,Ni,t-hcp-b,"[{'reference_coefficient': 1, 'species': {'for..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-1.72,True,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,,False,CH,...,,,,(111),3x3,1/9,0.1111,Rh,terrace,"[{'reference_coefficient': 1, 'species': {'for..."
1996,1.66,True,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,,False,CH,...,,,,(111),3x3,1/9,0.1111,Ag,terrace,"[{'reference_coefficient': 1, 'species': {'for..."
1997,-1.57,True,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,,False,CH3O,...,,,,(111),3x3,1/9,0.1111,Rh,terrace,"[{'reference_coefficient': 1, 'species': {'for..."
1998,-2.51,True,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,,False,CHCO,...,,,,(111),3x3,1/9,0.1111,Ir,terrace,"[{'reference_coefficient': 1.5, 'species': {'f..."


The 51 columns have now been reduced to 33.\
Additionally, some columns are not relevant to this study or have no variability and therefore would not contribute to our predictions. These columns are also dropped manually.\
The additionally dropped columns are the following:\
['adsorption_measurement.is_most_stable_site', 'adsorption_measurement.adsorbate_species.name', 'adsorption_measurement.adsorbate_species.elemental_formula', 'adsorption_measurement.bulk_surface_property_set.is_stretched', 'adsorption_measurement.bulk_surface_property_set.is_compressed', 'adsorption_measurement.bulk_surface_property_set.nano_number_of_atoms', 'adsorption_measurement.bulk_surface_property_set.bulk_surface_material.elemental_formula', 'adsorption_measurement.adsorbate_fraction.fraction', 'adsorption_measurement.adsorption_reference_species_set', 'adsorption_measurement.bulk_surface_property_set.primary_bulk_class.name']

In [81]:
# Second pass: columns judged irrelevant to the study or lacking variability,
# removed by hand from the result of the first pass.
additional_drops = [
    'adsorption_measurement.is_most_stable_site',
    # alternative descriptions of the adsorbate
    'adsorption_measurement.adsorbate_species.name',
    'adsorption_measurement.adsorbate_species.elemental_formula',
    # surface / bulk descriptors
    'adsorption_measurement.bulk_surface_property_set.is_stretched',
    'adsorption_measurement.bulk_surface_property_set.is_compressed',
    'adsorption_measurement.bulk_surface_property_set.nano_number_of_atoms',
    'adsorption_measurement.bulk_surface_property_set.bulk_surface_material.elemental_formula',
    # text form of the coverage (e.g. "1/4"); the numeric column is retained
    'adsorption_measurement.adsorbate_fraction.fraction',
    # nested list of reference-species records
    'adsorption_measurement.adsorption_reference_species_set',
    'adsorption_measurement.bulk_surface_property_set.primary_bulk_class.name',
]

# drop() returns a new frame, so `df_obvious_drops` stays available unchanged.
df_additional_drops = df_obvious_drops.drop(labels=additional_drops, axis="columns")
df_additional_drops

Unnamed: 0,adsorption_measurement.adsorption_energy,adsorption_measurement.method.software,adsorption_measurement.method.exchange_correlation,adsorption_measurement.method.basis_set,adsorption_measurement.method.potentials,adsorption_measurement.method.is_spin_polarization,adsorption_measurement.method.is_zero_point_energy,adsorption_measurement.method.is_fixed_substrate,adsorption_measurement.adsorbate_species.formula,adsorption_measurement.adsorbate_species.smiles,...,adsorption_measurement.bulk_surface_property_set.space_group,adsorption_measurement.bulk_surface_property_set.bulk_surface_material.name,adsorption_measurement.bulk_surface_property_set.lattice_constant.lattice_constant_1,adsorption_measurement.bulk_surface_property_set.lattice_constant.lattice_constant_2,adsorption_measurement.bulk_surface_property_set.lattice_constant.lattice_constant_3,adsorption_measurement.bulk_surface_property_set.facet.name,adsorption_measurement.bulk_surface_property_set.cell_symmetry.name,adsorption_measurement.adsorbate_fraction.numeric,adsorption_measurement.adsorbate_fraction.unit,adsorption_measurement.adsorption_site.site_name
0,-4.29,Dacapo,PW91,plane wave,ultrasoft pseudopotential,,,False,O,[O],...,Fm3m,Cu,3.660,,,(111),2x2,0.25,Cu,fcc
1,-0.56,Dacapo,PW91,plane wave,ultrasoft pseudopotential,,,False,O2,O=O,...,Fm3m,Cu,3.660,,,(111),2x2,0.25,Cu,b-h-b
2,-3.14,CETEP,PW91,plane wave,norm-conserving pseudopotential,True,,,O,[O],...,Fm3m,Ag,4.190,,,(110),3x2,0.1667,Ag,fourfold hollow
3,-0.5,CETEP,PW91,plane wave,norm-conserving pseudopotential,True,,,O2,O=O,...,Fm3m,Ag,4.190,,,(110),3x2,0.1667,Ag,fourfold hollow (1-10)
4,-1.67,VASP,PW91,plane wave,ultrasoft pseudopotential,,,,O2,O=O,...,,Ni,3.532,,,(111),4x2,0.25,Ni,t-hcp-b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-1.72,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,,False,CH,[CH],...,Fm3m,Rh,,,,(111),3x3,0.1111,Rh,terrace
1996,1.66,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,,False,CH,[CH],...,,Ag,,,,(111),3x3,0.1111,Ag,terrace
1997,-1.57,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,,False,CH3O,C[O],...,Fm3m,Rh,,,,(111),3x3,0.1111,Rh,terrace
1998,-2.51,Quantum ESPRESSO,BEEF-vdW,plane wave,projector augmented wave,,,False,CHCO,[O]C#C,...,,Ir,,,,(111),3x3,0.1111,Ir,terrace


Now we are down to 23 columns!\
The attributes that are left are ones that have some level of chemical importance.\
Therefore, we need to assess the chemical importance of each attribute. 