In [1]:
import itertools
import os

from pymatgen.ext.matproj import MPRester

# Elements whose binary and ternary chemical systems are downloaded.
elems = {'Co', 'Ni', 'W', 'Al', 'Cr', 'Fe', 'Ti', 'Cu'}

# Iterate over a sorted copy: the iteration order of a set of strings can
# change between kernel sessions, which would make the order of the
# downloaded data non-reproducible.
ordered_elems = sorted(elems)

# Chemical-system strings such as "Al-Co" / "Al-Co-Ni". combinations() of a
# sorted sequence already yields alphabetically ordered tuples.
bin_syses = ["-".join(els) for els in itertools.combinations(ordered_elems, 2)]
ter_syses = ["-".join(els) for els in itertools.combinations(ordered_elems, 3)]

# The API key is read from the environment instead of being committed with
# the notebook. If PMG_MAPI_KEY is unset this is None and MPRester is expected
# to fall back to pymatgen's own configuration (e.g. ~/.pmgrc.yaml).
mp_api_key = os.environ.get("PMG_MAPI_KEY")

results = []     # computed entries for all binary, then all ternary systems
structures = []  # structures, fetched one chemical system at a time

# A single client session serves both downloads.
with MPRester(mp_api_key) as m:
    results += m.get_entries({"chemsys": {"$in": bin_syses}})
    results += m.get_entries({"chemsys": {"$in": ter_syses}})
    for chemsys in bin_syses + ter_syses:
        structures += m.get_structures(chemsys)

## Use `query` to get manageable data

In [1]:
import itertools
import os

from pymatgen.ext.matproj import MPRester

# Elements of interest; unary, binary and ternary systems are queried.
elems = {'Co', 'Ni', 'W', 'Al', 'Cr', 'Fe', 'Ti', 'Cu'}

# Sorted copy so that the query order (and therefore the order of `results`,
# which later determines the numeric suffix of the saved file names) is the
# same on every run; set iteration order is not stable across sessions.
ordered_elems = sorted(elems)

# Binary, ternary combinations of the elements
bin_syses = [list(els) for els in itertools.combinations(ordered_elems, 2)]
ter_syses = [list(els) for els in itertools.combinations(ordered_elems, 3)]

# Fields requested for every material.
# NOTE(review): the original comment said "structures is complex" -- presumably
# the full structure object is left out on purpose; confirm.
properties = ["formula", "pretty_formula", "nelements", "nsites", "volume", "spacegroup", "cif",
              "formation_energy_per_atom", "e_above_hull", "band_gap", "dos", "energy",
              "energy_per_atom", "icsd_ids"]

# Unary systems first, then binaries, then ternaries.
chem_systems = [[el] for el in ordered_elems] + bin_syses + ter_syses

results = []
# The API key comes from the environment rather than from the notebook source.
# If PMG_MAPI_KEY is unset, None is passed and MPRester is expected to fall
# back to pymatgen's own configuration (e.g. ~/.pmgrc.yaml).
with MPRester(os.environ.get("PMG_MAPI_KEY")) as m:
    for chem_sys in chem_systems:
        # "$all" requires every listed element to be present; "nelements"
        # restricts the match to exactly that many elements, so only
        # materials made of precisely this element set are returned.
        results += m.query(
            criteria={"elements": {"$all": chem_sys}, "nelements": len(chem_sys)},
            properties=properties,
        )

## Save to json

In [5]:
import json
import os

outpath = 'D:/PSED/DataCollect/MP/'
# open(..., 'w') does not create missing folders, so make sure the target
# directory exists before the first record is written.
os.makedirs(outpath, exist_ok=True)

# Number of the most recent file written per formula. Several records can
# share one pretty formula, so each file name gets a running suffix:
# <formula>_0.json, <formula>_1.json, ...
name_cnt = {}
for cur in results:
    cur_formula = cur['pretty_formula']  # For file naming
    # First occurrence gets 0, every further duplicate the next integer.
    name_cnt[cur_formula] = name_cnt.get(cur_formula, -1) + 1
    out_name = '{}_{}.json'.format(cur_formula, name_cnt[cur_formula])
    with open(os.path.join(outpath, out_name), 'w') as outfile:
        json.dump(cur, outfile)


In [3]:
# Number of records accumulated in `results`.
len(results)

340

## Data preparation
Load the MP/OQMD data from the `json` files and convert it to a `DataFrame`.

In [6]:
import json
import os

import pandas as pd

mp_path = 'D:/PSED/DataCollect/MP/'

# Only the JSON records are parsed (any other file in the folder would make
# json.load fail); sorted so that row numbering is reproducible, because
# os.listdir returns entries in arbitrary order.
mp_files = sorted(name for name in os.listdir(mp_path) if name.endswith('.json'))


# List of predictors and response. Energy is formation energy per atom (delta E).
features = ['Co', 'Ni', 'W', 'Al', 'Cr', 'Fe', 'Ti', 'Cu', 'structure', 'volume', 'bandgap', 'natoms', 'ntypes', 'energy']

# One dict per material. The table is built once at the end, because
# concatenating a DataFrame inside the loop copies all previous rows on
# every iteration.
mp_rows = []
for file in mp_files:
    with open(os.path.join(mp_path, file)) as f:
        data = json.load(f)
    # Every feature defaults to 0, so elements absent from a material end up
    # with an amount of 0.
    cur_entry = dict.fromkeys(features, 0)
    # data['formula'] maps element symbol -> amount (e.g. {'W': 1.0}).
    cur_entry.update(data['formula'])
    cur_entry['structure'] = data['spacegroup']['symbol']
    # Extra column that is not part of `features`.
    cur_entry['crystal_system'] = data['spacegroup']['crystal_system']
    cur_entry['volume'] = data['volume']
    cur_entry['bandgap'] = data['band_gap']
    cur_entry['natoms'] = data['nsites']
    cur_entry['ntypes'] = data['nelements']
    cur_entry['energy'] = data['formation_energy_per_atom']
    mp_rows.append(cur_entry)

# Organized as table, one row per file, indexed 0..n-1 in file order.
# With no input files an empty table with the feature columns is produced.
alloy_data = pd.DataFrame(mp_rows) if mp_rows else pd.DataFrame(columns=features)

In [7]:
# Write the assembled Materials Project table to a CSV file.
alloy_data.to_csv('D:/PSED/MP_data.csv')
# Display `data` as the cell output; it is a loop variable left over from a
# loading cell, so it holds whichever record was read last.
data

{'formula': {'W': 1.0},
 'pretty_formula': 'W',
 'nelements': 1,
 'nsites': 4,
 'volume': 67.28638234722543,
 'spacegroup': {'symprec': 0.1,
  'source': 'spglib',
  'symbol': 'Pbcm',
  'number': 57,
  'point_group': 'mmm',
  'crystal_system': 'orthorhombic',
  'hall': '-P 2c 2b'},
 'cif': "# generated using pymatgen\ndata_W\n_symmetry_space_group_name_H-M   'P 1'\n_cell_length_a   2.80966200\n_cell_length_b   4.86387000\n_cell_length_c   4.92369500\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   90.00000000\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   1\n_chemical_formula_structural   W\n_chemical_formula_sum   W4\n_cell_volume   67.28638235\n_cell_formula_units_Z   4\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  W  W0  1  0.79317200  0.56315700  0.250000

In [4]:
import chemparse

# NOTE: this cell relies on `os`, `json`, `pd` and `features` from the
# Materials Project data-preparation cell; run that cell first.
oqmd_path = 'D:/PSED/DataCollect/OQMD/'

# Collect every JSON file below oqmd_path, including sub-folders.
oqmd_files = []
for dirpath, dirnames, filenames in os.walk(oqmd_path):
    for file in filenames:
        if file.endswith('.json'):
            oqmd_files.append(os.path.join(dirpath, file))
# os.walk order depends on the file system; sort for reproducible row order.
oqmd_files.sort()

# One dict per OQMD entry. The table is built once at the end, because
# concatenating a DataFrame inside the loop copies all previous rows on
# every iteration.
oqmd_rows = []
for file in oqmd_files:
    with open(file) as f:
        # Each file stores its list of entries under the 'data' key.
        dataset = json.load(f)['data']
    for data in dataset:
        # Parse the composition string (e.g. 'CoW2') into per-element amounts.
        formula = chemparse.parse_formula(data['name'])
        # Every feature defaults to 0, so absent elements get an amount of 0.
        cur_entry = dict.fromkeys(features, 0)
        cur_entry.update(formula)
        cur_entry['structure'] = data['spacegroup']
        cur_entry['volume'] = data['volume']
        cur_entry['bandgap'] = data['band_gap']
        cur_entry['natoms'] = data['natoms']
        cur_entry['ntypes'] = data['ntypes']
        cur_entry['energy'] = data['delta_e']
        oqmd_rows.append(cur_entry)

# Rows are indexed 0..n-1 in reading order; with no input an empty table with
# the feature columns is produced.
alloy_data1 = pd.DataFrame(oqmd_rows) if oqmd_rows else pd.DataFrame(columns=features)

In [5]:
# Display `data` as the cell output; it is a loop variable left over from a
# loading cell, so it holds whichever entry was iterated last.
data

{'name': 'CoW2',
 'entry_id': 1487152,
 'icsd_id': None,
 'composition_generic': 'AB2',
 'prototype': None,
 'spacegroup': 'P63/mmc',
 'volume': 386.695,
 'ntypes': 2,
 'natoms': 24,
 'band_gap': 0.0,
 'delta_e': 0.815423025,
 'stability': 0.856845761252439}

In [13]:

# Write the assembled OQMD table to a CSV file.
alloy_data1.to_csv('D:/PSED/OQMD_data.csv')