In [1]:
import numpy as np
import pandas as pd
import os
from pymatgen import MPRester
import re
import time

In the above cell all imports are handled.
<br>`from pymatgen import MPRester` is used as an API tool to download the formula and bandwidth of many catalyst

In [2]:
exclude = ['O', 'H', 'He', 'Ne', 'Ar', 'As', 'Se', 'Kr', 'Tc', 'Xe', 'Pm', 'Hf', 'Os', 'Ir', 'Hg', 'Tl', 'Ac', 'Pa', 'U', 'Np', 'Pu']
include = ['Ag', 'Al', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'F', 'Fe', 'Ga', 'Gd', 'Ge', 'Ho', 'I', 'In', 'K', 'La', 'Li', 'Lu', 'Mg', 'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd', 'Ni', 'P', 'Pb', 'Pd', 'Pr', 'Pt', 'Rb', 'Re', 'Rh', 'Ru', 'S', 'Sb', 'Sc', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Te', 'Th', 'Ti', 'Tm', 'V', 'W', 'Y', 'Yb', 'Zn', 'Zr']
cloumn_order = include + ['band_gap']

`exclude`: Catalysts with these elements will be excludet in the dataset.
<br>`include`: Catalysts with these elements will be includuet in the dataset.
<br>`cloumn_order` This list will be used to order the columns in the dataset.

In [3]:
m = MPRester('ifLyENSM2ZCJUWrl') 
data = m.query(criteria={"elements": {"$nin": exclude}, "nelements": {"$lt": 9}, "band_gap": {"$gt": 1e-4}}, properties=["pretty_formula", "band_gap"])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=14594.0), HTML(value='')))

`m = MPRester('ifLyENSM2ZCJUWrl') ` builds a connection to the online database and 
<br>`data = m.query(...)` downloads the formula and band_gap of the catalysts

In [11]:
def composition_from_formula(formula):
    name = formula["pretty_formula"]
    band_gap = formula["band_gap"]
    elements_and_numbers = re.findall('[A-Z][^A-Z]*', name)
    print(elements_and_numbers)
    columns = [re.sub(r'[0-9]+', '', s) for s in elements_and_numbers]
    columns.append('band_gap')
    stoechiometric_faktors = [1  if not any(char.isdigit() for char in element) else int(re.findall(r'\d+', element)[0]) for element in elements_and_numbers]
    max_faktor = max(stoechiometric_faktors)
    content = [faktor / max_faktor for faktor in stoechiometric_faktors]
    content.append(band_gap)
    composition = {'{}'.format(name): content}
    return composition, columns   


d = data[4]
print(d)
f = composition_from_formula(d)
f

{'pretty_formula': 'Ba(CrN2)2', 'band_gap': 0.021700000000000053}
['Ba(', 'Cr', 'N2)2']


({'Ba(CrN2)2': [0.5, 0.5, 1.0, 0.021700000000000053]},
 ['Ba(', 'Cr', 'N)', 'band_gap'])

The function `composition_from_formula` will take a 'formula'-dictionary form the downloadet `data` a and return the compositions, the column-names and the index of the formula. 
<br>The `'pretty-formula'`-string is extracted from the dictionary and stored in the value `name`.
<br>The value of the band-gap is extracted from the dictionary and stored in the value `band_gap`.
<br>The `name`-string is split before each capital letter, resulting in a list of the elements and the stoechiometric faktors.
<br>Example: `name = 'Ba3PN'` -> `elements_and_numbers = ['Ba3', 'P', 'N']`
<br>In the `columns`-variable is a version of the `elements_and_numbers`-variable without any digits saved. This variable will be used as the columns for the DataFrame-creation later. `'band_gap'` is addet to the columns variable. 
<br>For each element in the `elements_and_numbers`-list a 1, if there is no integer in the element, or the number, if there is an integer, is addet to the `stoechiometric_faktors`-list.
<br>Example:  `elements_and_numbers = ['Ba3', 'P', 'N']` -> `stoechiometric_faktors = [3, 1, 1]`
<br>`content` is a list where all elements of the `stoechiometric_faktors`-list are normalised.
<br>The `band_gap`-variable is addet to the `content`-list.
<br>The `compositions`-dictionary is construckted. It contains one item, with the key as the `name`-string and the value as the `content`-list.
<br> The `compostitions`-dictionary and the `columns`-list is returned.

In [5]:
def create_dataset(data):
    start_time = time.time()
    converted_datapoints = [composition_from_formula(datapoint) for datapoint in data]
    print('''__________________________________________________________
    
Converting data-points to small pandas.DataFrame objects.
    ''')
    datapoints_as_dataframes = [pd.DataFrame.from_dict(data=datapoint[0], columns=datapoint[1], orient='index') for datapoint in converted_datapoints]
    print('duration: {:.1f}s'.format(time.time() - start_time))
    start_time = time.time()
    print('''__________________________________________________________
    
Concatenating data-points to dataset.
    ''')
    dataset = pd.concat(datapoints_as_dataframes).fillna(0.)
    dataset = dataset[cloumn_order]
    print('duration: {:.1f}s'.format(time.time() - start_time))
    return dataset
       
    
dataset = create_dataset(data)
dataset


__________________________________________________________
    
Converting data-points to small pandas.DataFrame objects.
    
duration: 14.1s
__________________________________________________________
    
Concatenating data-points to dataset.
    
duration: 127.5s


Unnamed: 0,Ag,Al,Au,B,Ba,Be,Bi,Br,C,Ca,...,Th,Ti,Tm,V,W,Y,Yb,Zn,Zr,band_gap
Ba3PN,0.0,0.000000,0.0,0.0,1.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0255
CrN2,0.0,0.000000,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0173
Mg2MnN3,0.0,0.000000,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0171
Li3CoN2,0.0,0.000000,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0004
Ba(CrN2)2,0.0,0.000000,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CsYbF3,0.0,0.000000,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,7.4310
LiPF6,0.0,0.000000,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,7.6300
ErF3,0.0,0.000000,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,7.8677
SrBeF4,0.0,0.000000,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,7.5240


In [8]:
dataset.head()

Unnamed: 0,Ag,Al,Au,B,Ba,Be,Bi,Br,C,Ca,...,Th,Ti,Tm,V,W,Y,Yb,Zn,Zr,band_gap
Ba3PN,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0255
CrN2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0173
Mg2MnN3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0171
Li3CoN2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0004
Ba(CrN2)2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0217


In [10]:
d = data[4]
print(d)
f = composition_from_formula(d)
print(f)

{'pretty_formula': 'Ba(CrN2)2', 'band_gap': 0.021700000000000053}
({'Ba(CrN2)2': [0.5, 0.5, 1.0, 0.021700000000000053]}, ['Ba(', 'Cr', 'N)', 'band_gap'])
