## Data Processing
This notebook process the OQMD data containing delta_e, volume_pa and bandgap containing 620K entries, 
after removing outliers [delta_e out outside (-20,5) and outside 5 std], we end up with 307K unique entries. 
We compute the physical attributes and elemental fractions, the whole processed data is saved at oqmd_all.csv. 
The train and test are available at respective csv files containing all, only physical or only fractions.

In [1]:
import re, numpy as np, os, sys, pandas
from pymatgen import Composition
import data_utils
import magpie
from pymatgen import Composition
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
from matminer.utils.conversions import str_to_composition
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
oqmd_data_path = 'training-data/oqmd-c.csv'
oqmd_data = pandas.read_csv(oqmd_data_path, sep=r'\s*', na_values= 'None')
oqmd_data.info()




split() requires a non-empty pattern match.


split() requires a non-empty pattern match.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620196 entries, 0 to 620195
Data columns (total 7 columns):
comp         620196 non-null object
energy_pa    620189 non-null float64
volume_pa    620189 non-null float64
magmom_pa    455663 non-null float64
bandgap      619819 non-null float64
delta_e      620196 non-null float64
stability    581531 non-null float64
dtypes: float64(6), object(1)
memory usage: 33.1+ MB


In [3]:
oqmd_data.describe()

Unnamed: 0,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability
count,620189.0,620189.0,455663.0,619819.0,620196.0,581531.0
mean,-5.434664,20.86548,0.406732,0.130374,0.007116,0.58325
std,2.615643,7.870184,0.556038,0.644729,1.996298,2.905439
min,-203.629754,2.72938,-4.248135,0.0,-198.69561,-198.683327
25%,-6.731937,15.3211,0.000346,0.0,-0.254396,0.207708
50%,-5.382519,19.0389,0.175463,0.0,0.110805,0.421494
75%,-4.076177,24.8237,0.717187,0.0,0.467793,0.713214
max,1122.552855,203.685,5.923581,18.437,1126.321181,1126.858389


### Creating a sample small dataset by taking only 11,000 samples

In [4]:
oqmd_data = oqmd_data[:11000]

In [5]:
oqmd_data.rename(columns={'comp':'composition'}, inplace=True)
oqmd_data[:2]

Unnamed: 0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability
0,Cs1Ho1S4Si1,-5.353489,27.1652,6.9e-05,3.024,-1.60894,-0.064029
1,Lu1,-4.511592,28.7838,0.046445,0.0,0.01259,0.01259


In [6]:
oqmd_data.query('delta_e > -20 and delta_e < 5', inplace=True)
oqmd_data = oqmd_data[~oqmd_data['delta_e'].isnull()]
oqmd_data.shape

(11000, 7)

In [7]:
%%time
oqmd_data['comp_obj'] = oqmd_data['composition'].apply(lambda x: Composition(x))

CPU times: user 267 ms, sys: 3.68 ms, total: 271 ms
Wall time: 269 ms


In [8]:
%%time
oqmd_data['pretty_comp'] = oqmd_data['comp_obj'].apply(lambda x: x.reduced_formula)


No electronegativity for Ne. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.


No electronegativity for He. Setting to NaN. This has no physical meaning, and is mainly done to avoid errors caused by the code expecting a float.



CPU times: user 595 ms, sys: 0 ns, total: 595 ms
Wall time: 591 ms


In [9]:
%%time
oqmd_data.sort_values('delta_e', ascending=True, inplace=True)
oqmd_data.drop_duplicates('pretty_comp', keep='first', inplace=True)
print('Reduced dataset to %d entries'%len(oqmd_data))

Reduced dataset to 9398 entries
CPU times: user 8.31 ms, sys: 0 ns, total: 8.31 ms
Wall time: 6.55 ms


In [10]:
oqmd_data['nelems'] = oqmd_data['comp_obj'].apply(lambda x: len(x))

In [11]:
oqmd_data = oqmd_data[np.abs(oqmd_data.delta_e - oqmd_data.delta_e.mean()) <= (5*oqmd_data.delta_e.std())]
oqmd_data.shape

(9398, 10)

In [12]:
oqmd_data.groupby('nelems').count()

Unnamed: 0_level_0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,comp_obj,pretty_comp
nelems,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,60,60,60,32,60,60,60,60,60
2,3168,3168,3168,2254,3133,3168,3168,3168,3168
3,5218,5218,5218,3668,5160,5218,5218,5218,5218
4,844,844,844,445,839,844,844,844,844
5,95,95,95,30,95,95,95,95,95
6,11,11,11,2,11,11,11,11,11
7,2,2,2,0,2,2,2,2,2


In [13]:
oqmd_data.query('nelems > 1', inplace=True)
print (oqmd_data.groupby('nelems').count())
oqmd_data.shape

        composition  energy_pa  volume_pa  magmom_pa  bandgap  delta_e  \
nelems                                                                   
2              3168       3168       3168       2254     3133     3168   
3              5218       5218       5218       3668     5160     5218   
4               844        844        844        445      839      844   
5                95         95         95         30       95       95   
6                11         11         11          2       11       11   
7                 2          2          2          0        2        2   

        stability  comp_obj  pretty_comp  
nelems                                    
2            3168      3168         3168  
3            5218      5218         5218  
4             844       844          844  
5              95        95           95  
6              11        11           11  
7               2         2            2  


(9338, 10)

In [14]:
%%time
oqmd_data['comp_dict'] = oqmd_data['pretty_comp'].apply(lambda x: data_utils.parse_formula(x))

CPU times: user 129 ms, sys: 0 ns, total: 129 ms
Wall time: 128 ms


In [15]:
%%time
elements_present = set()
for i, row in oqmd_data.iterrows():
    elements_present |= set(row['comp_dict'].keys())
print (elements_present)

{'Lu', 'Tm', 'Sm', 'Al', 'Ti', 'Pt', 'Tl', 'Au', 'K', 'Cu', 'Rb', 'Ru', 'Ba', 'Gd', 'Zn', 'Li', 'Th', 'V', 'Mo', 'Hg', 'Cs', 'Pa', 'Pb', 'I', 'Br', 'Dy', 'Np', 'B', 'C', 'As', 'Ta', 'Ir', 'Si', 'Er', 'Tc', 'H', 'Hf', 'Sr', 'Se', 'Bi', 'Nd', 'Co', 'Ce', 'Cl', 'Mg', 'Kr', 'Cd', 'Ho', 'Re', 'Ca', 'Ga', 'Xe', 'N', 'Pm', 'Fe', 'Yb', 'Sc', 'Be', 'Ni', 'Ac', 'Os', 'Zr', 'Ag', 'Eu', 'Te', 'Sb', 'La', 'U', 'Nb', 'Sn', 'Y', 'O', 'Cr', 'In', 'F', 'Mn', 'W', 'Na', 'Pr', 'Pd', 'Pu', 'Tb', 'Ge', 'P', 'S', 'Rh'}
CPU times: user 645 ms, sys: 7.41 ms, total: 652 ms
Wall time: 649 ms


In [16]:
elements = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 
            'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb',
            'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 
            'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',
            'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Th', 'Pa',
            'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt',
            'Ds', 'Rg', 'Cn']


elements_tl = ['H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K',
 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se',
 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In',
 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd',
 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au',
 'Hg', 'Tl', 'Pb', 'Bi', 'Ac','Th', 'Pa', 'U', 'Np', 'Pu']

elem_pos = dict()
i=0
for el in elements:
   elem_pos[el] = i
   i+=1

In [17]:
print (elements_present, len(elements_present))
print ([e for e in elements_present if e not in elements_tl])
print ([e for e in elements_tl if e not in elements_present])

{'Lu', 'Tm', 'Sm', 'Al', 'Ti', 'Pt', 'Tl', 'Au', 'K', 'Cu', 'Rb', 'Ru', 'Ba', 'Gd', 'Zn', 'Li', 'Th', 'V', 'Mo', 'Hg', 'Cs', 'Pa', 'Pb', 'I', 'Br', 'Dy', 'Np', 'B', 'C', 'As', 'Ta', 'Ir', 'Si', 'Er', 'Tc', 'H', 'Hf', 'Sr', 'Se', 'Bi', 'Nd', 'Co', 'Ce', 'Cl', 'Mg', 'Kr', 'Cd', 'Ho', 'Re', 'Ca', 'Ga', 'Xe', 'N', 'Pm', 'Fe', 'Yb', 'Sc', 'Be', 'Ni', 'Ac', 'Os', 'Zr', 'Ag', 'Eu', 'Te', 'Sb', 'La', 'U', 'Nb', 'Sn', 'Y', 'O', 'Cr', 'In', 'F', 'Mn', 'W', 'Na', 'Pr', 'Pd', 'Pu', 'Tb', 'Ge', 'P', 'S', 'Rh'} 86
[]
[]


In [18]:
oqmd_data['comp_fractions'] = oqmd_data['comp_dict'].apply(lambda x: data_utils.get_fractions(x))

In [19]:
print (oqmd_data[:1])
#oqmd_data['solution_term'] = oqmd_data['comp_obj'].apply(lambda x: compute_mixing_term(x))
print (oqmd_data.shape)
oqmd_data = oqmd_data[~oqmd_data['comp_fractions'].isnull()]
print (oqmd_data.shape)

     composition  energy_pa  volume_pa  magmom_pa  bandgap   delta_e  \
1561       Eu1F2  -6.166852    13.6938  -0.000006    7.938 -4.566378   

      stability comp_obj pretty_comp  nelems  \
1561  -0.924527  (Eu, F)        EuF2       2   

                                              comp_dict  \
1561  {'Eu': 0.3333333333333333, 'F': 0.666666666666...   

                                         comp_fractions  
1561  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6666667,...  
(9338, 12)
(9338, 12)


In [20]:
for i,e in enumerate(elements_tl):
    oqmd_data[e] = [ x[i] for x in oqmd_data['comp_fractions']]
oqmd_data.shape

(9338, 98)

In [21]:
oqmd_data.columns

Index(['composition', 'energy_pa', 'volume_pa', 'magmom_pa', 'bandgap',
       'delta_e', 'stability', 'comp_obj', 'pretty_comp', 'nelems',
       'comp_dict', 'comp_fractions', 'H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F',
       'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V',
       'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br',
       'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag',
       'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr',
       'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
       'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi',
       'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu'],
      dtype='object')

In [22]:
feature_calculators = MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                                          cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])

In [23]:
feature_labels = feature_calculators.feature_labels()

In [24]:
print (feature_labels, len(feature_labels))

['0-norm', '2-norm', '3-norm', '5-norm', '7-norm', '10-norm', 'minimum Number', 'maximum Number', 'range Number', 'mean Number', 'avg_dev Number', 'mode Number', 'minimum MendeleevNumber', 'maximum MendeleevNumber', 'range MendeleevNumber', 'mean MendeleevNumber', 'avg_dev MendeleevNumber', 'mode MendeleevNumber', 'minimum AtomicWeight', 'maximum AtomicWeight', 'range AtomicWeight', 'mean AtomicWeight', 'avg_dev AtomicWeight', 'mode AtomicWeight', 'minimum MeltingT', 'maximum MeltingT', 'range MeltingT', 'mean MeltingT', 'avg_dev MeltingT', 'mode MeltingT', 'minimum Column', 'maximum Column', 'range Column', 'mean Column', 'avg_dev Column', 'mode Column', 'minimum Row', 'maximum Row', 'range Row', 'mean Row', 'avg_dev Row', 'mode Row', 'minimum CovalentRadius', 'maximum CovalentRadius', 'range CovalentRadius', 'mean CovalentRadius', 'avg_dev CovalentRadius', 'mode CovalentRadius', 'minimum Electronegativity', 'maximum Electronegativity', 'range Electronegativity', 'mean Electronegativi

In [25]:
%%time
feature_calculators.featurize_dataframe(oqmd_data, col_id='comp_obj');

HBox(children=(IntProgress(value=0, description='MultipleFeaturizer', max=9338, style=ProgressStyle(descriptio…


CPU times: user 746 ms, sys: 111 ms, total: 857 ms
Wall time: 6.88 s


Unnamed: 0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,comp_obj,pretty_comp,nelems,...,mean SpaceGroupNumber,avg_dev SpaceGroupNumber,mode SpaceGroupNumber,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,compound possible,max ionic char,avg ionic char
1561,Eu1F2,-6.166852,13.69380,-0.000006,7.938,-4.566378,-0.924527,"(Eu, F)",EuF2,2,...,86.333333,95.111111,15.0,2.000000,3.333333,0.000000,2.333333,True,0.855156,0.190035
6353,F3La1,-6.772376,13.19320,-0.000004,6.277,-4.445824,0.033410,"(F, La)",LaF3,2,...,59.750000,67.125000,15.0,2.000000,3.750000,0.250000,0.000000,True,0.874268,0.163925
10949,Ba1F8Tm2,-6.442630,12.88490,0.000220,7.659,-4.394542,-0.289808,"(Ba, F, Tm)",BaTm2F8,3,...,67.000000,75.636364,15.0,2.000000,3.636364,0.000000,2.363636,True,0.908097,0.172279
10746,Ba1Er2F8,-6.457514,13.25270,-0.000011,7.716,-4.393368,-0.035464,"(Ba, Er, F)",BaEr2F8,3,...,67.000000,75.636364,15.0,2.000000,3.636364,0.000000,2.181818,True,0.908097,0.172530
7817,Ce1F3,-6.569408,12.30250,,7.853,-4.282422,0.089530,"(Ce, F)",CeF3,2,...,59.750000,67.125000,15.0,2.000000,3.750000,0.250000,0.250000,True,0.870607,0.163239
2155,F3Sc1,-6.942568,15.73650,0.000010,6.597,-4.267783,0.002051,"(F, Sc)",ScF3,2,...,59.750000,67.125000,15.0,2.000000,3.750000,0.250000,0.000000,True,0.820234,0.153794
1028,Ca1F2,-5.849962,12.95550,,7.800,-4.219407,-0.945717,"(Ca, F)",CaF2,2,...,85.000000,93.333333,15.0,2.000000,3.333333,0.000000,0.000000,True,0.891402,0.198089
1294,F2Sr1,-5.738242,15.56920,,7.434,-4.205988,-1.335164,"(F, Sr)",SrF2,2,...,85.000000,93.333333,15.0,2.000000,3.333333,0.000000,0.000000,True,0.899261,0.199836
8997,Cs2F25Th6,-6.668349,14.04940,,0.000,-4.165001,0.115750,"(Cs, F, Th)",Cs2Th6F25,3,...,66.151515,77.502296,15.0,1.939394,3.787879,0.363636,0.000000,False,0.921450,0.157873
6947,F1La1O1,-7.789479,15.49310,0.000014,4.460,-4.150927,0.007307,"(F, La, O)",LaOF,3,...,73.666667,80.222222,12.0,2.000000,3.000000,0.333333,0.000000,True,0.874268,0.187799


In [26]:
oqmd_data[feature_labels].shape

(9338, 145)

In [27]:
oqmd_data = oqmd_data[~oqmd_data[feature_labels].isnull().any(axis=1)]
print (oqmd_data.shape)

(9338, 243)


In [28]:
oqmd_data.describe()

Unnamed: 0,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,nelems,H,Li,Be,...,range SpaceGroupNumber,mean SpaceGroupNumber,avg_dev SpaceGroupNumber,mode SpaceGroupNumber,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,max ionic char,avg ionic char
count,9338.0,9338.0,6399.0,9240.0,9338.0,9338.0,9338.0,9338.0,9338.0,9338.0,...,9338.0,9338.0,9338.0,9338.0,9338.0,9338.0,9338.0,9338.0,9338.0,9338.0
mean,-5.591011,19.697866,0.180296,0.741798,-0.941423,0.027845,2.775862,0.017256,0.015645,0.003091,...,105.215464,161.940491,43.203228,131.652602,1.774062,1.393211,4.070882,2.193044,0.333931,0.064104
std,1.829821,7.159629,0.428427,1.4891,0.892836,1.742347,0.658919,0.094288,0.078174,0.040349,...,79.741196,52.839856,34.649407,86.080921,0.288188,1.120418,2.914522,3.09521,0.263486,0.054437
min,-12.578184,5.5062,-0.374343,0.0,-4.566378,-2.878117,2.0,0.0,0.0,0.0,...,0.0,6.5,0.0,2.0,0.25,0.0,0.0,0.0,0.0,0.0
25%,-6.827956,14.42155,-4.7e-05,0.0,-1.275086,-0.071158,2.0,0.0,0.0,0.0,...,31.0,117.333333,13.777778,15.0,1.6,0.5,1.428571,0.0,0.124732,0.022851
50%,-5.434765,18.76685,0.000352,0.0,-0.649462,-0.017821,3.0,0.0,0.0,0.0,...,84.0,182.4,30.24,166.0,1.916667,1.0,4.0,0.6,0.248906,0.043818
75%,-4.24057,23.721175,0.089557,0.82475,-0.360946,0.013822,3.0,0.0,0.0,0.0,...,182.0,207.0,75.326446,194.0,2.0,2.285714,6.666667,3.5,0.526794,0.099577
max,-0.474256,98.7174,4.473017,10.34,2.928806,90.617378,7.0,0.888889,0.814815,0.833333,...,227.0,229.0,113.5,229.0,2.0,5.333333,10.0,14.0,0.92145,0.230363


In [29]:
oqmd_data = oqmd_data[~oqmd_data['delta_e'].isnull()]
oqmd_data.shape

(9338, 243)

In [30]:
oqmd_data = oqmd_data[~oqmd_data['volume_pa'].isnull()]
oqmd_data.shape

(9338, 243)

In [31]:
oqmd_data = oqmd_data[~oqmd_data['bandgap'].isnull()]
oqmd_data.shape

(9240, 243)

In [32]:
properties = set(oqmd_data.columns.tolist()) - (set(feature_labels) | set(elements_present))
print (properties)

{'pretty_comp', 'comp_dict', 'delta_e', 'nelems', 'magmom_pa', 'comp_obj', 'composition', 'comp_fractions', 'energy_pa', 'stability', 'bandgap', 'volume_pa'}


In [33]:
for col in ['composition', 'comp_fractions', 'comp_dict', 'nelems', 'comp_obj']: oqmd_data.drop(col, axis=1, inplace=True)
oqmd_data.shape

(9240, 238)

In [34]:
properties = set(oqmd_data.columns.tolist()) - (set(feature_labels) | set(elements_present) |{'pretty_comp'})
print (properties)

{'delta_e', 'magmom_pa', 'energy_pa', 'stability', 'bandgap', 'volume_pa'}


In [35]:
oqmd_data = oqmd_data[['pretty_comp']+elements_tl+list(feature_labels)+list(properties)]

In [36]:
print (oqmd_data.columns)
print (oqmd_data.shape)

Index(['pretty_comp', 'H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na',
       ...
       'avg f valence electrons', 'compound possible', 'max ionic char',
       'avg ionic char', 'delta_e', 'magmom_pa', 'energy_pa', 'stability',
       'bandgap', 'volume_pa'],
      dtype='object', length=238)
(9240, 238)


In [37]:
train_data, test_data = train_test_split(oqmd_data, test_size=0.1, random_state=1234567)

In [38]:
train_data.shape, test_data.shape

((8316, 238), (924, 238))

In [39]:
train_data_phys = train_data[['pretty_comp']+list(feature_labels)+list(properties)]
test_data_phys = test_data[['pretty_comp']+list(feature_labels)+list(properties)]
print (train_data_phys.shape, test_data_phys.shape)

(8316, 152) (924, 152)


In [43]:
train_data_phys.to_csv('training-data/sample_train_set.csv', index=False)
test_data_phys.to_csv('training-data/sample_test_set.csv', index=False)