# Where are the stable compositions?
Figure out which systems are the most prevalent

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.misc import comb
from pymatgen import Element, Composition
from matplotlib import pyplot as plt
import itertools
import os
import re

Important variables

In [4]:
oqmd_data = pd.read_csv('oqmd_all.txt', delim_whitespace=True).query('delta_e > -20')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
comp_per_system = {'binary':31, 'ternary':109, 'quaternary': 205}

## Compute the fraction of search space that is "stable"
Read in the "stable" compounds from disk, determine how many are from binaries/ternaries/quaternaties

In [6]:
data = dict((x,pd.read_csv(os.path.join('new-datasets', '%s_stable-0.2.data.gz'%x), sep=' ')) for x in comp_per_system)

In [7]:
data['binary'].head()

Unnamed: 0,composition,delta_e_predicted,stability_predicted
0,F0.7Ne0.3,-3.451,-3.451
1,He0.285714F0.714286,-3.448,-3.448
2,He0.333333F0.666667,-3.447,-3.447
3,F0.714286Ne0.285714,-3.434,-3.434
4,F0.666667Ar0.333333,-3.43,-3.43


Compute the fraction stable per number of components

In [8]:
for word,number in zip(['binary','ternary','quaternary'],[2,3,4]):
    number_stable = len(data[word])
    total_number = comb(89,number,exact=True)*comp_per_system[word]
    print('%s: %d stable, %d total, %.2f%%'%(word, number_stable,
                                             total_number, 
                                             number_stable / total_number * 100))

binary: 3479 stable, 121396 total, 2.87%
ternary: 324238 stable, 12378476 total, 2.62%
quaternary: 10902930 stable, 500533330 total, 2.18%


### Break it down by element

Get elements in each entry

In [9]:
elem_re = re.compile('[A-Z][a-z]?')
def get_elems(s):
    return tuple(sorted(set(elem_re.findall(s))))

In [10]:
for key,dataset in data.items():
    dataset['elements'] = dataset['composition'].apply(get_elems)

In [11]:
element_list = set()
data['binary']['composition'].apply(lambda x: element_list.update(elem_re.findall(x)))
print('Number of elements:', len(element_list))

Number of elements: 89


Which element is the most frequent

In [12]:
%%time
def get_elem_frequency(dataset):
    elem_dict = dict([
        (x,{'count':sum([x in y for y in dataset['elements']])}) for x in element_list
    ])
    return pd.DataFrame.from_dict(elem_dict, 'index')
elem_freq = dict([(name,get_elem_frequency(dataset)) for name,dataset in data.items()])

Wall time: 3min 2s


In [13]:
for item,dataset in elem_freq.items():
    print("\t",item)
    print(dataset.sort_values('count', ascending=False)[:8])

	 binary
    count
He    670
Ne    670
Ar    666
F     614
Br    276
I     250
Cl    193
Pt    180
	 ternary
    count
Ar  56134
Ne  56114
He  56046
Br  51508
I   42587
Cl  35099
F   34595
Se  34411
	 quaternary
      count
Br  2009554
Ne  1949512
Ar  1949354
He  1949227
N   1830705
Se  1786590
I   1732607
S   1518798


Most common systems

In [14]:
common_systems = {}
for item,dataset in data.items():
    print('\t', item)
    common_systems[item] = pd.DataFrame()
    common_systems[item]['counts'] = dataset['elements'].value_counts()
    print(common_systems[item][:5])

	 binary
          counts
(Ar, F)       31
(He, P)       31
(Ar, Ne)      31
(Ar, Br)      31
(Ar, He)      31
	 ternary
              counts
(Ar, He, Pt)     109
(Ar, Ne, P)      109
(Ar, Br, Cl)     109
(Ar, Cl, He)     109
(Ar, Br, He)     109
	 quaternary
                 counts
(Ar, Cl, F, I)      205
(He, Ne, O, Se)     205
(Ar, Cl, I, Se)     205
(Ar, He, O, Se)     205
(Ar, Br, Cl, O)     205


### Analyze the training set
I'm looking to understand why there are so many noble gas predictions

In [15]:
oqmd_data['elements'] = oqmd_data['comp'].apply(get_elems)
oqmd_data['composition_pmg'] = oqmd_data['comp'].apply(lambda x: Composition(x))

In [16]:
oqmd_data['nelem'] = oqmd_data['elements'].apply(lambda x: len(x))

In [17]:
def number_of_training_points(system):
    return len(set(oqmd_data[oqmd_data['elements'] == system]['composition_pmg']))
assert number_of_training_points(('Cl', 'He')) == 0

In [18]:
def training_entries_with_element(element):
    return oqmd_data[[element in x for x in oqmd_data['elements']]]

### Do we have much Noble Gas training data?

In [19]:
noble_gasses = ['He','Ne','Ar','Kr','Xe']
for ng in noble_gasses:
    print(ng, sum([number_of_training_points(tuple(sorted((ng, e)))) != 0 for e in element_list]))

He 0
Ne 0
Ar 0
Kr 1
Xe 2


*Finding*: There is no training data for He, Ne, Ar. Predictions with these elements are therefore unreliable

### Is the stability of a system related to the number of training points or mixing enthalpy?

In [24]:
# TBD

In [20]:
common_systems['binary']['training_points'] = common_systems['binary'].apply(lambda x: number_of_training_points(x.name), axis=1)

Figure out why so m

In [21]:
common_systems['binary']['max_dHf'] = \
    common_systems['binary'].apply(lambda x: oqmd_data[oqmd_data['elements'] == x.name].query('nelem == 2')['delta_e'].min(), axis=1)

*Thought*: Is just a few training entries sufficient to make it realize noble gasses are not very reactive?

## Without Rare Elements

### Get rid of the noble gasses

In [25]:
%%time
def contains_elements(my_elements, elements=noble_gasses):
    return any([x in my_elements for x in elements])
for item,dataset in data.items():
    data[item] = dataset[~ dataset['elements'].apply(contains_elements)]

Wall time: 14.3 s


Repeat the system analysis

In [29]:
%%time
common_systems = {}
for item,dataset in data.items():
    print('\t', item)
    common_systems[item] = pd.DataFrame()
    common_systems[item]['counts'] = dataset['elements'].value_counts()
    print(common_systems[item][:5])

	 binary
          counts
(F, Tm)       16
(Br, Lu)      15
(I, Pa)       15
(Br, Sc)      15
(Br, Er)      15
	 ternary
              counts
(Br, Cl, Er)      90
(Br, I, Pm)       89
(Br, Cl, Lu)      88
(Br, Cl, Ho)      88
(Br, Pm, Se)      84
	 quaternary
                 counts
(Br, I, Pm, Se)     194
(Br, I, Pm, Te)     193
(Br, I, P, Pm)      192
(Br, Cl, I, Lu)     191
(Br, Cl, Er, I)     191
Wall time: 4.5 s


*Finding*: Well, most of the top systems have rare earths

### Get rid of Actinides/Lanthanides

In [34]:
actinides = [Element.from_Z(x).symbol for x in range(57,72)]
lanthanides = [Element.from_Z(x).symbol for x in range(89, 104)]

In [36]:
for item,dataset in data.items():
    data[item] = dataset[~ dataset['elements'].apply(lambda x: np.logical_or(contains_elements(x, actinides), 
                                                                             contains_elements(x, lanthanides)))]

In [39]:
%%time
common_systems = {}
for item,dataset in data.items():
    print('\t', item)
    common_systems[item] = pd.DataFrame()
    common_systems[item]['counts'] = dataset['elements'].value_counts()
    print(common_systems[item][:10])

	 binary
          counts
(F, K)        15
(Br, Sc)      15
(Br, Hf)      15
(F, Rb)       15
(F, Na)       14
(Cs, F)       14
(Br, Y)       13
(B, I)        11
(Be, F)       10
(F, Li)       10
	 ternary
              counts
(Br, Se, Y)       78
(Br, Sc, Se)      77
(Br, I, Sc)       73
(Br, Te, Y)       73
(Br, H, Sc)       72
(Br, S, Y)        70
(Br, Sc, Te)      67
(Br, Hf, S)       67
(Br, H, Hf)       66
(Br, Hf, Se)      65
	 quaternary
                 counts
(Br, I, Sc, Se)     185
(Br, Se, Te, Y)     181
(Br, I, Te, Y)      172
(Br, H, Sc, Se)     170
(Br, Pt, Se, Y)     166
(Br, I, Se, Y)      164
(Au, Br, Se, Y)     164
(Br, Cl, Te, Y)     163
(I, S, Se, Tc)      162
(Br, P, Se, Y)      162
Wall time: 2.71 s
Parser   : 2.72 s
