# Measure Predicted Changes in Phase Diagrams
Given a list of compounds that are predicted to be stable by Dipendra's DL model, measure changes in the phase diagrams.

In [1]:
%matplotlib inline
from pymatgen import Composition, Element
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
import itertools
import os
import re
import pandas as pd
import numpy as np

## Load in the Data
Load in the deep learning predictions. 

In [2]:
%%time
def load_DL_predictions(path):
    """Loads in the predictions from Dipendra, and renames the `delta_e` column to match the `oqmd_data`
    
    Also generates a `PDEntry` for each composition, and computes which system this entry is in
    """
    output = pd.read_csv(path, sep=' ')
    output.rename(columns={'delta_e_predicted': 'delta_e'}, inplace=True)
    output['comp_obj'] = output['composition'].apply(lambda x: Composition(x))
    return output
dl_predictions = dict([(x, load_DL_predictions(os.path.join('new-datasets', '%s_stable-0.2.data.gz'%x)))
     for x in ['binary', 'ternary', 'quaternary']
     ])

CPU times: user 8min 49s, sys: 3.39 s, total: 8min 53s
Wall time: 8min 53s


## Define Utility Operations
These will be useful for finding which compounds to evalaute

In [3]:
elem_re = re.compile('[A-Z][a-z]?')
def get_elems(s):
    return ''.join(sorted(set(elem_re.findall(s))))
assert get_elems('AlFeFe2') == 'AlFe'

In [4]:
%%time
for data in dl_predictions.values():
    data['system'] = data['composition'].apply(get_elems)

CPU times: user 31.7 s, sys: 302 ms, total: 32 s
Wall time: 32 s


## Get the Single Most-Stable Entry per System
Make the searches faster, yield a single entry per system

In [6]:
%%time
def get_most_stable(data):
    """From a dataset, get only the most-stable entry
    
    :param data: DataFrame, most stable DL predictions
    :return: DataFrame"""
    
    return data.sort_values('stability_predicted', ascending=True).drop_duplicates('system', keep='first')
dl_best = dict((k,get_most_stable(v)) for k,v in dl_predictions.items())

CPU times: user 5.27 s, sys: 362 ms, total: 5.63 s
Wall time: 5.62 s


## Get Predictions for Different Sets
This part of the notebook details picking different types of compounds 

### Defining Element Lists
Useful when coming up with search spaces later

In [7]:
noble_gases = ['He', 'Ne', 'Ar', 'Kr', 'Xe']
alkali_metals = ['Li', 'Na', 'K'] # , 'Rb', 'Cs'] - Only do the common ones
threed_tms = ['Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn']
actinides = ['Ac', 'Th', 'Pa', 'U', 'Np', 'Pu'] # VASP only has these
lanthanides = set([Element.from_Z(x).symbol for x in range(57, 72)])
chalcogens = ['O', 'S', 'Se', 'Te']
pnictides = ['N', 'P', 'As', 'Sb']
halogens = ['F', 'Cl', 'Br', 'I']
tms = set([Element.from_Z(x).symbol for x in range(1,102) if Element.from_Z(x).is_transition_metal > 0])
metals = tms.union({'Li','Na','K'}).union({'Al', 'Ga', 'In', 'Sn', 'Pb', 'Bi'})
metals_no_ReOsIrSc = metals - {'Re', 'Os', 'Ir', 'Sc'}

Assemble a list of all elements found in our datasets

In [10]:
element_list = set()
dl_predictions['ternary']['composition'].apply(lambda x: element_list.update(elem_re.findall(x)))
print('Number of elements:', len(element_list))

Number of elements: 89


Remove noble gases, lanthanides, and actinides

In [11]:
element_list.difference_update(noble_gases)
element_list.difference_update(actinides)
element_list.difference_update(lanthanides)
print('Number of elements:', len(element_list))

Number of elements: 63


### Scanning different sets

In [12]:
def assemble_list_of_systems(order):
    """Create a DataFrame of all possible systems with a certain number of elements"""
    output = pd.DataFrame()
    output['elements'] = list(itertools.combinations(element_list, order))
    output['system'] = [''.join(sorted(s)) for s in output['elements']]
    return output
binary_systems = assemble_list_of_systems(2)
print('Generated %d binary systems'%len(binary_systems))

Generated 1953 binary systems


In [13]:
ternary_systems = assemble_list_of_systems(3)
print('Generated %d ternary systems'%len(ternary_systems))

Generated 39711 ternary systems


In [14]:
quaternary_systems = assemble_list_of_systems(4)
print('Generated %d quaternary systems'%len(quaternary_systems))

Generated 595665 quaternary systems


Get the ternary systems that contain an common Alkali metal

#### [Li,K,Na]-Containing Compounds

In [15]:
f = lambda els: any([e in ['Li', 'Na', 'K'] for e in els])

In [16]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 4070 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
20039,K0.1Sc0.2Br0.7,-1.884,-1.134187,"(K, Sc, Br)",BrKSc
22562,K0.142857Br0.714286Hf0.142857,-1.732,-1.090053,"(K, Br, Hf)",BrHfK


In [17]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 106875 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
78799,F0.5Na0.25Cd0.125Cs0.125,-4.062,-1.380258,"(F, Na, Cd, Cs)",CdCsFNa
85547,F0.555556Na0.222222Cr0.111111Pb0.111111,-4.01,-1.359743,"(F, Na, Cr, Pb)",CrFNaPb


#### [Li,K,Na]-Containing Ternaries w/o Halogen

In [18]:
%%time
f = lambda els: any([e in ['Li', 'Na', 'K'] for e in els]) and not any([e in halogens for e in els])

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs


In [19]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 2285 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
57605,N0.555556K0.222222W0.222222,-0.953,-0.734049,"(N, K, W)",KNW
74602,Li0.1N0.5Ti0.4,-1.767,-0.633168,"(Li, N, Ti)",LiNTi


In [20]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 54795 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
257943,O0.444444Na0.111111Ba0.333333Pt0.111111,-3.218,-1.070698,"(O, Na, Ba, Pt)",BaNaOPt
546409,N0.444444P0.111111K0.222222W0.222222,-1.202,-0.883226,"(N, P, K, W)",KNPW


#### Chalcohalides

In [21]:
f = lambda els: any([e in chalcogens for e in els]) and any([e in halogens for e in els])

In [22]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 2890 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
15379,Sc0.25Se0.125Br0.625,-1.866,-1.239871,"(Sc, Se, Br)",BrScSe
15731,S0.1Sc0.3Br0.6,-2.004,-1.230478,"(S, Sc, Br)",BrSSc


In [23]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 94175 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
785,O0.4Cu0.2Sr0.3I0.1,-4.64,-2.570981,"(O, Cu, Sr, I)",CuIOSr
61980,O0.2Zr0.6Rh0.1I0.1,-2.906,-1.439151,"(O, Zr, Rh, I)",IORhZr


#### Oxides

In [24]:
f = lambda els: any([e is 'O' for e in els])

In [25]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 4085 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
23032,O0.111111Br0.666667Hf0.222222,-1.829,-1.082888,"(O, Br, Hf)",BrHfO
28616,O0.1Sc0.3Br0.6,-2.273,-0.999341,"(O, Sc, Br)",BrOSc


In [26]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 95565 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
785,O0.4Cu0.2Sr0.3I0.1,-4.64,-2.570981,"(O, Cu, Sr, I)",CuIOSr
61980,O0.2Zr0.6Rh0.1I0.1,-2.906,-1.439151,"(O, Zr, Rh, I)",IORhZr


#### Metal Oxides

In [27]:
f = lambda els: any([e is 'O' for e in els]) and sum([e in metals for e in els]) == (len(els) - 1)

In [28]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 1210 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
88789,O0.625K0.25Os0.125,-1.91,-0.56325,"(O, K, Os)",KOOs
90416,O0.6Ru0.2Ag0.2,-1.422,-0.556339,"(O, Ru, Ag)",AgORu


In [29]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 15775 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
676396,O0.6Al0.1V0.2Y0.1,-3.931,-0.832039,"(O, Al, V, Y)",AlOVY
963203,O0.625Ti0.125Mn0.125Sn0.125,-3.254,-0.748019,"(O, Ti, Mn, Sn)",MnOSnTi


#### $3d$ Metal Oxides

In [30]:
f = lambda els: any([e is 'O' for e in els]) and sum([e in threed_tms for e in els]) == (len(els) - 1)

In [31]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 40 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
182864,O0.6Ti0.2Cr0.2,-3.133,-0.328112,"(O, Ti, Cr)",CrOTi
211008,O0.6Ti0.2Mn0.2,-3.129,-0.291521,"(O, Ti, Mn)",MnOTi


In [32]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 20 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
8249303,O0.6Ti0.2Cr0.1Mn0.1,-3.087,-0.250896,"(O, Ti, Cr, Mn)",CrMnOTi
8320390,O0.6Sc0.1Ti0.1Cr0.2,-3.224,-0.249277,"(O, Sc, Ti, Cr)",CrOScTi


#### Intermetallics

In [33]:
f = lambda els: all([e in metals for e in els])

In [34]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 760 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
120610,Al0.555556Hf0.111111Ir0.333333,-1.231,-0.452109,"(Al, Hf, Ir)",AlHfIr
146028,Al0.5Y0.125Ir0.375,-1.226,-0.391448,"(Al, Y, Ir)",AlIrY


In [35]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 2310 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
211,Sc0.5Ni0.1Mo0.1Sn0.3,-3.517,-2.8886,"(Sc, Ni, Mo, Sn)",MoNiScSn
3449702,Al0.625Zr0.125Rh0.125Os0.125,-1.125,-0.435015,"(Al, Zr, Rh, Os)",AlOsRhZr


#### Ternary Intermetallics (No Ir/Os/Re/Sc)

In [36]:
f = lambda els: all([e in metals_no_ReOsIrSc for e in els])

In [37]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 335 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
166948,Al0.5Ti0.1Rh0.4,-1.318,-0.353031,"(Al, Ti, Rh)",AlRhTi
169556,Al0.5Zr0.1Rh0.4,-1.335,-0.348634,"(Al, Zr, Rh)",AlRhZr


In [38]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 765 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
6042185,Al0.4Mn0.1Zr0.1Pt0.4,-1.255,-0.311566,"(Al, Mn, Zr, Pt)",AlMnPtZr
6565275,Al0.4Ga0.1Hf0.1Pt0.4,-1.262,-0.294909,"(Al, Ga, Hf, Pt)",AlGaHfPt


#### Ternary Intermetallics w/ at least 1 $3d$ metals

In [39]:
f =lambda els: all([e in metals_no_ReOsIrSc for e in els]) and sum([e in threed_tms for e in els]) > 1

In [40]:
possible_systems = set(ternary_systems[ternary_systems['elements'].apply(f)]['system'])
results = dl_best['ternary'][dl_best['ternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 10 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
256303,Ti0.222222Ni0.444444In0.333333,-0.589,-0.246996,"(Ti, Ni, In)",InNiTi
317887,V0.25Mn0.625Ga0.125,-0.387,-0.203642,"(V, Mn, Ga)",GaMnV


In [41]:
possible_systems = set(quaternary_systems[quaternary_systems['elements'].apply(f)]['system'])
results = dl_best['quaternary'][dl_best['quaternary']['system'].apply(lambda x: x in possible_systems)]
print('Found %d matches. Top 2:'%results.size)
results.head(2)

Found 105 matches. Top 2:


Unnamed: 0,composition,delta_e,stability_predicted,comp_obj,system
6832845,Al0.5Ti0.1Ni0.1Rh0.3,-1.172,-0.287016,"(Al, Ti, Ni, Rh)",AlNiRhTi
7217165,Al0.6Cr0.1Co0.1Rh0.2,-0.928,-0.276352,"(Al, Cr, Co, Rh)",AlCoCrRh
