In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from CBFV import composition
from matminer.featurizers.structure import DensityFeatures
from matminer.featurizers.conversions import StructureToComposition
from matminer.utils.io import load_dataframe_from_json
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Base directory for all file locations: the current working directory at the
# time this cell runs (so paths depend on where the kernel was started).
PATH = os.getcwd()
# Location of the JSON dataset that the loading cell passes to
# load_dataframe_from_json.
data_path = os.path.join(PATH, 'data/defect_df_bg.json')

In [13]:
# Read the JSON file at data_path into the working dataframe `df` using
# matminer's loader.
df = load_dataframe_from_json(data_path)

Reading file /Users/choi/Documents/Evo_prediction_ml/data/defect_df_bg.json: 0it [00:20, ?it/s]         
Decoding objects from /Users/choi/Documents/Evo_prediction_ml/data/defect_df_bg.


In [14]:
# Run matminer's StructureToComposition featurizer over the
# 'defect_structure_original' column and keep the returned dataframe as `df`.
# NOTE(review): the extraction cell reads df['composition'], so this step
# presumably adds that column (featurizer default) — confirm.
stc = StructureToComposition()
df = stc.featurize_dataframe(df, 'defect_structure_original')

StructureToComposition:   0%|          | 0/1667 [00:00<?, ?it/s]

In [15]:
# Add density-derived descriptors to df if necessary.
#
# Compared with a per-row loop this version:
#   * builds the DensityFeatures featurizer once instead of once per row,
#   * iterates the structure column directly instead of the chained
#     df[col][i] lookup (which is label based and ambiguous for duplicated
#     index labels),
#   * writes each descriptor column in a single assignment instead of going
#     through `.at`, which is a scalar accessor and not meant for writing
#     several columns at once.
density_columns = ['density', 'vpa', 'packing fraction']
density_featurizer = DensityFeatures(desired_features=density_columns)

# One row of descriptor values per structure, in the same order as df's rows.
# The reshape keeps the column slicing below valid even when df is empty.
density_values = np.asarray(
    [density_featurizer.featurize(structure)
     for structure in df['defect_structure_original']],
    dtype=float,
).reshape(len(df), len(density_columns))

# Positional assignment: row k of density_values belongs to row k of df, so
# no index alignment is involved and re-running the cell simply overwrites
# the same three columns.
for position, column in enumerate(density_columns):
    df[column] = density_values[:, position]

In [16]:
# Keep only the composition and the defect formation energy, renamed to
# 'formula' and 'target'.
extract_df = df[['composition', 'defect_formation_energy']].rename(
    columns={'composition': 'formula', 'defect_formation_energy': 'target'}
)

# Separate working frame in which every formula is its string form with all
# spaces removed; extract_df itself is left untouched.
clean_df = extract_df.assign(
    formula=extract_df['formula'].apply(lambda comp: str(comp).replace(" ", ""))
)

clean_df

Unnamed: 0,formula,target
1917,Ta64Zn96O255,6.034068
1827,Sr18W6O35,6.877819
352,Ca72Mg36W36O215,6.900110
91,Ba54Mg27B54O161,7.094550
1868,Sr48Ge48Pb48O191,4.234480
...,...,...
1638,Rb48Sb48Mo48O287,3.978585
1095,Li48Zr16O55,7.250386
1130,Li32Nb32O63,6.372364
1294,Na72Au24O71,2.971172


In [17]:
original_length = len(clean_df)

# Rows with a missing formula or a missing target value.
bool_nans_formula = clean_df['formula'].isnull()
bool_nans_target = clean_df['target'].isnull()

# Rows whose defect energy is less than or equal to 0.
bool_invalid_target = clean_df['target'] <= 0

# Combine the three criteria once, while all masks still share clean_df's
# index. Filtering with a single mask avoids re-applying masks to an already
# shrunk frame, removes exactly the flagged rows even if index labels repeat,
# and lists each removed row only once even when it matches several criteria.
bool_removed = bool_nans_formula | bool_nans_target | bool_invalid_target

removed = clean_df.loc[bool_removed].reset_index(drop=True)
clean_df = clean_df.loc[~bool_removed].copy()

print(f'Number of removed elements: {original_length-len(clean_df)}')
print(f'Removed elements: \n', removed)

Number of removed elements: 2
Removed elements: 
          formula      target
0       Cs150O74   -1.055934
1  K108Cd108O161 -143.321646


In [18]:
# Every descriptor set is generated from the same clean_df with the same
# options; only the element-property table (elem_prop) differs per call.
# Values returned after X and y are collected into `_` and not used here.
_cbfv_options = dict(drop_duplicates=False, extend_features=True, sum_feat=True)

X_jarvis, y_jarvis, *_ = composition.generate_features(
    clean_df, elem_prop='jarvis', **_cbfv_options)
X_magpie, y_magpie, *_ = composition.generate_features(
    clean_df, elem_prop='magpie', **_cbfv_options)
X_mat2vec, y_mat2vec, *_ = composition.generate_features(
    clean_df, elem_prop='mat2vec', **_cbfv_options)
X_oliynyk, y_oliynyk, *_ = composition.generate_features(
    clean_df, elem_prop='oliynyk', **_cbfv_options)
X_onehot, y_onehot, *_ = composition.generate_features(
    clean_df, elem_prop='onehot', **_cbfv_options)
X_random_200, y_random_200, *_ = composition.generate_features(
    clean_df, elem_prop='random_200', **_cbfv_options)

Processing Input Data: 100%|█████████████| 1665/1665 [00:00<00:00, 53477.06it/s]


	Featurizing Compositions...


Assigning Features...: 100%|█████████████| 1665/1665 [00:00<00:00, 20417.67it/s]

	Creating Pandas Objects...



Processing Input Data: 100%|█████████████| 1665/1665 [00:00<00:00, 55926.74it/s]


	Featurizing Compositions...


Assigning Features...: 100%|█████████████| 1665/1665 [00:00<00:00, 25789.61it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|█████████████| 1665/1665 [00:00<00:00, 54064.12it/s]


	Featurizing Compositions...


Assigning Features...: 100%|█████████████| 1665/1665 [00:00<00:00, 23853.09it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|█████████████| 1665/1665 [00:00<00:00, 55563.20it/s]


	Featurizing Compositions...


Assigning Features...: 100%|█████████████| 1665/1665 [00:00<00:00, 24450.38it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|█████████████| 1665/1665 [00:00<00:00, 53723.07it/s]


	Featurizing Compositions...


Assigning Features...: 100%|█████████████| 1665/1665 [00:00<00:00, 24918.08it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|█████████████| 1665/1665 [00:00<00:00, 54393.84it/s]


	Featurizing Compositions...


Assigning Features...: 100%|█████████████| 1665/1665 [00:00<00:00, 23628.48it/s]


	Creating Pandas Objects...


In [19]:
# Report the number of feature columns in each descriptor set, one line per
# set in the form "<name> <column count>".
_feature_sets = (
    ("X_jarvis", X_jarvis),
    ("X_magpie", X_magpie),
    ("X_mat2vec", X_mat2vec),
    ("X_oliynyk", X_oliynyk),
    ("X_onehot", X_onehot),
    ("X_random_200", X_random_200),
)
for _set_name, _set_frame in _feature_sets:
    print(_set_name, len(_set_frame.columns))

X_jarvis 3066
X_magpie 154
X_mat2vec 1400
X_oliynyk 308
X_onehot 833
X_random_200 1400


In [20]:
# Output file for each descriptor set, all below PATH/data/descriptors.
out_path_jarvis = os.path.join(PATH, 'data/descriptors/jarvis.bin')
out_path_magpie = os.path.join(PATH, 'data/descriptors/magpie.bin')
out_path_mat2vec = os.path.join(PATH, 'data/descriptors/mat2vec.bin')
out_path_oliynyk = os.path.join(PATH, 'data/descriptors/oliynyk.bin')
out_path_onehot = os.path.join(PATH, 'data/descriptors/onehot.bin')
out_path_random_200 = os.path.join(PATH, 'data/descriptors/random_200.bin')

# Parallel lists: out_path[k] receives the (X, y) pair stored in x_value_raw[k].
out_path = [out_path_jarvis, out_path_magpie, out_path_mat2vec, out_path_oliynyk, out_path_onehot, out_path_random_200]
x_value_raw = [(X_jarvis, y_jarvis), (X_magpie, y_magpie), (X_mat2vec, y_mat2vec), (X_oliynyk, y_oliynyk), (X_onehot, y_onehot), (X_random_200, y_random_200)]

# Write each (X, y) tuple as one pickle file.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# .bin files from locations you trust.
for path, x_value in zip(out_path, x_value_raw):
    # Create the target directory on demand so a fresh checkout without
    # data/descriptors does not fail with FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(x_value, f)