In [3]:
import ase
import ase.io
from ase.io import xsf
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
%matplotlib inline

In [2]:
# Load the pickled DataFrame that the rest of the notebook samples from.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
all_structures_pickle = "pickle_files/DataFrame_all_structures.pkl"
dataframe_all_structures = pd.read_pickle(all_structures_pickle)

In [3]:
# Preview the first rows of the loaded table to check which columns it has.
dataframe_all_structures.head()

Unnamed: 0,uuid,structure,PMG Structure from ASE Atoms
0,866e918e-7a5f-41e3-980e-038852391b5a,"(Atom('Sr', [0.0, 0.0, 10.1860885], index=0), ...","[[ 0. 0. 10.1860885] Sr, [0. ..."
1,bd77da64-be96-4464-ba94-fb66fe9956aa,"(Atom('La', [1.77025, 3.0661629421, 2.156], in...","[[1.77025 3.06616294 2.156 ] La, [-1.77..."
2,228904a8-1830-4cb9-a3a5-b79513274dbe,"(Atom('U', [1.4934375, 2.5867096279, 0.0], ind...","[[1.4934375 2.58670963 0. ] U, [4.3881..."
3,8bc1d483-534a-4549-a1ff-3c6c8d7a063d,"(Atom('Ca', [15.8287189926, -2.844113418, 3.18...","[[15.82871899 -2.84411342 3.18790827] Ca, [32..."
4,be9b94e7-a2e9-40ad-8ea3-383806ad8d31,"(Atom('H', [2.7642575085, 2.52156326, 9.162138...","[[2.76425751 2.52156326 9.16213858] H, [2.7642..."


In [8]:
# Sanity check on the row labelled 0: does its pymatgen structure contain
# at most 40 atoms? The boolean is the cell's displayed result.
structure = dataframe_all_structures.loc[0, "PMG Structure from ASE Atoms"]
structure.composition.num_atoms <= 40

True

***Collect the first 2500 structures that contain at most 40 atoms***

In [9]:
# Collect the first SAMPLE_SIZE structures that contain at most MAX_ATOMS
# atoms, and remember the row label each one was taken from.
MAX_ATOMS = 40      # upper bound on atoms per accepted structure
SAMPLE_SIZE = 2500  # stop once this many structures have been accepted

# Select the column once; indexing the frame row-by-row would build a full
# row Series on every iteration just to read a single cell.
pmg_structures = dataframe_all_structures["PMG Structure from ASE Atoms"]

structures = []
gridpoints = []
# .loc is label-based, so this assumes the frame carries the default
# 0..n-1 integer index (as the original row lookup did).
for i in range(len(pmg_structures)):
    structure = pmg_structures.loc[i]
    if structure.composition.num_atoms <= MAX_ATOMS:
        structures.append(structure)
        gridpoints.append(i)
    # Checked at the end of the body so the loop stops on the same row as
    # before, right after the final structure has been accepted.
    if len(structures) >= SAMPLE_SIZE:
        break

In [11]:
# Row label of the last structure that was added to the sample.
gridpoints[-1]

7603

In [24]:
# Assemble the sample into a two-column frame: the source row label of each
# structure ('Gridpoint') next to the structure itself ('Structure').
gridpoint_series = pd.Series(gridpoints)
structure_series = pd.Series(structures)
df = pd.concat(
    [gridpoint_series, structure_series],
    axis=1,
    keys=['Gridpoint', 'Structure'],
)

In [26]:
# Save the sampled frame to disk; it is read back from this same path in a later cell.
df.to_pickle("pickle_files/DataFrame_all_structures_sample_2500.pkl")

In [4]:
# Reload the previously saved sample and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
sample_pickle = "pickle_files/DataFrame_all_structures_sample_2500.pkl"
df = pd.read_pickle(sample_pickle)
df.head()

Unnamed: 0,Gridpoint,Structure
0,0,"[[ 0. 0. 10.1860885] Sr, [0. ..."
1,1,"[[1.77025 3.06616294 2.156 ] La, [-1.77..."
2,2,"[[1.4934375 2.58670963 0. ] U, [4.3881..."
3,5,"[[0. 0. 3.7015] Y, [2.654 2.654 0. ]..."
4,15,"[[4.67289568 0.7161217 1.98164056] Ba, [ 2.97..."


In [8]:
# Build a nested dict keyed by the frame's index labels; each entry holds
# the corresponding structure under the "Structure" key.
structure_column = df["Structure"]
features = {}
for name in list(df.index):
    features[name] = {"Structure": structure_column[name]}

In [9]:
%matplotlib inline
from matplotlib import pyplot as plt
from matminer.datasets import load_dataset
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.composition import ElementProperty, Stoichiometry, ValenceOrbital, IonProperty
from matminer.featurizers.structure import (SiteStatsFingerprint, StructuralHeterogeneity,
                                            ChemicalOrdering, StructureComposition, MaximumPackingEfficiency)
from matminer.featurizers.conversions import DictToObject
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy import stats
from tqdm import tqdm_notebook as tqdm
import numpy as np

In [10]:
# Structure-level featurizers plus composition-level featurizers (wrapped in
# StructureComposition so they accept a structure), bundled into one
# MultipleFeaturizer.
structure_featurizers = [
    SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
    StructuralHeterogeneity(),
    ChemicalOrdering(),
    MaximumPackingEfficiency(),
    SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
    StructureComposition(Stoichiometry()),
    StructureComposition(ElementProperty.from_preset("magpie")),
    StructureComposition(ValenceOrbital(props=['frac'])),
    StructureComposition(IonProperty(fast=True)),
]
featurizer = MultipleFeaturizer(structure_featurizers)

In [11]:
# Run DictToObject over the 'Structure' column; the target column is that
# same column and overwrite_data=True is passed, so the result replaces it.
# NOTE(review): this rebinds `df`, so confirm the cell is safe to re-run.
structure_col = 'Structure'
dto = DictToObject(target_col_id=structure_col, overwrite_data=True)
df = dto.featurize_dataframe(df, structure_col)

DictToObject:   0%|          | 0/2500 [00:00<?, ?it/s]

In [12]:
%%time
# Featurize every entry of the 'Structure' column with the combined featurizer.
# ignore_errors=True is passed -- presumably so that a structure which fails to
# featurize does not abort the whole run (confirm against the matminer docs).
# The recorded output of this cell reports a wall time of roughly 47 minutes.
X = featurizer.featurize_many(df['Structure'], ignore_errors=True)

MultipleFeaturizer:   0%|          | 0/2500 [00:00<?, ?it/s]

  "avoid errors caused by the code expecting a float." % self.symbol


CPU times: user 3.06 s, sys: 1.23 s, total: 4.29 s
Wall time: 47min 17s


In [13]:
# Wrap the featurizer output in a DataFrame and preview the first five rows.
# NOTE(review): no column names are supplied, so the columns are plain
# integer positions (0..272 in the recorded output); consider labelling them
# once it is confirmed that nothing downstream relies on integer columns.
df_features_2500structures = pd.DataFrame(data=X)
df_features_2500structures.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,263,264,265,266,267,268,269,270,271,272
0,5.994256,12.155981,6.161724,10.23845,1.484601,0.067254,1.137123,0.79748,0.009735,0.175101,...,106.4375,106.242188,12.0,0.325301,0.433735,0.240964,0.0,False,0.787757,0.159683
1,4.095819,11.162477,7.066658,9.399436,2.350721,0.132525,1.099394,0.629016,0.033573,0.211347,...,40.666667,51.111111,12.0,0.375,0.59375,0.03125,0.0,True,0.745613,0.125914
2,8.859597,12.758354,3.898757,11.147768,1.355516,0.03285,1.047858,0.921029,0.034401,0.083477,...,143.0,54.666667,63.0,0.172414,0.068966,0.655172,0.103448,False,0.183314,0.032139
3,7.848045,23.851948,16.003903,10.359994,2.998212,0.057588,1.259147,0.937673,0.01743,0.122793,...,195.333333,26.37037,166.0,0.27451,0.078431,0.647059,0.0,False,0.244896,0.022559
4,5.19057,13.948977,8.758407,8.653252,2.168457,0.160771,1.247099,0.710476,0.108134,0.585217,...,128.555556,80.493827,194.0,0.269231,0.346154,0.384615,0.0,True,0.803211,0.094046


In [14]:
# Save the feature table to disk as a pickle file.
df_features_2500structures.to_pickle('pickle_files/features_MultipleFeaturizer_2500Structures.pkl')