In [1]:
import ase
import ase.io
from ase.io import xsf
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
%matplotlib inline

In [2]:
# Load the table of structures from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
dataframe_all_structures = pd.read_pickle("DataFrame_all_structures.pkl")

In [3]:
# Preview the first rows to check which columns were loaded.
dataframe_all_structures.head()

Unnamed: 0,uuid,structure,PMG Structure from ASE Atoms
0,866e918e-7a5f-41e3-980e-038852391b5a,"(Atom('Sr', [0.0, 0.0, 10.1860885], index=0), ...","[[ 0. 0. 10.1860885] Sr, [0. ..."
1,bd77da64-be96-4464-ba94-fb66fe9956aa,"(Atom('La', [1.77025, 3.0661629421, 2.156], in...","[[1.77025 3.06616294 2.156 ] La, [-1.77..."
2,228904a8-1830-4cb9-a3a5-b79513274dbe,"(Atom('U', [1.4934375, 2.5867096279, 0.0], ind...","[[1.4934375 2.58670963 0. ] U, [4.3881..."
3,8bc1d483-534a-4549-a1ff-3c6c8d7a063d,"(Atom('Ca', [15.8287189926, -2.844113418, 3.18...","[[15.82871899 -2.84411342 3.18790827] Ca, [32..."
4,be9b94e7-a2e9-40ad-8ea3-383806ad8d31,"(Atom('H', [2.7642575085, 2.52156326, 9.162138...","[[2.76425751 2.52156326 9.16213858] H, [2.7642..."


***Read the first 2500 structures***

In [4]:
# Number of leading structures (row labels 0 .. N_STRUCTURES - 1) used in this run.
N_STRUCTURES = 2500
gridpoints = list(range(N_STRUCTURES))
# Select all requested rows in one label-based lookup instead of building a
# full row Series per structure; a missing label still raises KeyError.
structures = dataframe_all_structures.loc[
    gridpoints, "PMG Structure from ASE Atoms"
].tolist()

In [5]:
# Map each grid point to a one-entry record holding its structure, looked up
# by label in the "PMG Structure from ASE Atoms" column.
features = {}
for name in gridpoints:
    features[name] = {
        "structure": dataframe_all_structures["PMG Structure from ASE Atoms"][name]
    }

In [6]:
# `features` is keyed by grid point, so the frame is built with one column per
# grid point and then transposed to get one row per grid point.
data = pd.DataFrame(features).transpose()
data

Unnamed: 0,structure
0,"[[ 0. 0. 10.1860885] Sr, [0. ..."
1,"[[1.77025 3.06616294 2.156 ] La, [-1.77..."
2,"[[1.4934375 2.58670963 0. ] U, [4.3881..."
3,"[[15.82871899 -2.84411342 3.18790827] Ca, [32..."
4,"[[2.76425751 2.52156326 9.16213858] H, [2.7642..."
...,...
2495,"[[2.542 2.542 2.542] Lu, [7.626 7.626 7.626] L..."
2496,"[[ 3.2263759 8.6636025 11.8091422] H, [0.7528..."
2497,"[[1.59837844 0.3167142 3.43753817] H, [ 6.954..."
2498,"[[ 9.58197646 10.46131064 4.15217169] Ni, [ 5..."


In [7]:
%matplotlib inline
from matplotlib import pyplot as plt
from matminer.datasets import load_dataset
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.composition import ElementProperty, Stoichiometry, ValenceOrbital, IonProperty
from matminer.featurizers.structure import (SiteStatsFingerprint, StructuralHeterogeneity,
                                            ChemicalOrdering, StructureComposition, MaximumPackingEfficiency)
from matminer.featurizers.conversions import DictToObject
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy import stats
from tqdm import tqdm_notebook as tqdm
import numpy as np

In [8]:
# Featurizers imported from matminer.featurizers.structure.
structure_featurizers = [
    SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
    StructuralHeterogeneity(),
    ChemicalOrdering(),
    MaximumPackingEfficiency(),
    SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
]

# Composition featurizers, each wrapped in StructureComposition.
composition_featurizers = [
    StructureComposition(composition_featurizer)
    for composition_featurizer in (
        Stoichiometry(),
        ElementProperty.from_preset("magpie"),
        ValenceOrbital(props=['frac']),
        IonProperty(fast=True),
    )
]

# One combined featurizer: structure-based entries first, then the
# composition-based ones, in the same order as listed above.
featurizer = MultipleFeaturizer(structure_featurizers + composition_featurizers)

In [9]:
# Run matminer's DictToObject conversion over the 'structure' column, writing
# the result back into that same column (target_col_id='structure',
# overwrite_data=True).
# NOTE(review): the values displayed for this column already look like built
# structure objects rather than dicts, so this step may be a no-op — confirm.
dto = DictToObject(target_col_id='structure', overwrite_data=True)
# Rebinds `data`; re-running this cell feeds the already-converted frame back in.
data = dto.featurize_dataframe(data, 'structure')

DictToObject:   0%|          | 0/2500 [00:00<?, ?it/s]

In [None]:
%%time
X = featurizer.featurize_many(data['structure'], ignore_errors=True)

MultipleFeaturizer:   0%|          | 0/2500 [00:00<?, ?it/s]

Exception in thread Thread-8:
Traceback (most recent call last):
  File "/home/mvahdat/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/mvahdat/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mvahdat/anaconda3/lib/python3.7/multiprocessing/pool.py", line 412, in _handle_workers
    pool._maintain_pool()
  File "/home/mvahdat/anaconda3/lib/python3.7/multiprocessing/pool.py", line 248, in _maintain_pool
    self._repopulate_pool()
  File "/home/mvahdat/anaconda3/lib/python3.7/multiprocessing/pool.py", line 241, in _repopulate_pool
    w.start()
  File "/home/mvahdat/anaconda3/lib/python3.7/multiprocessing/process.py", line 112, in start
    self._popen = self._Popen(self)
  File "/home/mvahdat/anaconda3/lib/python3.7/multiprocessing/context.py", line 277, in _Popen
    return Popen(process_obj)
  File "/home/mvahdat/anaconda3/lib/python3.7/multiprocessing/popen_fork.py", l

In [None]:
# Name the columns with the featurizer's own labels so the saved table stays
# interpretable; without them the columns are just integer positions.
df_features = pd.DataFrame(X, columns=featurizer.feature_labels())
df_features.head()

In [None]:
# Persist the feature table to a pickle file in the current working directory.
df_features.to_pickle('features_MultipleFeaturizer_First2500Structures.pkl')

In [None]:
# List the working directory (IPython automagic for `%ls`), e.g. to check the pickle was written.
ls