In [2]:
import random
import os
import shutil
import pandas as pd
from tqdm import tqdm
import csv
import numpy as np
import math
from jarvis.ai.pkgs.utils import regr_scores
from jarvis.db.figshare import data, get_request_data
from jarvis.core.atoms import Atoms
import zipfile
import json
import time
tqdm.pandas()

In [14]:
def to_unitcell(structure):
    '''
    Make sure coordinates are within the unit cell.
    Used before using structural featurizer.

    Parameters
    ----------
    structure :  pymatgen.core.structure.Structure

    Returns
    -------
    structure :  pymatgen.core.structure.Structure
    '''    
    [site.to_unit_cell(in_place=True) for site in structure.sites]
    return structure
def Featurizer(
        df,
        col_id='structure',
        ignore_errors=True,
        chunksize=20
        ):
    """
    Featurize a dataframe using Matminter featurizers

    Parameters
    ----------
    df : Pandas.DataFrame 
        DataFrame with a column named "structure"

    Returns
    -------
    A DataFrame containing labels as the first columns and features as the rest 

    """
    # For featurization
    from matminer.featurizers.base import MultipleFeaturizer
    from matminer.featurizers.conversions import StrToComposition
    from matminer.featurizers.composition import (ElementProperty, 
                                                  Stoichiometry, 
                                                  ValenceOrbital, 
                                                  IonProperty)
    from matminer.featurizers.structure import (SiteStatsFingerprint, 
                                                StructuralHeterogeneity,
                                                ChemicalOrdering, 
                                                StructureComposition, 
                                                MaximumPackingEfficiency)   
    # Make sure df is a DataFrame
    if isinstance(df, pd.Series):
        df = df.to_frame()   
    # Use composition featurizers if inputs are compositions, otherwise use
    # both composition and structure featurizers
    if col_id != 'structure':
        # convert string to composition 
        a = StrToComposition()
        a._overwrite_data = True
        df[col_id] = a.featurize_dataframe(df,col_id,pbar=False)['composition']
        # no structural features
        struc_feat = []
        # 145 compositional features
        compo_feat = [
            Stoichiometry(),
            ElementProperty.from_preset("magpie"),
            ValenceOrbital(props=['frac']),
            IonProperty(fast=True)
            ]
    else:
        # Ensure sites are within unit cells
        df[col_id] = df[col_id].apply(to_unitcell)
        # 128 structural feature
        struc_feat = [
            SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"), 
            SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
            StructuralHeterogeneity(),
            MaximumPackingEfficiency(),
            ChemicalOrdering()
            ]       
        # 145 compositional features
        compo_feat = [
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset("magpie")),
            StructureComposition(ValenceOrbital(props=['frac'])),
            StructureComposition(IonProperty(fast=True))
            ]
    # Define the featurizer
    featurizer = MultipleFeaturizer(struc_feat+compo_feat)    
    # Set the chunksize used for Pool.map parallelisation
    featurizer.set_chunksize(chunksize=chunksize)
    X = featurizer.featurize_dataframe(df,col_id,ignore_errors=ignore_errors)  
    # check failed entries    
    failed = np.any(pd.isnull(X.iloc[:,df.shape[1]:]), axis=1)
    if np.sum(failed) > 0:
        print(f'Number failed: {np.sum(failed)}/{len(failed)}')
    print('Featurization completed.')
    return X, failed


In [8]:
db='dft_3d'
dat = data(db)
n_features = 273
col_id = 'structure'
df = pd.DataFrame(dat)


Obtaining 3D dataset 76k ...
Reference:https://www.nature.com/articles/s41524-020-00440-1
Other versions:https://doi.org/10.6084/m9.figshare.6815699
Loading the zipfile...
Loading completed.


In [10]:
df.head()
df_feature = pd.read_csv('X_dft_3d.csv', index_col=0)
df_missing = df[~df.jid.isin(df_feature.jid)]

  df_feature = pd.read_csv('X_dft_3d.csv', index_col=0)


In [19]:
df_missing["structure"] = df_missing["atoms"].progress_apply(
    lambda x: Atoms.from_dict(x).pymatgen_converter()
)
X_file_all = 'X_dft_3d_all.csv'
X_all, failed = Featurizer(df,col_id=col_id)
X_all.to_csv(X_file_all)

100%|██████████| 20281/20281 [00:03<00:00, 6386.94it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_missing["structure"] = df_missing["atoms"].progress_apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_id] = df[col_id].apply(to_unitcell)
MultipleFeaturizer: 100%|██████████| 20281/20281 [12:14<00:00, 27.60it/s]


Number failed: 1724/20281
Featurization completed.




In [21]:
X

Unnamed: 0,jid,spg_number,spg_symbol,formula,formation_energy_peratom,func,optb88vdw_bandgap,atoms,slme,magmom_oszicar,...,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,frac s valence electrons,frac p valence electrons,frac d valence electrons,frac f valence electrons,compound possible,max ionic char,avg ionic char
759,JVASP-1759,194,P6_3/mmc,N,5.28308,OptB88vdW,2.340,"{'lattice_mat': [[1.8200645909286142, -3.15244...",na,6.0,...,194.0,0.00,194.0,0.400000,0.600000,0.000000,0.000000,True,0.000000,0.000000
20291,JVASP-21212,225,Fm-3m,N,5.28329,OptB88vdW,2.557,"{'lattice_mat': [[5.1420260220398735, -0.0, -0...",na,na,...,194.0,0.00,194.0,0.400000,0.600000,0.000000,0.000000,True,0.000000,0.000000
34444,JVASP-25167,194,P6_3/mmc,He,0.00000,OptB88vdW,17.733,"{'lattice_mat': [[2.9095085625428467, 3.735584...",0.0,0.0,...,225.0,0.00,225.0,1.000000,0.000000,0.000000,0.000000,True,0.000000,0.000000
34574,JVASP-25278,225,Fm-3m,He,0.00472,OptB88vdW,17.968,"{'lattice_mat': [[2.3721339131314583, -9.69183...",0.0,0.0,...,225.0,0.00,225.0,1.000000,0.000000,0.000000,0.000000,True,0.000000,0.000000
34610,JVASP-801,229,Im-3m,He,0.00012,OptB88vdW,17.835,"{'lattice_mat': [[2.6234209456068904, 1.295777...",0.0,0.0,...,225.0,0.00,225.0,1.000000,0.000000,0.000000,0.000000,True,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75988,JVASP-156020,139,I4/mmm,AcRh2Pb2,-0.30652,OptB88vdW,0.000,"{'lattice_mat': [[-2.374509402119681, 2.374509...",na,0.0,...,225.0,0.00,225.0,0.103896,0.051948,0.480519,0.363636,False,0.314924,0.048812
75989,JVASP-156398,216,F-43m,PrTlZn,-0.34112,OptB88vdW,0.000,"{'lattice_mat': [[-0.0, 3.4210598347774503, 3....",na,0.0,...,194.0,0.00,194.0,0.136364,0.022727,0.454545,0.386364,False,0.065366,0.013761
75990,JVASP-156099,139,I4/mmm,BaIn2Bi2,-0.39352,OptB88vdW,0.000,"{'lattice_mat': [[4.082347574975881, -4.076131...",na,0.0,...,106.2,75.36,12.0,0.116279,0.093023,0.465116,0.325581,False,0.273288,0.038523
75991,JVASP-156007,139,I4/mmm,TmSi2Tc2,-0.54853,OptB88vdW,0.000,"{'lattice_mat': [[2.90400678672412, -2.9037689...",na,0.0,...,207.2,15.84,194.0,0.270270,0.108108,0.270270,0.351351,False,0.100238,0.016038


In [30]:
col_list =  [col for col in X.columns if col not in df_feature.columns]
col_list

['Tc_supercon']

In [3]:
pd.read_csv('X_dft_3d_all.csv', index_col=0)

  pd.read_csv('X_dft_3d_all.csv', index_col=0)


Unnamed: 0,jid,spg_number,spg_symbol,formula,formation_energy_peratom,func,optb88vdw_bandgap,atoms,slme,magmom_oszicar,...,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,frac s valence electrons,frac p valence electrons,frac d valence electrons,frac f valence electrons,compound possible,max ionic char,avg ionic char
0,JVASP-90856,129,P4/nmm,TiCuSiAs,-0.42762,OptB88vdW,0.000,"{'lattice_mat': [[3.566933224304235, 0.0, -0.0...",na,0.0,...,203.000000,23.000000,166.0,0.205882,0.147059,0.647059,0.000000,False,0.097332,0.012494
1,JVASP-86097,221,Pm-3m,DyB6,-0.41596,OptB88vdW,0.000,"{'lattice_mat': [[4.089078911208881, 0.0, 0.0]...",na,0.0,...,170.000000,6.857143,166.0,0.466667,0.200000,0.000000,0.333333,False,0.154731,0.018947
2,JVASP-64906,119,I-4m2,Be2OsRu,0.04847,OptB88vdW,0.000,"{'lattice_mat': [[-1.833590720595598, 1.833590...",na,0.0,...,194.000000,0.000000,194.0,0.205882,0.000000,0.382353,0.411765,False,0.094461,0.023615
3,JVASP-98225,14,P2_1/c,KBi,-0.44140,OptB88vdW,0.472,"{'lattice_mat': [[7.2963518353359165, 0.0, 0.0...",na,0.0,...,120.500000,108.500000,12.0,0.100000,0.100000,0.333333,0.466667,False,0.302324,0.075581
4,JVASP-10,164,P-3m1,VSe2,-0.71026,OptB88vdW,0.000,"{'lattice_mat': [[1.6777483798834445, -2.90594...",na,0.0,...,85.666667,95.555556,14.0,0.162162,0.216216,0.621622,0.000000,False,0.190712,0.042380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75988,JVASP-156020,139,I4/mmm,AcRh2Pb2,-0.30652,OptB88vdW,0.000,"{'lattice_mat': [[-2.374509402119681, 2.374509...",na,0.0,...,225.000000,0.000000,225.0,0.103896,0.051948,0.480519,0.363636,False,0.314924,0.048812
75989,JVASP-156398,216,F-43m,PrTlZn,-0.34112,OptB88vdW,0.000,"{'lattice_mat': [[-0.0, 3.4210598347774503, 3....",na,0.0,...,194.000000,0.000000,194.0,0.136364,0.022727,0.454545,0.386364,False,0.065366,0.013761
75990,JVASP-156099,139,I4/mmm,BaIn2Bi2,-0.39352,OptB88vdW,0.000,"{'lattice_mat': [[4.082347574975881, -4.076131...",na,0.0,...,106.200000,75.360000,12.0,0.116279,0.093023,0.465116,0.325581,False,0.273288,0.038523
75991,JVASP-156007,139,I4/mmm,TmSi2Tc2,-0.54853,OptB88vdW,0.000,"{'lattice_mat': [[2.90400678672412, -2.9037689...",na,0.0,...,207.200000,15.840000,194.0,0.270270,0.108108,0.270270,0.351351,False,0.100238,0.016038
