# JARVIS EDA

In [4]:
# Imports
import pandas as pd
from jarvis.db.figshare import data

### Checkpoint 1: Import 3D DFT Data from JARVIS and Pickel it


In [5]:
dft_data = data('dft_3d')
df = pd.DataFrame(dft_data)
df.to_pickle('jarvis_dft_3d.pkl')

Obtaining 3D dataset 76k ...
Reference:https://www.nature.com/articles/s41524-020-00440-1
Other versions:https://doi.org/10.6084/m9.figshare.6815699
Loading the zipfile...
Loading completed.


- Features for semi-conductivity
    - `optb88vdw_bandgap`, `mbj_bandgap`, and `hse_gap` -> band gaps
    - `ehull` and `formation_energy_peratom` -> thermodynamic stability
    - `effective_masses_300K`, `avg_elec_mass`, `avg_hole_mass` -> electron mobility
    - `epsx`, `epsy`, `epsz` -> dielectric properties
     

In [6]:
features = df.columns.tolist()
print(features)

['jid', 'spg_number', 'spg_symbol', 'formula', 'formation_energy_peratom', 'func', 'optb88vdw_bandgap', 'atoms', 'slme', 'magmom_oszicar', 'spillage', 'elastic_tensor', 'effective_masses_300K', 'kpoint_length_unit', 'maxdiff_mesh', 'maxdiff_bz', 'encut', 'optb88vdw_total_energy', 'epsx', 'epsy', 'epsz', 'mepsx', 'mepsy', 'mepsz', 'modes', 'magmom_outcar', 'max_efg', 'avg_elec_mass', 'avg_hole_mass', 'icsd', 'dfpt_piezo_max_eij', 'dfpt_piezo_max_dij', 'dfpt_piezo_max_dielectric', 'dfpt_piezo_max_dielectric_electronic', 'dfpt_piezo_max_dielectric_ionic', 'max_ir_mode', 'min_ir_mode', 'n-Seebeck', 'p-Seebeck', 'n-powerfact', 'p-powerfact', 'ncond', 'pcond', 'nkappa', 'pkappa', 'ehull', 'Tc_supercon', 'dimensionality', 'efg', 'xml_data_link', 'typ', 'exfoliation_energy', 'spg', 'crys', 'density', 'poisson', 'raw_files', 'nat', 'bulk_modulus_kv', 'shear_modulus_gv', 'mbj_bandgap', 'hse_gap', 'reference', 'search']


### Checkpoint 2: Filter Out Semi-Conductors and Transparent Semi-conductors, then store as CSV 


In [7]:
def has_element(atoms_dict, element='O'):
    atoms = Atoms.from_dict(atoms_dict)
    return element in atoms.elements

def filter_semiconductors(df, band_gap_min = 0.1, band_gap_method = 'optb88vdw_bandgap'):
    df[band_gap_method] = pd.to_numeric(df[band_gap_method], errors='coerce')
    df['ehull'] = pd.to_numeric(df['ehull'], errors='coerce')
    df = df.dropna(subset=[band_gap_method, 'ehull'])
    df['has_oxygen'] = df['atoms'].apply(lambda x: has_element(x, 'O'))
    semis = df[
        (df[band_gap_method] > band_gap_min) &
        (df['ehull'] <= 0.1) &
        (df['has_oxygen'] == True)
        ]
    return semis

In [9]:
import pandas as pd
import numpy as np
from jarvis.core.atoms import Atoms

# Read in pickled data
df = pd.read_pickle('jarvis_dft_3d.pkl')

# Filter Semi-Conductors Using 3 Different Bandgap Estimations
semis_vdw = filter_semiconductors(df, 0.1)
semis_mbj = filter_semiconductors(df, 0.1, "mbj_bandgap")
semis_hse = filter_semiconductors(df, 0.1, "hse_gap")

# Filter Transparent Semi-Conductors Using 3 Different Bandgap Estimations
transparent_semis_vdw = filter_semiconductors(df, 3.0)
transparent_semis_mbj = filter_semiconductors(df, 3.0, "mbj_bandgap")
transparent_semis_hse = filter_semiconductors(df, 3.0, "hse_gap")


semis_vdw.to_csv('semis_vdw.csv', index=False)
print(f"Saved {len(semis_vdw)} entries to semis_vdw.csv")
semis_mbj.to_csv('semis_vdw.mbj', index=False)
print(f"Saved {len(semis_mbj)} entries to semis_mbj.csv")
semis_hse.to_csv('semis_vdw.hse', index=False)
print(f"Saved {len(semis_hse)} entries to semis_hse.csv")

transparent_semis_vdw.to_csv('transparent_semis_vdw.csv', index=False)
print(f"Saved {len(transparent_semis_vdw)} entries to transparent_semis_vdw.csv")
transparent_semis_mbj.to_csv('transparent_semis_mbj.csv', index=False)
print(f"Saved {len(transparent_semis_mbj)} entries to transparent_semis_mbj.csv")
transparent_semis_hse.to_csv('transparent_semis_mbj.hse', index=False)
print(f"Saved {len(transparent_semis_hse)} entries to transparent_semis_hse.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['has_oxygen'] = df['atoms'].apply(lambda x: has_element(x, 'O'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['has_oxygen'] = df['atoms'].apply(lambda x: has_element(x, 'O'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['has_oxygen'] = df['atoms'].apply(lambda x: has_element(x, 'O'))
A

Saved 8520 entries to semis_vdw.csv
Saved 2828 entries to semis_mbj.csv
Saved 12 entries to semis_hse.csv
Saved 3185 entries to transparent_semis_vdw.csv
Saved 1879 entries to transparent_semis_mbj.csv
Saved 10 entries to transparent_semis_hse.csv
