gamma

In [5]:
# -*- coding: utf-8 -*-
"""
Created on Tue Oct  3 22:53:01 2023

@author: hkpen
"""
import numpy as np
import json
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import hdbscan

# Scratch dictionary; no cell visible in this part of the notebook reads it.
D = {}

# Parallel per-material columns: entry i of every list belongs to the same
# database record. All of them start empty and are appended to while the
# database file is read.

# Identity and composition
mat_id_list = []
formulae_list = []
formula_anonymous_list = []
elements_list = []
nelements_list = []

# Symmetry
sg_number_list = []
crystal_list = []
point_group_list = []

# Properties and provenance
band_gap_list = []
discovery_route_list = []
exfoliation_energy_list = []
decomposition_energy_list = []

Open the database file (`db.json`, one JSON record per line) and read each record's fields into the lists defined above.

In [2]:
# Read db.json as JSON Lines (one JSON object per line) and append every
# record's fields to the parallel per-material lists.
with open('db.json') as f:
    for jsonobj in f:
        # A blank line (for example a trailing newline at the end of the file)
        # is not a valid JSON document and would make json.loads raise.
        if not jsonobj.strip():
            continue
        strDict = json.loads(jsonobj)

        # Required fields: a missing key raises KeyError, as before.
        mat_id_list.append(strDict['material_id'])
        sg_number_list.append(strDict['sg_number'])
        formulae_list.append(strDict['formula_pretty'])
        band_gap_list.append(strDict['bandgap'])
        nelements_list.append(strDict['nelements'])
        elements_list.append(strDict['elements'])
        discovery_route_list.append(strDict['discovery_process'])
        formula_anonymous_list.append(strDict['formula_anonymous'])

        # Optional fields: store NaN when absent so that every list keeps
        # exactly one entry per record and the lists stay index-aligned.
        exfoliation_energy_list.append(
            strDict.get('exfoliation_energy_per_atom', float("nan"))
        )
        decomposition_energy_list.append(
            strDict.get('decomposition_energy', float("nan"))
        )

        crystal_list.append(strDict['spacegroup']['crystal_system'])
        point_group_list.append(strDict['spacegroup']['point_group'])

In [3]:
def calc_gamma_fingerprint(bands_dict, n_bins=32, e_range=(-4, 4)):
    """Histogram the band energies at the first k-point, relative to the Fermi level.

    Parameters
    ----------
    bands_dict : dict
        Must provide ``bands_dict["bands"]["1"]``, a 2-D array-like of band
        energies, and ``bands_dict["efermi"]``, the value subtracted from
        every energy.
        NOTE(review): the array is presumably laid out as (band, k-point)
        with k-point index 0 being gamma, and key "1" presumably selects one
        spin channel -- confirm against the band-structure file format.
    n_bins : int, optional
        Number of equal-width histogram bins (default 32).
    e_range : sequence of two numbers, optional
        Lower and upper energy limits of the histogram, on the shifted
        (energy minus ``efermi``) scale. Default ``(-4, 4)``.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_bins``: the count in each bin divided by the
        total number of energies in column 0. Energies outside ``e_range``
        still count towards that total, so the result need not sum to 1.
    """
    # Shift every energy so that the Fermi level sits at zero.
    band_energies = np.array(bands_dict["bands"]["1"]) - bands_dict["efermi"]
    # Column 0 holds one energy per row of the band array.
    band_energies_gamma = band_energies[:, 0]
    # Only the counts are needed; the bin edges are discarded.
    counts, _ = np.histogram(band_energies_gamma, bins=n_bins, range=e_range)
    return counts / len(band_energies_gamma)

Read the band-structure file for each material and compute its gamma fingerprint.

In [None]:
# Number of histogram bins used for every fingerprint.
fingerprint_length = 32
fingerprints = []

print(len(mat_id_list))

# Directory holding one "<material_id>.json" band-structure file per material.
BS_dir = '../FULL_MATPEDIA_DATA/bands/'

for name in mat_id_list:
    filename_bands = BS_dir + name + '.json'
    if not os.path.isfile(filename_bands):
        # Materials without a band-structure file are skipped, so when any
        # file is missing the rows of `fingerprints` no longer line up
        # one-to-one with mat_id_list.
        print("No such file %s" % filename_bands)
        continue

    # Use a context manager so each file handle is closed straight away
    # instead of being left open for every material.
    with open(filename_bands) as bands_file:
        bands_dict = json.load(bands_file)

    fingerprints.append(calc_gamma_fingerprint(bands_dict, n_bins=fingerprint_length))

# Report the shape of what was just computed. The previous expression used
# `loaded_fingerprints`, which is only defined by a later cell and therefore
# fails on a fresh kernel.
np.shape(fingerprints)

In [5]:
# Write the computed fingerprints to a comma-separated text file so that
# they can be reloaded without repeating the band-structure pass.
np.savetxt(
    fname="gamma_fingerprints.csv",
    X=fingerprints,
    delimiter=",",
)

In [3]:
# Read the comma-separated fingerprint file back into a NumPy array and
# display it as the cell's output.
loaded_fingerprints = np.genfromtxt(
    fname="gamma_fingerprints.csv",
    delimiter=",",
)
loaded_fingerprints

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.03125   ],
       [0.        , 0.        , 0.        , ..., 0.01136364, 0.        ,
        0.01136364],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.03125   ,
        0.        ],
       [0.        , 0.        , 0.0625    , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.03125   ]])

## Clustering with HDBSCAN

In [14]:
# Create an HDBSCAN clusterer with the library defaults (no arguments passed).
clusterer = hdbscan.HDBSCAN()

In [16]:
# Fit the clusterer on the fingerprint array reloaded from the CSV file;
# the resulting per-row assignments are read from clusterer.labels_ below.
clusterer.fit(loaded_fingerprints)

In [25]:
# Display the per-row labels alongside each distinct label and how many rows
# carry it.
# NOTE(review): label -1 is presumably HDBSCAN's "noise" (unclustered)
# label -- confirm in the hdbscan documentation.
clusterer.labels_, np.unique(clusterer.labels_, return_counts=True)

(array([-1, -1,  2, ..., -1, -1, -1]),
 (array([-1,  0,  1,  2]), array([4031,    5,    6, 1237])))

### Trying a number of metrics

In [26]:
# Show the mapping from metric name to the hdbscan distance class it uses;
# several names are aliases of the same class.
hdbscan.dist_metrics.METRIC_MAPPING

{'euclidean': hdbscan.dist_metrics.EuclideanDistance,
 'l2': hdbscan.dist_metrics.EuclideanDistance,
 'minkowski': hdbscan.dist_metrics.MinkowskiDistance,
 'p': hdbscan.dist_metrics.MinkowskiDistance,
 'manhattan': hdbscan.dist_metrics.ManhattanDistance,
 'cityblock': hdbscan.dist_metrics.ManhattanDistance,
 'l1': hdbscan.dist_metrics.ManhattanDistance,
 'chebyshev': hdbscan.dist_metrics.ChebyshevDistance,
 'infinity': hdbscan.dist_metrics.ChebyshevDistance,
 'seuclidean': hdbscan.dist_metrics.SEuclideanDistance,
 'mahalanobis': hdbscan.dist_metrics.MahalanobisDistance,
 'wminkowski': hdbscan.dist_metrics.WMinkowskiDistance,
 'hamming': hdbscan.dist_metrics.HammingDistance,
 'canberra': hdbscan.dist_metrics.CanberraDistance,
 'braycurtis': hdbscan.dist_metrics.BrayCurtisDistance,
 'matching': hdbscan.dist_metrics.MatchingDistance,
 'jaccard': hdbscan.dist_metrics.JaccardDistance,
 'dice': hdbscan.dist_metrics.DiceDistance,
 'kulsinski': hdbscan.dist_metrics.KulsinskiDistance,
 'rogerst

In [41]:
# Metric names left out of the sweep below.
# NOTE(review): presumably these need extra arguments or do not suit this
# data -- confirm why each one is excluded.
awkward_metrics = [
    "minkowski",
    "seuclidean",
    "mahalanobis",
    "wminkowski",
    "kulsinski",
    "haversine",
    "cosine",
    "arccos",
    "pyfunc",
]

# Refit HDBSCAN once per remaining metric name and print the distinct labels
# with their counts. Note that `clusterer` is rebound on every iteration, so
# afterwards it refers to the model fitted with the last metric tried.
for metric in hdbscan.dist_metrics.METRIC_MAPPING:
    if metric not in awkward_metrics:
        print(metric)
        clusterer = hdbscan.HDBSCAN(metric=metric)
        clusterer.fit(loaded_fingerprints)
        print(np.unique(clusterer.labels_, return_counts=True))
        print(" ")

euclidean
(array([-1,  0,  1,  2]), array([4031,    5,    6, 1237]))
 
l2
(array([-1,  0,  1,  2]), array([4031,    5,    6, 1237]))
 
p
(array([-1,  0,  1,  2]), array([4031,    5,    6, 1237]))
 
manhattan
(array([-1,  0,  1]), array([4645,    7,  627]))
 
cityblock
(array([-1,  0,  1]), array([4645,    7,  627]))
 
l1
(array([-1,  0,  1]), array([4645,    7,  627]))
 
chebyshev
(array([-1,  0,  1,  2,  3,  4]), array([2305,    8,   11,   10,    5, 2940]))
 
infinity
(array([-1,  0,  1,  2,  3,  4]), array([2305,    8,   11,   10,    5, 2940]))
 
hamming
(array([-1,  0,  1,  2]), array([ 470,   21, 4782,    6]))
 
canberra
(array([-1,  0,  1]), array([1273,   31, 3975]))
 
braycurtis
(array([-1,  0,  1,  2,  3,  4]), array([3798,    6,   10,    7,   12, 1446]))
 
matching
(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 4