In [25]:
import pandas as pd
import numpy as np
from ase import Atoms
from ase.neighborlist import NeighborList
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
import ast

# Load data
data = pd.read_csv("../Data/1_MatDX/MatDX_nomad_Ef_Ternary.csv")

# Report the total number of entries in the dataset
total_data = len(data)
print(f"Total number of entries in the dataset: {total_data}")

# Filter out rows where 'data' field is empty (i.e., contains 'data': [])
def is_valid_structure(structure_str):
    try:
        structure = ast.literal_eval(structure_str)[0]  # Safely parse as dictionary
        return bool(structure['data'])  # True if 'data' list is not empty
    except (ValueError, KeyError, IndexError):
        return False

# Count and report the number of entries with empty data
num_empty_data = len(data) - data['structure'].apply(is_valid_structure).sum()
print(f"Number of entries with empty data: {num_empty_data}")

# Apply filter to keep only rows with valid atomic data
filtered_data = data[data['structure'].apply(is_valid_structure)].reset_index(drop=True)

# # Define the elements of interest
# target_elements = {'Cu', 'Ag', 'Au'}

# # Function to check if structure contains only the target elements
# def contains_target_elements(symbols):
#     unique_elements = set(symbols)
#     return unique_elements.issubset(target_elements) and bool(unique_elements.intersection(target_elements))

# # Filter the dataset for Cu, Ag, Au, or their alloys
# filtered_data = filtered_data[filtered_data['structure'].apply(lambda s: contains_target_elements(
#     [atom['element'] for atom in ast.literal_eval(s)[0]['data']['atoms']]))].reset_index(drop=True)

# Function to compute RDF for a structure manually
def compute_rdf(positions, symbols, cell, rmax=10.0, nbins=100):
    # Create ASE Atoms object with provided lattice cell
    structure = Atoms(symbols=symbols, positions=positions, cell=cell, pbc=[True, True, True])

    # Create neighbor list with cutoff distance
    cutoffs = [rmax / 2.0] * len(structure)
    neighbor_list = NeighborList(cutoffs, self_interaction=False, bothways=True)
    neighbor_list.update(structure)

    # Calculate pairwise distances for RDF
    distances = []
    for i in range(len(structure)):
        indices, offsets = neighbor_list.get_neighbors(i)
        for j, offset in zip(indices, offsets):
            distance = structure.get_distance(i, j, mic=True)
            if distance <= rmax:
                distances.append(distance)

    # Compute RDF histogram
    hist, bin_edges = np.histogram(distances, bins=nbins, range=(0, rmax), density=True)
    return hist

# Parse atomic positions, symbols, and lattice cell from 'structure' column
rdf_data = []
for index, row in filtered_data.iterrows():
    # Extract atomic positions, elements, and lattice parameters
    structure_data = eval(row['structure'])[0]['data']
    positions = np.array([[atom['x'], atom['y'], atom['z']] for atom in structure_data['atoms']]) * 1e10  # Convert to angstroms
    symbols = [atom['element'] for atom in structure_data['atoms']]
    cell = np.array([structure_data['a'], structure_data['b'], structure_data['c']]) * 1e10  # Convert to angstroms

    # Compute RDF for the structure
    rdf_values = compute_rdf(positions, symbols, cell=cell)
    rdf_data.append(rdf_values)
    

rdf_data = np.array(rdf_data)  # Shape: (number_of_structures, nbins)

# Calculate cosine similarity matrix for RDF data
similarity_matrix = cosine_similarity(rdf_data)

# Convert cosine similarity matrix to distance matrix for DBSCAN
distance_matrix = 1 - similarity_matrix

# Replace any remaining negative values with 2 (maximum distance)
distance_matrix[distance_matrix < 0] = 2

# Apply DBSCAN with the modified distance matrix
clustering = DBSCAN(eps=0.05, min_samples=2, metric="precomputed").fit(distance_matrix)
labels = clustering.labels_

# Count and report the number of duplicate entries identified
num_duplicates = len(labels) - len(set(labels)) + (1 if -1 in labels else 0)
print(f"Number of duplicate entries identified: {num_duplicates}")

# Separate unique entries from duplicates
unique_indices = [i for i, label in enumerate(labels) if label == -1]
duplicates = [i for i, label in enumerate(labels) if label != -1]

# Filter the original data to remove duplicates
unique_data = filtered_data.iloc[unique_indices].reset_index(drop=True)

# Save the filtered data to a new CSV file
unique_data.to_csv("MatDX_Ef_ternary_filtered.csv", index=False)
print("Filtered entries with 'data': [] and duplicates removed. Unique data saved to MatDX_Ef_filtered.csv.")


Total number of entries in the dataset: 10000
Number of entries with empty data: 1916
Number of duplicate entries identified: 7585
Filtered entries with 'data': [] and duplicates removed. Unique data saved to MatDX_Ef_filtered.csv.


In [15]:
data = data[:100]

In [16]:
data

Unnamed: 0,formula,space_group,structure,id,formation_energy
0,Ni4Ta6,R-3c,"[{'data': {'a': [-6.910143e-10, 0, 0], 'b': [3...",PN_MCKVEQNTJFRUHOURCR2HCEB35KDFRETL,{'reference': {'Ni': 'https://nomad-lab.eu/pro...
1,Mn4Sb2,I4_1/amd,"[{'data': {'a': [-1.87778212e-10, 1.87604249e-...",PN_FGGHB2NFWUCYD3YZO4FWN3QVOKCTVI6K,{'reference': {'Mn': 'https://nomad-lab.eu/pro...
2,Ir2Na2,Imma,"[{'data': {'a': [-1.36776653e-10, 2.60817299e-...",PN_BAO7ZSBZZCMA7PH7QMR7WLDI72GOELND,{'reference': {'Na': 'https://nomad-lab.eu/pro...
3,MoSm,R-3m,"[{'data': {'a': [1.65820251e-10, 9.57363439999...",PN_4YR2JLVZHJLC3F2TYPTOVDBJJ52C6XIE,{'reference': {'Mo': 'https://nomad-lab.eu/pro...
4,Sn17P12,I-43m,"[{'data': {'a': [-5.563178910000001e-10, 5.563...",PN_H6P36AVAKDU244QNQGGCJ7YLC4A4F2YE,{'reference': {'P': 'https://nomad-lab.eu/prod...
...,...,...,...,...,...
95,Ag3Al,P4/mmm,"[{'data': {'a': [3.0239676300000004e-10, 0, 0]...",PN_DO2C3NJ47ORSHA2REQ3WZJ7TUA7NXHL6,{'reference': {'Ag': 'https://nomad-lab.eu/pro...
96,Cr3Sn,P4/mmm,"[{'data': {'a': [2.93442998e-10, -2.9606323099...",PN_KKKYHJI7BWXXKPIBAKLXP4SIIHYKGGDA,{'reference': {'Sn': 'https://nomad-lab.eu/pro...
97,Al2Pd2,P4/nmm,"[{'data': {'a': [3.3426268100000003e-10, 0, 0]...",PN_IRBO3ZKFSNX46WXZLM3JXKINWAGP7BTV,{'reference': {'Pd': 'https://nomad-lab.eu/pro...
98,Ge3Br,P4/mmm,"[{'data': {'a': [3.34953296e-10, 0, 0], 'b': [...",PN_JGSSLQFKN6EAPETSAPWORLCLL76YIPGE,{'reference': {'Br': 'https://nomad-lab.eu/pro...


In [None]:
def get_properties(row):

    
    data = ast.literal_eval(row['structure'])
    
    lattice_data = data[0]['data']
    
    lat_a, lat_b, lat_c = lattice_data['a'], lattice_data['b'], lattice_data['c']
    
    volume = abs(np.dot(lat_a, np.cross(lat_b, lat_c))) * 1e30
    e_f = ast.literal_eval(row['formation_energy'])['value']
    
    atoms_data = lattice_data['atoms']

    # Create ASE Atoms object
    atoms = Atoms(
        symbols=[atom["element"] for atom in atoms_data],
        positions=[[atom["x"], atom["y"], atom["z"]] for atom in atoms_data],
        cell=[lat_a, lat_b, lat_c],
        pbc=True
    )

    # print(atoms.get_positions())
    atoms.set_cell(atoms.cell * 1e10)
    atoms.set_positions(atoms.positions * 1e10)
    # print(atoms.get_positions())

    atoms.calc = calc

    sg = SpaceGroup(row['space_group']).int_number
    
    return (volume, len(lattice_data['atoms']), sg, e_f, atoms.get_potential_energy())