Import the necessary libraries.

In [None]:
import json
import os
import shutil
# `str` is a builtin and is not exported by `typing`; importing it from there raises ImportError.
from typing import Dict, Tuple

import pandas as pd
from pymatgen.core import Structure, Lattice
from pymatgen.entries.computed_entries import ComputedStructureEntry
from tqdm import tqdm

Next, each dataset is converted from its source files into the format required here.

## Alexandria

---

In [None]:
def move_files(source_directory: str, destination_directory: str) -> None:
    """
    Move every file found under the source tree directly into the destination directory.

    The source directory is walked recursively; files are placed flat in the
    destination (sub-directory structure is not recreated) and the source
    directories themselves are left in place. The destination directory is
    created first if it is missing. One line is printed per moved file.

    :param source_directory: Root of the directory tree whose files are moved.
    :param destination_directory: Directory that receives the files.
    """
    os.makedirs(destination_directory, exist_ok=True)

    # Lazily pair each file with its flattened target path; laziness keeps the
    # walk and the moves interleaved exactly as in a nested loop.
    planned_moves = (
        (os.path.join(folder, name), os.path.join(destination_directory, name))
        for folder, _subfolders, names in os.walk(source_directory)
        for name in names
    )

    for origin, target in planned_moves:
        shutil.move(origin, target)
        print(f"Moved file: {origin} -> {target}")

# Flatten the download: every file found anywhere under alexandria\1 is moved
# directly into its parent folder alexandria\ by move_files.
source_directory = 'alexandria\\1'
destination_directory = 'alexandria\\'

move_files(source_directory, destination_directory)

In [None]:
def open_json(path: str):
    """
    Load a JSON file and return the value stored under its top-level 'entries' key.

    The file is decoded explicitly as UTF-8 so that reading does not depend on
    the platform's default locale encoding.

    :param path: Path to the JSON file.
    :return: The object found under the 'entries' key of the parsed document.
    :raises KeyError: If the document has no 'entries' key.
    """
    with open(path, encoding="utf-8") as file:
        data = json.load(file)
    return data['entries']

In [None]:
def from_entry_to_cif_energy(data: Dict) -> Tuple[str, Dict]:
    """
    Convert one serialized entry into a CIF string plus the entry's extra data.

    :param data: A dictionary describing one entry. Keys read by this function:
                 - 'structure': dict with 'lattice' (containing 'matrix') and 'sites';
                   each site has 'species' (list of dicts with 'element') and 'xyz'.
                 - 'energy': The energy value of the structure.
                 - 'correction': Energy correction value.
                 - 'entry_id': Unique identifier for the entry.
                 - 'parameters': Additional parameters.
                 - 'composition': Composition of the structure.
                 - 'data': Additional data.
    :return: A tuple containing:
             - CIF string representation of the structure.
             - The ``data`` attribute of the ComputedStructureEntry built from the input
               (populated from the 'data' key).
    :raises KeyError: If any of the keys listed above is missing.
    """
    structure_dict = data['structure']
    lattice = Lattice(structure_dict['lattice']['matrix'])

    # One atom is emitted per species listed on a site, all sharing the site's position.
    # NOTE(review): occupancies are not read, so a partially occupied site becomes
    # several fully occupied atoms at the same position — confirm this is intended.
    species = []
    coords = []
    for site in structure_dict['sites']:
        for specie in site['species']:
            species.append(specie['element'])
            coords.append(site['xyz'])

    # In pymatgen's site dictionaries 'xyz' holds Cartesian coordinates ('abc' holds the
    # fractional ones). Structure interprets coords as fractional unless told otherwise,
    # so the flag is required to place the atoms correctly.
    structure = Structure(lattice, species, coords, coords_are_cartesian=True)

    # Wrap structure and energy metadata in a ComputedStructureEntry; only its `data`
    # attribute is returned to the caller.
    entry = ComputedStructureEntry(
        structure=structure,
        energy=data['energy'],
        correction=data['correction'],
        entry_id=data['entry_id'],
        parameters=data['parameters'],
        composition=data['composition'],
        data=data['data']
    )

    cif_string = structure.to(fmt='cif')

    return cif_string, entry.data

In [None]:
def create_dataframe(files: list, path_to_callback: str):
    """
    Convert every entry of the given JSON files and collect the results in one DataFrame.

    For each input file a checkpoint ("callback") CSV holding only that file's rows is
    written, so partial results survive if a later file fails.

    :param files: Paths of the JSON files to process (each is read with ``open_json``).
    :param path_to_callback: Prefix prepended to ``<input file stem>.csv`` to form the
                             checkpoint path; pass a directory path ending with a
                             separator to write into that directory.
    :return: DataFrame with one row per entry: the columns derived from each entry's
             data dictionary followed by a "cif" column.
    """
    cifs_for_dataframe = []
    data_for_dataframe = []
    for file in tqdm(files):
        cifs_for_callback = []
        data_for_callback = []

        for entry in open_json(file):
            cif, entry_data = from_entry_to_cif_energy(entry)
            cifs_for_callback.append(cif)
            data_for_callback.append(entry_data)

        callback_df = pd.concat(
            [pd.DataFrame(data_for_callback), pd.DataFrame({"cif": cifs_for_callback})],
            axis=1,
        )

        # Strip the real extension instead of assuming a 5-character ".json" suffix.
        stem = os.path.splitext(os.path.basename(file))[0]
        callback_path = path_to_callback + stem + ".csv"

        # Make sure the checkpoint directory exists before writing into it.
        callback_dir = os.path.dirname(callback_path)
        if callback_dir:
            os.makedirs(callback_dir, exist_ok=True)
        callback_df.to_csv(callback_path)

        data_for_dataframe.extend(data_for_callback)
        cifs_for_dataframe.extend(cifs_for_callback)

    df_entry_data = pd.DataFrame(data_for_dataframe)
    df_cif = pd.DataFrame({"cif": cifs_for_dataframe})

    final_df = pd.concat([df_entry_data, df_cif], axis=1)

    return final_df
        

In [None]:
# Collect the full paths of all regular files located directly inside the jsons folder.
directory_path = 'alexandria\\jsons\\'

candidate_paths = (os.path.join(directory_path, name) for name in os.listdir(directory_path))
files = list(filter(os.path.isfile, candidate_paths))

In [None]:
# Backslashes are escaped explicitly: "\c" is an invalid escape sequence that newer
# Python versions warn about. The resulting path value is unchanged.
# NOTE(review): the input path starts with a separator (drive-root relative on Windows)
# while every other path in this notebook is relative, and the `files` list built in
# the previous cell is not used here — confirm both are intended.
alexandria_dataframe = create_dataframe(
    ["\\alexandria\\convex_hull_pbe.json"], "alexandria\\callbacks\\"
)

In [None]:
# Merge every per-file checkpoint CSV from the callbacks folder into a single CSV.
# The backslash is escaped explicitly ("\c" is an invalid escape sequence); the path
# value itself is unchanged.
directory_path = 'alexandria\\callbacks'
output_file_path = 'alexandria\\alexandria_full.csv'

# os.listdir returns names in arbitrary order; sorting makes the row order of the
# combined file reproducible between runs.
csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

dataframes = []

for csv_file in csv_files:
    file_path = os.path.join(directory_path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.to_csv(output_file_path, index=False)

print(f"Все файлы объединены в: {output_file_path}")

In [None]:
data = pd.read_csv("alexandria\\alexandria_full.csv")

In [None]:
len(data.formula.unique())

In [None]:
alex_pbe = pd.read_csv("alexandria/convex_hull_pbe.csv")

In [None]:
len(alex_pbe.formula.unique())

In [None]:
alex_pbe.columns

## Jarvis

In [None]:
# Presumably fetches the JARVIS dataset named 'dft_3d' through jarvis-tools — TODO confirm.
# NOTE: this import rebinds the notebook-level name `data`, which an earlier cell
# used for the Alexandria DataFrame.
from jarvis.db.figshare import data

dft_3d = data('dft_3d')

In [None]:
df = pd.DataFrame(dft_3d)

In [None]:
def create_structure_from_data(data):
    """
    Build a pymatgen Structure from a dictionary of lattice and site data.

    :param data: dict with the keys 'lattice_mat' (lattice matrix), 'coords'
                 (one coordinate triple per atom) and 'elements' (one species per atom).
                 An optional 'cartesian' key states whether 'coords' are Cartesian;
                 when it is absent the coordinates are treated as fractional.
    :return: pymatgen Structure
    """
    lattice = Lattice(data['lattice_mat'])

    # Honour the coordinate convention declared by the record instead of always
    # assuming fractional coordinates.
    coords_are_cartesian = bool(data.get('cartesian', False))

    structure = Structure(
        lattice=lattice,
        species=data['elements'],
        coords=data['coords'],
        coords_are_cartesian=coords_are_cartesian,
    )

    return structure

In [None]:
# Convert every JARVIS 'atoms' record into a CIF string.
# `.to(fmt='cif')` is used here, the same call the Alexandria conversion uses, rather
# than the file-oriented `to_file` alias; it returns the CIF text without a filename.
atoms = df['atoms']
cifs_for_dataset = [
    create_structure_from_data(atom).to(fmt='cif') for atom in tqdm(atoms)
]

In [None]:
# Attach the CIF strings as a "cif" column next to the original JARVIS columns
# (pd.concat with axis=1 aligns the two frames on their index labels) and save.
# to_csv is called without index=False, so the index is written as an extra
# unnamed first column.
df_cif = pd.DataFrame({"cif": cifs_for_dataset})
final_df = pd.concat([df, df_cif], axis=1)
final_df.to_csv('Jarvis/jarvis.csv')

In [None]:
# Read back the file with the same path literal it was written with; the forward
# slash works on Windows and POSIX alike, the backslash form only on Windows.
jarvis = pd.read_csv("Jarvis/jarvis.csv")

In [None]:
jarvis

In [None]:
jarvis.columns

## AFlow

In [None]:
import json
from urllib.request import urlopen
from urllib.error import URLError, HTTPError

# AFLUX query: base endpoint + matchbook (what to select) + directive (paging).
API = 'http://aflow.org/API/aflux/'
MATCHBOOK = 'species(Metals)'
DIRECTIVE = '$paging(1)'
REQUEST = API + '?' + MATCHBOOK + ',' + DIRECTIVE

# Bind the result name up front so later cells can inspect it (as None) even when
# the request or the decoding fails, instead of hitting a NameError.
response_json = None

try:
    # A timeout keeps the notebook from hanging indefinitely on an unresponsive server.
    with urlopen(REQUEST, timeout=30) as response:
        data = response.read().decode('utf-8')
        if data:
            response_json = json.loads(data)
            print(response_json)
        else:
            print("Пустой ответ от сервера")
# HTTPError is a subclass of URLError, so it must be handled first.
except HTTPError as e:
    print(f"HTTP ошибка: {e.code} - {e.reason}")
except URLError as e:
    print(f"Ошибка URL: {e.reason}")
except json.JSONDecodeError as e:
    print(f"Ошибка декодирования JSON: {e}")

In [None]:
response_json

## perov_5

In [None]:
perov = pd.read_csv("perov_5\\perov_5_file.csv")
perov

## carbon_24

In [None]:
carbon = pd.read_csv("carbon_24\\carbon_24_file.csv")
carbon

In [None]:
print(carbon.iloc[0].cif)

## mp_20

In [None]:
mp = pd.read_csv("mp_20\\mp20_file.csv")
mp

In [None]:
len(set(mp.material_id) & set(jarvis.reference))

## mpts_52

In [None]:
mpts = pd.read_csv("mpts_52\\mpts_52.csv")
mpts

In [None]:
len(set(mp.material_id) | set(mpts.material_id) | set(jarvis.reference))

## Merger

Let's merge all the valid datasets into one large dataset with a common set of columns (material_id, cif). The datasets are: alexandria_pbe, carbon_24, Jarvis, mp_20, mpts_52, perov_5.

In [None]:
# Load every source dataset that takes part in the merge.
# NOTE(review): these paths are relative to ../data and some file names differ from
# the ones read in the exploratory cells earlier in this notebook (e.g. mp_20.csv
# here vs mp20_file.csv there) — confirm which files are the canonical inputs.
jarvis = pd.read_csv("../data/Jarvis/jarvis.csv")
mpts = pd.read_csv("../data/mpts_52/mpts_52.csv")
mp = pd.read_csv("../data/mp_20/mp_20.csv")
carbon = pd.read_csv("../data/carbon_24/carbon_24.csv")
alex_pbe = pd.read_csv("../data/alexandria/alexandria_pbe.csv")
perov = pd.read_csv("../data/perov_5/perov_5.csv")

### jarvis -> jarvis_cry

In [None]:
jarvis_cry = jarvis[[ "reference", "cif"]]

In [None]:
jarvis_cry.columns = ["material_id", "cif"]

In [None]:
jarvis_cry

Jarvis contains three kinds of entries, distinguished by the prefix of their identifier:

- Materials Project entries, prefixed with 'mp-'
- AFLOW entries, prefixed with 'auid'
- Entries unique to Jarvis, which have no prefix

### mpts_52 -> mpts_cry

In [None]:
mpts_cry = mpts[["material_id", "cif"]]

In [None]:
mpts_cry

mpts contains crystals that consist of a single atom; this needs to be fixed.

### mp_20 -> mp_cry

In [None]:
mp_cry = mp[['material_id', "cif"]]
mp_cry

### carbon_24 -> carbon_cry

In [None]:
carbon_cry = carbon.drop(["energy_per_atom", 'Unnamed: 0'], axis=1)

In [None]:
carbon_cry

### alexandria_pbe -> alex_pbe_cry

In [None]:
alex_pbe_cry = alex_pbe[["mat_id", "cif"]]

In [None]:
alex_pbe_cry.columns = ["material_id", "cif"]

In [None]:
alex_pbe_cry

### perov_5 -> perov_cry

In [None]:
perov_cry = perov[["material_id", "cif"]]

To distinguish perov_5 structures from Jarvis ones, let's add the prefix 'p-' to the perov_5 material IDs.

In [None]:
# Build a new frame rather than assigning into `perov_cry` in place: it was obtained
# by selecting columns from `perov`, and in-place column assignment on such a
# selection can trigger pandas' SettingWithCopyWarning.
perov_cry = perov_cry.assign(material_id='p-' + perov_cry['material_id'].astype(str))

In [None]:
perov_cry

### concat

In [None]:
genCry = pd.concat([mp_cry, mpts_cry, jarvis_cry, alex_pbe_cry, carbon_cry, perov_cry], ignore_index=True)

Jarvis, mp_20 and mpts_52 all contain crystals from the Materials Project, so duplicates of these structures need to be dropped.

In [None]:
# drop_duplicates() without `subset` compares whole rows, i.e. both material_id and
# the CIF text must match for a row to be removed.
# NOTE(review): if the same material_id appears with differently formatted CIF text
# in several sources, those rows survive — confirm whether subset="material_id" is
# what is wanted here.
genCry = genCry.drop_duplicates()

In [None]:
# Renumber the rows 0..n-1 after de-duplication; drop=True discards the old index
# directly instead of materialising it as an "index" column and dropping it again.
genCry = genCry.reset_index(drop=True)
genCry

In [None]:
genCry.to_csv("genCry_dataset/genCry.csv")

## Val

In [None]:
# Read back the file with the same path literal it was written with; the forward
# slash works on Windows and POSIX alike, the backslash form only on Windows.
genCry = pd.read_csv("genCry_dataset/genCry.csv")

mpts_cry contains some structures with only 1-2 atoms, which need to be dropped; in addition, some CIFs may be broken.

In [None]:
from pymatgen.core.structure import Structure
from pymatgen.io.cif import CifParser
from io import StringIO
from typing import List, Optional

def filter_structures_from_dataframe(
    dataframe: pd.DataFrame,
    cif_column: str = "cif",
    material_id_column: str = "material_id",
    num_of_atoms: int = 2,
    output_file: Optional[str] = None
) -> pd.DataFrame:
    """
    Keep only the rows whose CIF parses and contains more than ``num_of_atoms`` sites.

    Rows whose CIF cannot be parsed are reported with ``print`` and left out of the result.

    :param dataframe: Input DataFrame containing CIF strings and material IDs.
    :param cif_column: Name of the column in the DataFrame that contains the CIF strings. Default is "cif".
    :param material_id_column: Name of the column in the DataFrame that contains the material IDs. Default is "material_id".
    :param num_of_atoms: Exclusive lower bound on the site count: a structure is kept only if it has
                         strictly more sites than this value. Default is 2.
    :param output_file: Path to the file where the filtered DataFrame will be saved. If None, the file is not saved.
    :return: A new DataFrame with the material ID and CIF columns of the rows that passed the
             filter. Both columns are present even when no row passes.
    """
    filtered_data = []

    for index, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
        mat_id = row[material_id_column]
        cif_string = row[cif_column]

        try:
            # CifParser is handed a file-like object wrapping the CIF text.
            # NOTE(review): the site count depends on which cell get_structures() returns
            # by default in the installed pymatgen (e.g. primitive vs. as written) — confirm.
            structure = CifParser(StringIO(cif_string)).get_structures()[0]
        except Exception as e:
            # Best effort: a broken CIF must not abort the whole pass, so report and skip it.
            print(f"Error processing row {index}: {e}")
            continue

        if len(structure) > num_of_atoms:
            filtered_data.append({
                material_id_column: mat_id,
                cif_column: cif_string
            })

    # Naming the columns explicitly keeps the schema stable when nothing passes the filter.
    filtered_df = pd.DataFrame(filtered_data, columns=[material_id_column, cif_column])

    if output_file:
        filtered_df.to_csv(output_file, index=False)
        print(f"Filtered data saved to file: {output_file}")

    return filtered_df

In [None]:
genCry_filtered = filter_structures_from_dataframe(genCry)

In [None]:
genCry_filtered.to_csv("genCry_dataset\\genCry_f.csv")

In [None]:
genCry_filtered