In [6]:
import json
import pymatgen

import tensorflow as tf

from pymatgen.core import Structure
from megnet.models import MEGNetModel
from megnet.data.crystal import CrystalGraph
import pandas as pd
from pathlib import Path

In [2]:
def read_pymatgen_dict(file):
    with open(file, "r") as f:
        d = json.load(f)
    return Structure.from_dict(d)

In [18]:
df = pd.DataFrame()
final_data = []

list

In [16]:
dataset_path = Path("./data/dichalcogenides_public")
struct = {item.name.strip('.json'): read_pymatgen_dict(item) for item in (dataset_path/'structures').iterdir()}

In [21]:
final_data = []
for key in struct:
    name = key
    d1 = struct[key]
    df1 = Structure.as_dataframe(d1)
    data =[]
    data.append(name)
    for i in d1.lattice.abc:
        data.append(i)
    for i in d1.lattice.angles:
        data.append(i)
    data.append(d1.lattice.volume)
    for i in d1.lattice.angles:
        data.append(i)
    for i in range(len(df1)):
        for j in df1.columns:
            data.append(df1[j][i])
            
    final_data.append(data)


In [59]:
df = pd.DataFrame(final_data)


In [None]:
df.to_csv("./training.csv", sep = ";", encoding = 'utf8')

In [60]:
df.describe

<bound method NDFrame.describe of                           0          1          2          3     4     5     \
0     6141cf0f51c1cbd9654b8870  25.522526  25.522526  14.879004  90.0  90.0   
1     6141cf1051c1cbd9654b8872  25.522526  25.522526  14.879004  90.0  90.0   
2     6141cf11ae4fb853db2e3f14  25.522526  25.522526  14.879004  90.0  90.0   
3     6141cf11b842c2e72e2f2d48  25.522526  25.522526  14.879004  90.0  90.0   
4     6141cf11cc0e69a0cf28ab35  25.522526  25.522526  14.879004  90.0  90.0   
...                        ...        ...        ...        ...   ...   ...   
2961  6146d0b54e27a1844a5f0b02  25.522526  25.522526  14.879004  90.0  90.0   
2962  6146dd853ac25c70a5c6cdeb  25.522526  25.522526  14.879004  90.0  90.0   
2963  6146e9103ac25c70a5c6cded  25.522526  25.522526  14.879004  90.0  90.0   
2964  6146ecdb3ac25c70a5c6cdef  25.522526  25.522526  14.879004  90.0  90.0   
2965  6147d3de31cf3ef3d4a9f846  25.522526  25.522526  14.879004  90.0  90.0   

       6         

In [86]:
categorical_columns = [c for c in df.columns[1:] if df[c].dtype.name == 'object']
numerical_columns   = [c for c in df.columns if df[c].dtype.name != 'object']

## Filling "missed"* numerical data with -999999

#### *this data is not missed, in different files there is just different numbers of atoms, so we need to "tell" to our model that some elements have less atoms than others. We decided to fill null fields with -999999 value (0 would not be good to that reason)

In [87]:
for c in numerical_columns:
    df[c] = df[c].fillna(-999999)

In [88]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1345,1346,1347,1348,1349,1350,1351,1352,1353,1354
0,6141cf0f51c1cbd9654b8870,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,-999999.000000,-999999.000000,-999999.000000,no,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
1,6141cf1051c1cbd9654b8872,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,-999999.000000,-999999.000000,-999999.000000,no,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
2,6141cf11ae4fb853db2e3f14,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,12.761263,20.261226,5.284635,no,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
3,6141cf11b842c2e72e2f2d48,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,14.356421,17.498332,5.284635,(S),0.958333,0.916667,0.355174,12.761263,20.261226,5.284635
4,6141cf11cc0e69a0cf28ab35,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,12.761263,20.261226,5.284635,no,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2961,6146d0b54e27a1844a5f0b02,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,12.761263,20.261226,5.284635,no,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
2962,6146dd853ac25c70a5c6cdeb,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,-999999.000000,-999999.000000,-999999.000000,no,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
2963,6146e9103ac25c70a5c6cded,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,-999999.000000,-999999.000000,-999999.000000,no,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
2964,6146ecdb3ac25c70a5c6cdef,25.522526,25.522526,14.879004,90.0,90.0,120.0,8393.668022,90.0,90.0,...,-999999.000000,-999999.000000,-999999.000000,no,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000


In [89]:
df[1348][3]

Comp: S1

## Filling null categorical fields with "no" word (no such atom in an element)

In [90]:
df = df.fillna("no")

In [108]:
df_cat = df[categorical_columns].astype("str")
df_num = df[numerical_columns]
df_code = df[0]

In [109]:
df_cat.dtypes

11      object
18      object
25      object
32      object
39      object
         ...  
1320    object
1327    object
1334    object
1341    object
1348    object
Length: 192, dtype: object

## transforming every categorical column into some numerical with special function

In [110]:
df_cat = pd.get_dummies(df_cat)

In [111]:
df_cat

Unnamed: 0,11_Mo1,18_Mo1,25_Mo1,32_Mo1,39_Mo1,46_Mo1,53_Mo1,60_Mo1,67_Mo1,74_Mo1,...,1306_S1,1313_S1,1320_S1,1327_S1,1334_S1,1334_no,1341_S1,1341_no,1348_S1,1348_no
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,0,1,0,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,0,1,0,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,0,0,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,0,1,0
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2961,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,0,0,1
2962,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,1,0,1,0,1
2963,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,1,0,1,0,1
2964,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,1,0,1,0,1


In [113]:
df_final = pd.concat((df_code, df_cat, df_num), axis = 1)
df_final = pd.DataFrame(df_final)

In [114]:
df_final

Unnamed: 0,0,11_Mo1,18_Mo1,25_Mo1,32_Mo1,39_Mo1,46_Mo1,53_Mo1,60_Mo1,67_Mo1,...,1344,1345,1346,1347,1349,1350,1351,1352,1353,1354
0,6141cf0f51c1cbd9654b8870,1,1,1,1,1,1,1,1,1,...,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
1,6141cf1051c1cbd9654b8872,1,1,1,1,1,1,1,1,1,...,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
2,6141cf11ae4fb853db2e3f14,1,1,1,1,1,1,1,1,1,...,0.355174,12.761263,20.261226,5.284635,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
3,6141cf11b842c2e72e2f2d48,1,1,1,1,1,1,1,1,1,...,0.355174,14.356421,17.498332,5.284635,0.958333,0.916667,0.355174,12.761263,20.261226,5.284635
4,6141cf11cc0e69a0cf28ab35,1,1,1,1,1,1,1,1,1,...,0.355174,12.761263,20.261226,5.284635,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2961,6146d0b54e27a1844a5f0b02,1,1,1,1,1,1,1,1,1,...,0.355174,12.761263,20.261226,5.284635,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
2962,6146dd853ac25c70a5c6cdeb,1,1,1,1,1,1,1,1,1,...,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
2963,6146e9103ac25c70a5c6cded,1,1,1,1,1,1,1,1,1,...,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000
2964,6146ecdb3ac25c70a5c6cdef,1,1,1,1,1,1,1,1,1,...,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000,-999999.000000


In [115]:
df_final.to_csv("./df_final.csv", sep = ";", encoding = 'utf8')