# Querying Data

In [None]:
from optimade.client import OptimadeClient
import numpy as np
import pandas as pd
import mendeleev as md
from collections import Counter

`newNE15.pkl` contains the logs of the query optimizer code. We pick the entry with the best fitness value (i.e., the lowest std(count)/mean(count)).

In [None]:
# Load the optimizer log and keep the row with the smallest 'Fitness Value'.
# NOTE(review): unpickling runs arbitrary code -- only load trusted files.
dfQry = pd.read_pickle('newNE15.pkl')
bestRowPos = dfQry['Fitness Value'].argmin()  # positional index of the minimum
bestQry = dfQry.iloc[bestRowPos]
print(bestQry['Filter Query'])

We then set the filter and provider list based on this entry. We also remove Se from the list of excluded elements. This may affect the 'fitness'; however, the new query still gives acceptable results.

In [None]:
# Create the client with async mode switched off, then derive the working
# filter by deleting the `, "Se"` fragment from the best logged query string.
client = OptimadeClient(use_async=False)
loggedFilter = bestQry['Filter Query']
filterQry = loggedFilter.replace(r', "Se"', '')
print(filterQry)

In [None]:
# Optional (disabled): restrict the client to a single provider URL taken
# from the 'Provider List' of the logged entry.
# client.base_urls = [bestQry['Provider List'][1]]
# Ask the client for the number of matches of the filter; being the last
# expression, the result is shown as the cell output.
client.count(filter=filterQry)

# Download data from providers

In [None]:
# Upper bound on the number of entries fetched from each provider.
client.max_results_per_provider = 5000
# Structure attributes requested from the providers.
respFields = ["chemical_formula_descriptive", "species_at_sites", "lattice_vectors"]
# Normalise the filter once: the results are keyed by the filter string that
# was sent, so the string passed to `get` and the string used for the lookup
# must be identical (previously only the former was stripped).
filterQry = filterQry.strip()
resStruct = client.get(filter=filterQry, response_fields=respFields)
# Providers that appear in the response for this filter.
providerList = list(resStruct['structures'][filterQry])

# Processing and density calc
A Pandas DataFrame is used to store the data obtained as JSON from the DBs. The lattice vectors and the species at each site are then used to calculate the density.

In [None]:
# Empty frame with one column per requested response field. `astype` returns
# a new frame, so its result has to be kept for the dtypes to take effect
# (the original call discarded it).
df = pd.DataFrame(columns=respFields).astype({'chemical_formula_descriptive': str,
                                              'species_at_sites': object,
                                              'lattice_vectors': object})
# Record the provenance of the data alongside the frame.
df.attrs['Filter Query'] = filterQry
df.attrs['Providers List'] = client.base_urls
df

In [None]:
# Flatten the nested client response into one record per structure entry.
# Records are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the loop quadratic.
records = []
for provider, response in resStruct['structures'][filterQry].items():
    for entry in response['data']:
        attributes = entry['attributes']
        # Keep only entries whose 'species_at_sites' is a list.
        if not isinstance(attributes['species_at_sites'], list):
            continue
        records.append({'provider': provider,
                        'id': entry['id'],
                        'chemical_formula_descriptive': attributes['chemical_formula_descriptive'],
                        'species_at_sites': attributes['species_at_sites'],
                        'lattice_vectors': np.array(attributes['lattice_vectors'])})

# Rebuild the frame (same column order the append-based code produced) and
# carry over the metadata stored in df.attrs.
frameAttrs = dict(df.attrs)
df = pd.DataFrame(records, columns=[*respFields, 'id', 'provider'])
df.attrs.update(frameAttrs)

Calculate density from atomic masses and lattice vectors

In [None]:
# Conversion factor from (atomic mass unit)/Angstrom^3 to kg/m^3:
# kilograms per atomic mass unit divided by cubic metres per cubic Angstrom.
densToSIFactor = (1.6605402E-27/1E-30)

# Cell volume is the absolute value of the determinant of the lattice matrix;
# the determinant itself is negative for a left-handed set of vectors.
df['volume ang^3'] = df['lattice_vectors'].apply(lambda latt: abs(np.linalg.det(latt)))

# Look up each distinct species once and reuse the weight for every site.
# getattr replaces eval: the symbols come from remote providers, so they must
# not be executed as code. An unknown symbol still raises AttributeError.
atomicWeights = {elem: getattr(md, elem).atomic_weight
                 for elem in set().union(*df['species_at_sites'])}
df['mass au'] = df['species_at_sites'].apply(
    lambda sites: sum(atomicWeights[elem] for elem in sites))

# Density is mass per volume (the original expression was inverted).
df['density au/ang^3'] = df['mass au'] / df['volume ang^3']
df['density kg/m^3'] = df['density au/ang^3'] * densToSIFactor

# Fraction of sites occupied by each species, as {symbol: fraction}.
df['composition'] = df['species_at_sites'].apply(
    lambda sites: dict(pd.Series(sites).value_counts(normalize=True)))

In [None]:
# Every species symbol that occurs in at least one composition. The list is
# sorted because plain set iteration order over strings changes between
# interpreter sessions (hash randomisation), which would silently permute the
# entries of the composition vectors built from this list.
allElem = sorted({elem for comp in df['composition'] for elem in comp})

In [None]:
def compVecVal(elem, compDict):
    """Return the value stored for `elem` in `compDict`, or 0.0 if it is absent."""
    return compDict.get(elem, 0.0)

# Feature generation for ML

In [None]:
def _compositionVector(row):
    """Return the row's composition as a list with one value per entry of allElem."""
    comp = row['composition']
    return [compVecVal(symbol, comp) for symbol in allElem]

df['compVec'] = df.apply(_compositionVector, axis=1)

In [None]:
# Number of rows in df per value of the 'provider' column.
df['provider'].value_counts()

Pickle DataFrame for use in ML notebook

In [None]:
# Disabled: uncomment to write the DataFrame to 'bestQryData.pkl'.
# df.to_pickle('bestQryData.pkl')