In [1]:
%matplotlib inline
import os
import multiprocessing

import scipy
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV as CV

from pymatgen.io.vasp import Poscar
from pymatgen.io.ase import AseAtomsAdaptor

from ase.visualize import view
from sklearn.svm import SVR
#from thundersvm import SVR # much faster with GPU support

def tw(iterable, **kwargs):
    "Add a progress bar using tqdm if available"
    try:
        import tqdm
        tqdm.tqdm._instances.clear()
        # clear existing instances
    except AttributeError:
        pass
    try:
        return tqdm.tqdm_notebook(iterable, **kwargs)
    except NameError:
        return iterable  # could not import tqdm

n_cores = multiprocessing.cpu_count()

# directory of POSCARs and energy files
working_directory = '/home/jason/Downloads'
directory_path = working_directory + "/CdTe_Archive/"

structure_ext = 'poscar'  #file extension names
target_ext = 'energy'

element_list = ['Cd', 'Te']  # list of elements to consider

def process_entry(args):
    """parse files"""
    entry_name, element_list, directory_path, structure_ext, target_ext = args
    target_file = '{}/{}.{}'.format(directory_path,
                                    entry_name,
                                    target_ext)
    with open(target_file) as f:
        lines = f.read().splitlines()
    local_energies = [float(line) for line in lines]
    # This dataset has per-atom energies but we won't use them
    energy = np.sum(local_energies)
    
    structure_file = '{}/{}.{}'.format(directory_path,
                                       entry_name,
                                       structure_ext)
    structure = Poscar.from_file(structure_file).structure
    entry_data = [len(structure), energy]
    for element in element_list:
        element_percent = structure.composition.get_atomic_fraction(element)
        entry_data.append(element_percent)
    return structure, entry_data


In [2]:

file_stems = [os.path.splitext(f)[0]
              for f in os.listdir(directory_path) 
              if structure_ext in f]
# check the working directory for files matching the ext

columns = ['Size', 'Total Energy'] + element_list  # pandas header

# prepare zipped arguments for parallel parsing
zipped_args = [(f, element_list, directory_path, structure_ext, target_ext)
               for f in file_stems]
parsed_data = list(tw(multiprocessing.Pool(n_cores).imap(process_entry,
                                                         zipped_args),
                      total=len(zipped_args)))

HBox(children=(IntProgress(value=0, max=4673), HTML(value='')))




In [3]:
structure_list, table_data = zip(*parsed_data)
structures = {k: v for k, v in zip(file_stems, structure_list)}
df = pd.DataFrame(columns=columns, data=table_data, index=file_stems)

reference_energies = {}
for element in element_list:
    pure_entries = df[df[element] == 1.0]
    pure_entry_energies = pure_entries['Total Energy'].values
    pure_entry_energies /= pure_entries['Size'].values
    minimum_energy = np.min(pure_entry_energies)
    reference_energies[element] = minimum_energy

In [4]:
def get_formation_energy(entry_data, element_list, reference_energies):
    element_fractions = {element: entry_data[element] 
                         for element in element_list}
    reference_contributions = [(reference_energies[element] 
                               * element_fractions[element])
                               for element in element_list]
    formation_energy = (entry_data['Total Energy']/entry_data['Size'] 
                        - np.sum(reference_contributions))
    return formation_energy

df['Formation Energy'] = df.apply(get_formation_energy, axis=1, 
                                  args=(element_list, reference_energies))
df.head()



Unnamed: 0,Size,Total Energy,Cd,Te,Formation Energy
9395,15,-30.03749,0.2,0.8,0.013579
1422,12,-14.89527,0.833333,0.166667,0.097101
1263,8,-8.134844,1.0,0.0,0.143174
8155,7,-13.18263,0.285714,0.714286,0.041126
2082,14,-15.616162,0.928571,0.071429,0.121023


In [5]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=20191014)

In [6]:
from automatminer import MatPipe

pipe = MatPipe.from_preset("debug")

ModuleNotFoundError: No module named 'automatminer'

In [7]:
target = 'Formation Energy'
prediction_df = test_df.drop(columns=[target])
prediction_df.head()

Unnamed: 0,Size,Total Energy,Cd,Te
1199,9,-10.365953,0.888889,0.111111
5826,9,-17.93607,0.111111,0.888889
1304,14,-14.626289,1.0,0.0
6489,8,-14.43624,0.125,0.875
3553,10,-20.55541,0.1,0.9
