# Using scikit-learn for materials science and engineering

content found at: https://github.com/kaaiian/UCSB_sklearn_workshop

# Parse band gap data

In [1]:
# read in data with pandas
import pandas as pd
# use numpy for vector and matrix operations
import numpy as np
# composition is a custom-made Python module that generates composition-based feature vectors (CBFV)
import composition

In [5]:
# Read in the band gap data from the Zhou et al. publication:
# J. Phys. Chem. Lett., 2018, 9 (7), pp 1668–1673
# DOI: 10.1021/acs.jpclett.8b00124
# Publication Date (Web): March 13, 2018
df_band_gap = pd.read_excel('band_gap_data/jz8b00124_si_002.xlsx')

# Correct the malformed formula "GaAs0.1P0.9G1128" to "GaAs0.1P0.9" in the
# loaded data (reassigning instead of mutating the frame in place).
df_band_gap = df_band_gap.replace("GaAs0.1P0.9G1128", "GaAs0.1P0.9")

# Take the average of duplicate composition entries; reset_index() turns
# 'composition' back into a regular column with a fresh, unique row index.
df_band_gap = df_band_gap.groupby('composition').mean().reset_index()

# Separate the non-metal (Eg > 0) and metal (Eg == 0) compounds.
df_band_gap_non_metal = df_band_gap[df_band_gap['Eg (eV)'] > 0]
df_band_gap_metal = df_band_gap[df_band_gap['Eg (eV)'] == 0]


def _split_train_test(df, train_frac=0.8, seed=256):
    """Randomly split a band gap table into train and test DataFrames.

    The 'composition' and 'Eg (eV)' columns are renamed to 'formula' and
    'target' for use with feature generation. The rename is by name rather
    than by position, so it does not depend on column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'composition' and 'Eg (eV)' columns and a unique index.
    train_frac : float, optional
        Fraction of rows sampled into the training set.
    seed : int, optional
        Seed passed to ``DataFrame.sample`` so the split is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; ``test`` holds every row not sampled into
        ``train``, in its original order.
    """
    renamed = df.rename(columns={'composition': 'formula', 'Eg (eV)': 'target'})
    train = renamed.sample(frac=train_frac, random_state=seed)
    # The index is unique, so dropping the sampled labels leaves exactly
    # the complement of the training rows.
    test = renamed.drop(train.index)
    return train, test


# Randomly select the train and test split for the non-metals.
df_train, df_test = _split_train_test(df_band_gap_non_metal)

# Randomly select the train and test split for the metals.
df_metal_train, df_metal_test = _split_train_test(df_band_gap_metal)

In [6]:
# Featurize every split. generate_features is unpacked as (X, y, formula);
# the calls stay in train, test, metal-train, metal-test order.
X_train, y_train, formula_train = composition.generate_features(df_train)
X_test, y_test, formula_test = composition.generate_features(df_test)
X_metal_train, y_metal_train, formula_metal_train = composition.generate_features(df_metal_train)
X_metal_test, y_metal_test, formula_metal_test = composition.generate_features(df_metal_test)

# Reassemble each split into one table, column-wise: formula first,
# then the generated features, then the target.
df_train = pd.concat((formula_train, X_train, y_train), axis='columns')
df_test = pd.concat((formula_test, X_test, y_test), axis='columns')
df_metal_train = pd.concat(
    (formula_metal_train, X_metal_train, y_metal_train), axis='columns')
df_metal_test = pd.concat(
    (formula_metal_test, X_metal_test, y_metal_test), axis='columns')

# Write the featurized splits to CSV (without the row index), non-metals
# first and then metals.
for csv_path, featurized in (
    ('band_gap_data/df_train.csv', df_train),
    ('band_gap_data/df_test.csv', df_test),
    ('band_gap_data/df_metal_train.csv', df_metal_train),
    ('band_gap_data/df_metal_test.csv', df_metal_test),
):
    featurized.to_csv(csv_path, index=False)