In [22]:
# read in data with pandas
import pandas as pd
# use numpy for vector and matrix operations
import numpy as np

# composition is a custom made python file that generates composition-based feature vectors (CBFV)
import composition
# utils is a custom made python file that has some useful functions
import utils

# make nice figures
import matplotlib.pyplot as plt
# machine learnign algorithms
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# preprocessing for the features
from sklearn.preprocessing import StandardScaler, Normalizer
# feature reduction
from sklearn.decomposition import PCA
# import model selection tools
from sklearn.model_selection import cross_validate, cross_val_score,\
                                    cross_val_predict, learning_curve, \
                                    GridSearchCV, KFold
# grab metrics to evaluate our models
from sklearn.metrics import confusion_matrix, classification_report,\
                            r2_score, mean_squared_error

In [23]:
df_bulk_modulus = pd.read_csv('bulk_modulus_aflow.csv')
df_bulk_modulus.head()

Unnamed: 0,ENTRY,AEL VRH bulk modulus,atomic formation enthalpy,space group,Pearson symbol,DATA
0,Al1As1 [d1acbf7cd6bbd917],67.4056,-0.487735,216 - F43m,cF8,"[API, Out, JSON ]"
1,Al1B1Be1 [7b87669d29858aed],140.778,-2.28262,216 - F43m,cF12,"[API, Out, JSON ]"
2,Al1Ba1Ge1 [e5990f43d042239c],44.3828,-0.471822,187 - P6m2,hP3,"[API, Out, JSON ]"
3,Al1Ba1Si1 [0655bb1b738565e8],49.0752,-0.383687,187 - P6m2,hP3,"[API, Out, JSON ]"
4,Al1Bi1 [786c95a74595dac5],37.8444,0.05967,216 - F43m,cF8,"[API, Out, JSON ]"


In [24]:
df_bulk_modulus.drop('DATA', inplace=True, axis=1)
df_bulk_modulus['ENTRY'] = [formula.split(' ')[0] for formula in df_bulk_modulus['ENTRY']]
df_bulk_modulus.head()

Unnamed: 0,ENTRY,AEL VRH bulk modulus,atomic formation enthalpy,space group,Pearson symbol
0,Al1As1 [d1acbf7cd6bbd917],67.4056,-0.487735,216 - F43m,cF8
1,Al1B1Be1 [7b87669d29858aed],140.778,-2.28262,216 - F43m,cF12
2,Al1Ba1Ge1 [e5990f43d042239c],44.3828,-0.471822,187 - P6m2,hP3
3,Al1Ba1Si1 [0655bb1b738565e8],49.0752,-0.383687,187 - P6m2,hP3
4,Al1Bi1 [786c95a74595dac5],37.8444,0.05967,216 - F43m,cF8


In [25]:
[formula.split(' ')[0] for formula in df_bulk_modulus['ENTRY']]

['Al1As1\xa0[d1acbf7cd6bbd917]',
 'Al1B1Be1\xa0[7b87669d29858aed]',
 'Al1Ba1Ge1\xa0[e5990f43d042239c]',
 'Al1Ba1Si1\xa0[0655bb1b738565e8]',
 'Al1Bi1\xa0[786c95a74595dac5]',
 'Al1Bi1O3\xa0[d43abd3deaeccc51]',
 'Al1C1Ho3\xa0[3ecce9ff5402ec86]',
 'Al1C1Y3\xa0[a873c04ce401162f]',
 'Al1Ca1Si1\xa0[a2be57f97c3c3734]',
 'Al1Ge1Li1\xa0[a16cd7474b3ebbab]',
 'Al1Ge1Sr1\xa0[f573f77ca303f0e2]',
 'Al1Li1Si1\xa0[c854f04be2146c1c]',
 'Al1P1\xa0[bfe6fc7189a9a3a8]',
 'Al1Sb1\xa0[8efc0f655f5d4309]',
 'Al1Si1Sr1\xa0[8023faf5b757d0b6]',
 'Al1Y1\xa0[53bc333b72a0cbd5]',
 'Al1Y3\xa0[5baa20a85d1739d4]',
 'Al1Zr3\xa0[d5adc38fa8b953cf]',
 'Al2As2\xa0[b2af4cca59eb81b2]',
 'Al2Ba1Si2\xa0[544c2df3889130ff]',
 'Al2C2Zr4\xa0[c80850c41a89bd06]',
 'Al2Ge2Y2\xa0[1854ad7843d7457d]',
 'Al2Hf2\xa0[71942a90fe720e60]',
 'Al2Hf4\xa0[dd8fe80eb93c30d7]',
 'Al2Hg1Te4\xa0[9ac7d0ab226ef604]',
 'Al2K2O4\xa0[b8b7b327da0f7bb8]',
 'Al2K2Te4\xa0[fe6e33543b6edd63]',
 'Al2Li10O8\xa0[3a9e66b3bda80de3]',
 'Al2Li2\xa0[605efbba4d8ede03]',
 '

In [26]:
df_bulk_modulus['ENTRY'] = [formula.split('\xa0')[0] for formula in df_bulk_modulus['ENTRY']]

In [27]:
df_bulk_modulus.head()

Unnamed: 0,ENTRY,AEL VRH bulk modulus,atomic formation enthalpy,space group,Pearson symbol
0,Al1As1,67.4056,-0.487735,216 - F43m,cF8
1,Al1B1Be1,140.778,-2.28262,216 - F43m,cF12
2,Al1Ba1Ge1,44.3828,-0.471822,187 - P6m2,hP3
3,Al1Ba1Si1,49.0752,-0.383687,187 - P6m2,hP3
4,Al1Bi1,37.8444,0.05967,216 - F43m,cF8


In [28]:
df_bulk_modulus.columns = ['formula', 'target', 'enthalpy', 'space_group', 'pearson_symbol']

In [29]:
df_bulk_modulus.head()

Unnamed: 0,formula,target,enthalpy,space_group,pearson_symbol
0,Al1As1,67.4056,-0.487735,216 - F43m,cF8
1,Al1B1Be1,140.778,-2.28262,216 - F43m,cF12
2,Al1Ba1Ge1,44.3828,-0.471822,187 - P6m2,hP3
3,Al1Ba1Si1,49.0752,-0.383687,187 - P6m2,hP3
4,Al1Bi1,37.8444,0.05967,216 - F43m,cF8


In [30]:
df_bulk_modulus['formula'].value_counts()

C8            6
Al8Zr4        4
O8Si4         4
Cl16Hg12O4    3
Br4Na4O12     3
C4            3
Si2           3
B1C7          3
Si4           3
O8Zr4         3
Cl1Tl1        2
Ca1Se1        2
C16           2
Mg4Si2        2
O8Si2Zr2      2
Al2Zr4        2
Ba1Se1        2
Hf2O6Sr2      2
Te2Zr2        2
Pb1Se1        2
Li4O10Si4     2
Ge8           2
Mg1Te1        2
Br1K1         2
C1Na4O4       2
Si8           2
Pb1Te1        2
C12           2
Be1O1         2
Br1Rb1        2
             ..
Cl2Hg2        1
Be2C1         1
O28P8Si4      1
Be2Na2Sb2     1
Si4Y2         1
Se4Tl4        1
Br6K2Se1      1
Pb4Y2         1
Al8Hf12       1
Hg2Mg1        1
Al8Zr12       1
Ba6C60        1
K4O12Sb4      1
Si2Zr2        1
O14Pb4Sb4     1
B8Be4C8       1
As2Na6        1
Li8O8Pb2      1
Li2Se1        1
Hg4Sr12       1
Hg3Zr1        1
Na6P2         1
Al8Ba4Si8     1
Mg1Tl1        1
Mg1Y1         1
Hf4O14Y4      1
P1Tl1         1
Na2Te1        1
Ge4Hf2        1
Br1Li1        1
Name: formula, Length: 6

In [34]:
## take the average of duplicate composition entries
df_bulk_modulus = df_bulk_modulus.groupby('formula').mean().reset_index()
df_bulk_modulus['formula'].value_counts()

O2Pb2          1
Ba2Te4         1
Ba2Bi3         1
Hf4P2Sb2       1
Si40           1
Br4Ca2         1
Sr1Te1         1
B16C16Mg8      1
Br12K4Te2      1
Hf10Si6        1
Be1Se1         1
B2P2           1
Al4Ba2         1
Al8Ba4Ge8      1
Mg2O2          1
Br1O1Rb3       1
Ge2Li4O6       1
Mg8Sr4         1
Al4Ca12Sb12    1
Sb1            1
O12Sb8         1
Ba10Sb8        1
Sb6Zr6         1
Ge6Y10         1
Sb2Sr4         1
Ge2Mg2O6       1
Mg2O6Si2       1
Ho1Te1         1
Cl1Rb1         1
Br2O8Y6        1
              ..
Al4Mg2O8       1
Hf4O14Y4       1
Hf4O12Sr4      1
P1Tl1          1
Na2Te1         1
Ge4Hf2         1
Na6P2          1
Al8Zr12        1
O14Pb4Sb4      1
Si2Zr2         1
Ba6C60         1
As4O12Sr2      1
Hg4Mg8         1
Mg4Te8         1
Na2            1
C10Si10        1
Ca4Pb4Sr4      1
Hg33K3         1
P4             1
Na6Sb2         1
Ba1O3Pb1       1
Mg3Tl3Y3       1
C60Sr6         1
As8Zr4         1
Ca4Ge4Mg4      1
O24Y16         1
Ba3Na3P3       1
O2Sr1         