In [13]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

%matplotlib inline

In [14]:
# Open the SQLite database and load the four tables used by this notebook
# into DataFrames, all inside one connection/transaction.
engine = create_engine('sqlite:///cubic_perovskites.db')
with engine.connect() as conn, conn.begin():
    keyvalues, textvalues, keys, systems = (
        pd.read_sql_table(table_name, conn)
        for table_name in ('number_key_values', 'text_key_values', 'keys', 'systems')
    )

### The cell below returns a dataframe for each variable

In [15]:
# One frame per numeric property: the rows of `keyvalues` whose `key` matches.
# Note that `heat_of_formation` is read from the 'heat_of_formation_all' key.
(
    CB_ind,
    gllbsc_ind_gap,
    CB_dir,
    VB_dir,
    VB_ind,
    gllbsc_dir_gap,
    heat_of_formation,
    standard_energy,
) = (
    keyvalues.loc[keyvalues['key'] == key_name]
    for key_name in (
        'CB_ind',
        'gllbsc_ind_gap',
        'CB_dir',
        'VB_dir',
        'VB_ind',
        'gllbsc_dir_gap',
        'heat_of_formation_all',
        'standard_energy',
    )
)


In [16]:
# Preview only the first rows instead of rendering the whole key/value table.
keyvalues.head(10)

Unnamed: 0,key,value,id
0,CB_ind,5.962712e+00,1
1,gllbsc_ind_gap,0.000000e+00,1
2,heat_of_formation_all,1.200000e+00,1
3,CB_dir,5.962712e+00,1
4,gllbsc_dir_gap,0.000000e+00,1
5,standard_energy,-2.771908e+00,1
6,VB_dir,5.962712e+00,1
7,VB_ind,5.962712e+00,1
8,CB_ind,5.456778e+00,2
9,gllbsc_ind_gap,0.000000e+00,2


### The cells below build a single dataframe holding all the values.
`id` is the ID of the perovskite.

In [17]:
# Combine the eight per-property frames into one wide frame keyed by the
# perovskite id. Every input frame has the columns key / value / id, so the
# 'key' and 'value' columns get a unique numeric suffix before each merge.
# Relying on pandas' default '_x'/'_y' suffixes yields repeated column names
# from the third merge on, which recent pandas releases reject with a MergeError.
# The column order (key, value, id, key, value, ...) is unchanged, so code that
# renames these columns by position keeps working.
valuelist=[CB_dir,VB_dir,VB_ind,gllbsc_dir_gap,gllbsc_ind_gap,heat_of_formation,standard_energy]
values = CB_ind.rename(columns={'key': 'key_0', 'value': 'value_0'})
for n, x in enumerate(valuelist, start=1):
    suffixed = x.rename(columns={'key': 'key_{}'.format(n), 'value': 'value_{}'.format(n)})
    values = pd.merge(values, suffixed, how='outer', on='id')

In [18]:
# Use the perovskite id as the row index.
values = values.set_index('id')

In [19]:
# Relabel the columns by position: each property contributes a 'key' column
# followed by its value column, in the order the frames were merged.
property_names = [
    'CB_ind', 'CB_dir', 'VB_dir', 'VB_ind',
    'gllbsc_dir_gap', 'gllbsc_ind_gap', 'heat_of_formation', 'standard_energy',
]
values.columns = [label for prop in property_names for label in ('key', prop)]

In [20]:
# Discard every column labelled 'key'; only the numeric value columns remain.
values = values.drop('key', axis=1)

### The cells below build a dataframe for each of A_ion, B_ion and anion; each dataframe contains the ion's electronegativity and first ionization energy

The anion's electronegativity and first ionization energy are the averages of the values of its constituent atoms.

In [21]:
# Split the text key/value table into one frame per ion site.
A_ion, B_ion, anion = (
    textvalues.loc[textvalues['key'] == site]
    for site in ('A_ion', 'B_ion', 'anion')
)

# Electronegativity lookup table; its two columns are renamed so that the
# first one ('value') matches the join column of the ion frames.
electroneg = pd.read_csv('electronegativity.csv')
electroneg.columns = ['value', 'electronegativity']

A_electroneg = A_ion.merge(electroneg, how='left', on='value')
B_electroneg = B_ion.merge(electroneg, how='left', on='value')

# Ionization-energy lookup table, also joined on 'value'.
ionization = pd.read_excel('ionization energy.xlsx')

A_values = A_electroneg.merge(ionization, how='left', on='value')
B_values = B_electroneg.merge(ionization, how='left', on='value')

# Electron affinities are currently not loaded:
#affinity = pd.read_excel('electron affinities.xlsx')


In [22]:
# Relabel both cation frames by position, prefixing the columns with the site
# letter (X = electronegativity, IE = ionization energy).
for cation_frame, site in ((A_values, 'A'), (B_values, 'B')):
    cation_frame.columns = ['key', site + '_ion', 'id', site + '_X', site + '_IE']


In [23]:
# Put the A-site and B-site data of each perovskite side by side.
cation_values = A_values.merge(B_values, how='outer', on='id')


In [24]:
# Preview the first five rows of the combined cation table.
cation_values.head(n=5)

Unnamed: 0,key_x,A_ion,id,A_X,A_IE,key_y,B_ion,B_X,B_IE
0,A_ion,Ti,1,1.54,6.8281,B_ion,As,2.18,9.7886
1,A_ion,K,2,0.82,4.34066,B_ion,Sb,2.05,8.6084
2,A_ion,Hg,3,2.0,10.4375,B_ion,Nb,1.6,6.75885
3,A_ion,Bi,4,2.02,7.2856,B_ion,Sb,2.05,8.6084
4,A_ion,Na,5,0.93,5.13908,B_ion,Ag,1.93,7.5762


In [25]:
# Per-anion lookup table. Each row holds the anion label, its electronegativity
# and its first ionization energy; both quantities are averages over the atoms
# that make up the anion.
anion_table = [
    ('N3', 3.04, 14.53414),
    ('O2F', 3.62, 14.88631),
    ('O2N', 3.30666, 13.92342),
    ('O2S', 3.15333, 12.53204),
    ('O3', 3.44, 13.61806),
    ('OFN', 3.48666, 15.19167),
    ('ON2', 3.17333, 14.22878),
]
anion_labels, anion_electronegativity, anion_ionization = (
    list(column) for column in zip(*anion_table)
)
anion_data = pd.DataFrame({'value': anion_labels, 'electronegativity': anion_electronegativity, 'first ionization energy': anion_ionization})
anion_data

Unnamed: 0,electronegativity,first ionization energy,value
0,3.04,14.53414,N3
1,3.62,14.88631,O2F
2,3.30666,13.92342,O2N
3,3.15333,12.53204,O2S
4,3.44,13.61806,O3
5,3.48666,15.19167,OFN
6,3.17333,14.22878,ON2


In [26]:
# Attach the anion lookup values to every perovskite's anion row.
anion_values = anion.merge(anion_data, how='left', on='value')

In [27]:
# Relabel the anion frame by position (X = electronegativity, IE = ionization energy).
anion_column_names = ['key', 'anion', 'id', 'anion_X', 'anion_IE']
anion_values.columns = anion_column_names

In [28]:
# Join the anion and cation descriptors of each perovskite on its id.
perovskite_values = anion_values.merge(cation_values, how='outer', on='id')

### The cell below creates two dataframes: mass and volume
`id` is the ID of the perovskite.

In [29]:
# Two-column frames (id plus one quantity) taken from the `systems` table.
volume = systems[['id', 'volume']].copy()
mass = systems[['id', 'mass']].copy()

In [30]:
# Preview the first five rows of the volume table.
volume.head(n=5)

Unnamed: 0,id,volume
0,1,60.301946
1,2,69.072728
2,3,70.420772
3,4,71.053021
4,5,73.66862


In [31]:
# Add the volume and then the mass of each perovskite to the ion descriptors.
perovskite = (
    perovskite_values
    .merge(volume, how='left', on='id')
    .merge(mass, how='left', on='id')
)

In [32]:
# The key-label columns left over from the merges are no longer needed.
leftover_key_columns = ['key', 'key_x', 'key_y']
perovskite = perovskite.drop(leftover_key_columns, axis=1)

In [33]:
# Use the perovskite id as the row index.
perovskite = perovskite.set_index('id')

In [34]:
# Density column: mass divided by volume, row by row.
perovskite['density'] = perovskite['mass'].div(perovskite['volume'])

In the `perovskite` dataframe, X stands for electronegativity and IE for ionization energy.
Density is calculated as mass/volume.

In [35]:
# Preview the first five rows of the descriptor table.
perovskite.head(n=5)

Unnamed: 0_level_0,anion,anion_X,anion_IE,A_ion,A_X,A_IE,B_ion,B_X,B_IE,volume,mass,density
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,O2N,3.30666,13.92342,Ti,1.54,6.8281,As,2.18,9.7886,60.301946,168.8071,2.799364
2,O2N,3.30666,13.92342,K,0.82,4.34066,Sb,2.05,8.6084,69.072728,206.8608,2.994826
3,O2N,3.30666,13.92342,Hg,2.0,10.4375,Nb,1.6,6.75885,70.420772,339.5019,4.821048
4,O2N,3.30666,13.92342,Bi,2.02,7.2856,Sb,2.05,8.6084,71.053021,376.7429,5.302278
5,O2N,3.30666,13.92342,Na,0.93,5.13908,Ag,1.93,7.5762,73.66862,176.86327,2.400795


In [36]:
# Keep only rows with a strictly positive CB_ind. Rows whose CB_ind is NaN
# fail the comparison and are therefore dropped as well.
values = values.loc[values['CB_ind'] > 0.0]

In [37]:
# Preview the first five rows of the filtered property table.
values.head(n=5)

Unnamed: 0_level_0,CB_ind,CB_dir,VB_dir,VB_ind,gllbsc_dir_gap,gllbsc_ind_gap,heat_of_formation,standard_energy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,5.962712,5.962712,5.962712,5.962712,0.0,0.0,1.2,-2.771908
2,5.456778,5.456778,5.456778,5.456778,0.0,0.0,0.6,-2.01526
3,5.603095,5.333095,6.913095,6.643095,1.6,1.0,0.54,-3.70688
4,6.066547,6.066547,6.066547,6.066547,0.0,0.0,0.88,-0.498542
5,5.539498,5.539498,5.539498,5.539498,0.0,0.0,1.78,3.552433


In [38]:
# Count, per column, the rows whose direct band gap is strictly positive.
values.loc[values['gllbsc_dir_gap'] > 0.0].count()

CB_ind               735
CB_dir               735
VB_dir               735
VB_ind               735
gllbsc_dir_gap       735
gllbsc_ind_gap       735
heat_of_formation    735
standard_energy      735
dtype: int64

Only 735 rows have a positive direct band gap (`gllbsc_dir_gap > 0`).

In [39]:
# Join descriptors and target properties on the perovskite id (the index of
# both frames). An inner join is used so that ids missing from either frame --
# in particular the rows filtered out of `values` earlier -- do not come back
# as rows of NaN targets, which an outer join would produce.
data_total = pd.merge(perovskite, values, how='inner', left_index=True, right_index=True)

In [40]:
# Preview the first five rows of the combined data set.
data_total.head(n=5)

Unnamed: 0_level_0,anion,anion_X,anion_IE,A_ion,A_X,A_IE,B_ion,B_X,B_IE,volume,mass,density,CB_ind,CB_dir,VB_dir,VB_ind,gllbsc_dir_gap,gllbsc_ind_gap,heat_of_formation,standard_energy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,O2N,3.30666,13.92342,Ti,1.54,6.8281,As,2.18,9.7886,60.301946,168.8071,2.799364,5.962712,5.962712,5.962712,5.962712,0.0,0.0,1.2,-2.771908
2,O2N,3.30666,13.92342,K,0.82,4.34066,Sb,2.05,8.6084,69.072728,206.8608,2.994826,5.456778,5.456778,5.456778,5.456778,0.0,0.0,0.6,-2.01526
3,O2N,3.30666,13.92342,Hg,2.0,10.4375,Nb,1.6,6.75885,70.420772,339.5019,4.821048,5.603095,5.333095,6.913095,6.643095,1.6,1.0,0.54,-3.70688
4,O2N,3.30666,13.92342,Bi,2.02,7.2856,Sb,2.05,8.6084,71.053021,376.7429,5.302278,6.066547,6.066547,6.066547,6.066547,0.0,0.0,0.88,-0.498542
5,O2N,3.30666,13.92342,Na,0.93,5.13908,Ag,1.93,7.5762,73.66862,176.86327,2.400795,5.539498,5.539498,5.539498,5.539498,0.0,0.0,1.78,3.552433


The dataframe `data_total` collects all of the data assembled above.



### Neural networks

In [41]:
# Split into train and test sets (scikit-learn's default proportions).
# A fixed random_state makes the shuffle, and so the split, reproducible
# across kernel restarts.
traindata, testdata = train_test_split(data_total, random_state=42)

In [42]:
# Preview the first five rows of the training split.
traindata.head(n=5)

Unnamed: 0_level_0,anion,anion_X,anion_IE,A_ion,A_X,A_IE,B_ion,B_X,B_IE,volume,mass,density,CB_ind,CB_dir,VB_dir,VB_ind,gllbsc_dir_gap,gllbsc_ind_gap,heat_of_formation,standard_energy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
17281,N3,3.04,14.53414,Y,1.22,6.2171,Al,1.61,5.98577,55.522209,157.90754,2.844043,5.235761,5.235761,5.235761,5.235761,0.0,0.0,0.98,-0.497114
3689,OFN,3.48666,15.19167,Cr,1.66,6.7665,In,1.78,5.78636,78.155933,215.8205,2.761409,5.801455,5.801455,5.801455,5.801455,0.0,0.0,1.28,-0.209755
16679,N3,3.04,14.53414,Zr,1.33,6.6339,Cs,0.79,3.8939,79.449568,266.1495,3.349918,4.980005,4.980005,4.980005,4.980005,0.0,0.0,3.84,13.092023
13676,O3,3.44,13.61806,Ni,1.91,7.6398,V,1.63,6.7462,53.086269,157.6331,2.969376,5.834828,5.834828,5.834828,5.834828,0.0,0.0,0.72,-4.466964
11260,O2F,3.62,14.88631,Hf,1.3,6.82507,Si,1.9,8.15169,68.32269,257.5727,3.769944,6.255765,6.255765,6.255765,6.255765,0.0,0.0,1.48,-6.551044
