In [2]:
from interpret import interpret
from querry import *
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt
from DataSet import DataSet

import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Seed reused by later cells (train/test split, NumPy RNG, K-fold splitter)
# so that reruns are reproducible.
seed = 8888

# Load the survey CSV into a project DataSet wrapper.
dataset = DataSet()
dataset.get_data('../data/TE_survey_csv_repaired.csv')

# Clean the data, then drop the bibliographic / comment entries listed below.
# NOTE(review): the return values are ignored, so clean() and drop() presumably
# modify `dataset` in place -- confirm in DataSet.
dataset.clean()
dataset.drop(['Authors', 'DOI', 'Comments', 'Comments.1', 'Author of Unit Cell','Unit Cell DOI'])

# Build a second DataSet holding the output of extrapolate_400K.
# NOTE(review): the meaning of the empty-list argument is not visible here -- confirm.
# NOTE(review): the result is stored on `.data`, while later cells read
# `dataset_2.df`; presumably DataSet exposes both -- confirm.
dataset_2 = DataSet()
dataset_2.data = dataset.extrapolate_400K([])
# Prints the row/column counts and the column names.
dataset_2.get_info()

1450 rows and 4 columns.
Components are: 
Formula, Resist, Seebeck, T (K)


In [4]:
# Compute the atomic descriptor vector of every formula, one record per sample.
array = list(map(compound_short_descriptors, dataset_2.df['Formula'].values))
# Tabulate the records and attach the temperature as the last feature column.
ndf = pd.DataFrame.from_records(array).join(dataset_2.df[['T (K)']])

In [5]:
# clean up the data (get rid of strings and NaNs.)
# DataFrame.apply returns a new frame, so the result must be assigned back;
# the bare call used previously threw the converted frame away.
# errors='coerce' turns any non-numeric entry into NaN, which the next line
# then replaces with 0.
ndf = ndf.apply(pd.to_numeric, errors='coerce')
ndf = ndf.fillna(0)
print(ndf.head())

      0     1             2       3      4       5       6     7     8      9  \
0  1.00  16.0  3.400000e-08  3810.0  200.0  1757.0  1115.0  17.0  20.0  167.0   
1  1.00  16.0  3.400000e-08  3810.0  200.0  1757.0  1115.0  17.0  20.0  167.0   
2  1.00  16.0  3.400000e-08  3810.0  200.0  1757.0  1115.0  17.0  20.0  167.0   
3  1.00  16.0  3.400000e-08  3810.0  200.0  1757.0  1115.0  17.0  20.0  167.0   
4  0.98  16.0  3.400000e-08  3810.0  200.0  1757.0  1115.0  17.0  20.0  167.0   

    ...     71   72   73   74   75   76   77   78   79   T (K)  
0   ...    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   300.0  
1   ...    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   400.0  
2   ...    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   700.0  
3   ...    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1000.0  
4   ...    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   300.0  

[5 rows x 81 columns]


In [None]:
# X: every descriptor column plus temperature.
# Y: the Seebeck column, selected with a list so the array stays 2-D.
X, Y = ndf.values, dataset_2.df[['Seebeck']].values

In [None]:
# Hold out 10% of the samples; the split is seeded for reproducibility.
X_train_pn, X_test_pn, y_train, y_test = train_test_split(
    X, Y, test_size=0.10, random_state=seed)

# Fit the scaler on the training features only (it is kept for later use)
# and standardise the training set in the same step.
X_train_scaler = StandardScaler()
X_train = X_train_scaler.fit_transform(X_train_pn)
# Transform the held-out features with the training statistics.
X_test = X_train_scaler.transform(X_test_pn)

# The targets (y_train / y_test) are left unscaled.

In [None]:
# define base model
def baseline_model():
    """Build and compile the baseline regression network.

    Architecture: 81 inputs -> Dense(81, relu) -> Dense(100, relu)
    -> Dense(20, relu) -> Dense(1, linear output).

    Returns:
        A ``Sequential`` model compiled with mean squared error loss and
        the Adam optimizer.
    """
    # create model
    model = Sequential()
    # Only the first layer declares the input size. Later layers take their
    # input width from the preceding layer, so the input_dim values the hidden
    # layers used to carry (which did not match the real widths) are dropped.
    model.add(Dense(81, input_dim=81, kernel_initializer='normal', activation='relu'))
    model.add(Dense(100, kernel_initializer='normal', activation='relu'))
    model.add(Dense(20, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
numpy.random.seed(seed)
# evaluate model with standardized dataset
# Only `epochs` is passed: the deprecated Keras 1 alias `nb_epoch=1000`
# contradicted `epochs=500` and made the intended training length ambiguous.
estimator = KerasRegressor(build_fn=baseline_model, epochs=500, batch_size=100, verbose=2)

In [None]:
# random_state only has an effect when shuffling is enabled; without
# shuffle=True the seed is ignored by older scikit-learn and rejected with a
# ValueError by current releases.
kfold = KFold(n_splits=2, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))


NOICE!!

# Go Deeper

In [None]:
# define the model
def larger_model():
    """Build and compile the deeper regression network.

    Six hidden Dense layers of 100 relu units each (the first one takes the
    81 input features) followed by a single linear output unit.

    Returns:
        A ``Sequential`` model compiled with mean squared error loss and
        the Adam optimizer.
    """
    hidden = dict(kernel_initializer='normal', activation='relu')

    net = Sequential()
    # First hidden layer fixes the input width.
    net.add(Dense(100, input_dim=81, **hidden))
    # Five more identical hidden layers.
    for _ in range(5):
        net.add(Dense(100, **hidden))
    # Linear output unit for regression.
    net.add(Dense(1, kernel_initializer='normal'))

    net.compile(loss='mean_squared_error', optimizer='adam')
    return net

In [None]:
numpy.random.seed(seed)
# evaluate model with standardized dataset
# Build the estimator from larger_model: this section is meant to score the
# deeper network, but the wrapper was still pointing at baseline_model.
# Only `epochs` is passed; the deprecated alias `nb_epoch=1000` contradicted it.
estimator = KerasRegressor(build_fn=larger_model, epochs=500, batch_size=100, verbose=2)

In [None]:
# Score the current estimator on the same folds and report mean and spread.
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (numpy.mean(results), numpy.std(results)))

Daaaang. That's pretty good!

# More Graphics

In [None]:
from matplotlib import pyplot

# create model
model = Sequential()
model.add(Dense(100, input_dim=81, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae', 'mape', 'cosine'])
# train model
# One full-batch update per epoch: size the batch from the training set that
# is actually passed to fit, not from the complete data set X.
history = model.fit(X_train, y_train, epochs=20, batch_size=len(X_train), verbose=2)
# plot metrics
# Plot every recorded metric except the loss itself, labelled by the key
# Keras stored it under, so the curves can be told apart.
for metric_name, metric_values in history.history.items():
    if metric_name == 'loss':
        continue
    pyplot.plot(metric_values, label=metric_name)
pyplot.xlabel('epoch')
pyplot.legend()
pyplot.show()

In [None]:
def mean_squared_error(y_true, y_pred):
    """Mean of the squared differences along the last axis.

    Args:
        y_true: backend tensor of target values.
        y_pred: backend tensor of predictions.

    Returns:
        Backend tensor holding the mean squared error over the last axis.
    """
    # `K` is not bound anywhere in this notebook, so import the backend here;
    # otherwise the first call fails with a NameError.
    from keras import backend as K
    return K.mean(K.square(y_pred - y_true), axis=-1)

In [None]:
from keras import backend
 
def rmse(y_true, y_pred):
    """Root of the mean squared difference along the last axis (Keras metric)."""
    squared_error = backend.square(y_pred - y_true)
    mean_squared = backend.mean(squared_error, axis=-1)
    return backend.sqrt(mean_squared)

In [None]:
# create model
# Start from a fresh Sequential: without this line the layers below are
# appended to the model that was already built and compiled in the previous
# cell, stacking a second network behind its 1-unit output layer.
model = Sequential()
model.add(Dense(100, input_dim=81, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mse', optimizer='adam', metrics=[rmse])
# train model
history = model.fit(X_train, y_train, epochs=20, batch_size=128, verbose=2)
# plot metrics
pyplot.plot(history.history['rmse'], label='rmse')
pyplot.xlabel('epoch')
pyplot.legend()
pyplot.show()

The plot above shows the RMSE metric on the training set after each epoch.

In [None]:
# Visual inspection on the testing set:
# model predictions in the first column, true values in the second.
np.hstack((model.predict(X_test), y_test))

In [None]:
# just for testing, at the end.
def test_data(compound, T):
    """Converts one sample to the scaled feature row used for predicting thru ANN.

    The row is laid out like the training frame: the compound's descriptors
    first (zero-padded), then the temperature in the last column.

    Args:
        compound: formula string handed to ``compound_short_descriptors``.
        T: temperature, placed in the last feature column (the 'T (K)'
            column of the training frame).

    Returns:
        Array of shape (1, n_features), scaled with the statistics that
        ``X_train_scaler`` learned from the training data.

    Raises:
        ValueError: if the compound yields more descriptors than there are
            descriptor columns in the fitted scaler.
    """
    # Take the feature count from the fitted scaler rather than hard-coding it.
    n_features = X_train_scaler.mean_.shape[0]
    descriptors = np.asarray(compound_short_descriptors(compound)).ravel()
    if descriptors.size > n_features - 1:
        raise ValueError(
            'compound %r yields %d descriptors, but the model accepts at most %d'
            % (compound, descriptors.size, n_features - 1))

    # clean up the data (get rid of strings and NaNs.)
    values = pd.to_numeric(pd.Series(descriptors), errors='coerce').fillna(0).values

    row = np.zeros(n_features)  # unused descriptor slots stay at zero
    row[:values.size] = values
    row[-1] = T

    # Use transform, not fit_transform: refitting on this single sample would
    # overwrite the training statistics stored in X_train_scaler and scale the
    # sample against itself instead of against the training distribution.
    return X_train_scaler.transform(row.reshape(1, -1))

def predict(compound, T):
    """Print the model's prediction for one compound at temperature T, in uV/K."""
    features = test_data(compound, T)
    prediction = model.predict(features)[0][0]
    print(str(prediction) + '  (uV/K)')

In [None]:
# Single prediction for a doped compound.
predict(compound='Ca0.98La0.02MnO3', T=300)

In [None]:
# Single prediction for the undoped compound.
predict(compound='CaMnO3', T=500)

In [None]:
# Single prediction for a different material family.
predict(compound='FeSb2Ge0.1Te0.9', T=500)

In [None]:
# The same element written with different multiplicities, same temperature.
for formula in ('MnMnMn', 'MnMn', 'Mn'):
    predict(formula, 600)

In [None]:
# A formula, the same formula repeated, and a variant with an added element.
for formula in ('MnTe', 'MnTeMnTe', 'CaMnTe'):
    predict(formula, 400)

Pretty smart!

COOL!
Gotta go to sleep. See you next time.