## Installing packages

I would make a new environment, open JupyterLab, then open a terminal to install the necessary packages. <br>
You need to run: <br>
pip install mp-api <br>
pip install matminer <br>
pip install pymatgen==2023.12.18 <br>
mp-api will automatically install pymatgen, but the most recent version is not compatible with matminer so you need to downgrade it to an older version (like 18/12/2023),

## Importing basic packages and Material Project API

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from mp_api.client import MPRester

## Filling dataframe with perovskite data

In [3]:
with MPRester("EclqWMXn0DY3CAOSELD3xvCjEVhlcEYp") as mpr:
    perovskites = mpr.materials.summary.search(formula=["ABC3"], fields = ["material_id", "structure", "band_gap", "theoretical", "is_stable"])
# mpr.materials.summary works similar to mpr.summary

Retrieving SummaryDoc documents:   0%|          | 0/4555 [00:00<?, ?it/s]

In [3]:
with MPRester("EclqWMXn0DY3CAOSELD3xvCjEVhlcEYp") as mpr:
    perovskites = mpr.summary.search(formula=["ABC3"], fields = ["material_id", "structure", "band_gap", "theoretical", "is_stable"])

  perovskites = mpr.summary.search(formula=["ABC3"], fields = ["material_id", "structure", "band_gap", "theoretical", "is_stable"])


Retrieving SummaryDoc documents:   0%|          | 0/4555 [00:00<?, ?it/s]

In [4]:
ids = []
for i in range(4555):
    single = perovskites[i]
    ids.append(single.material_id)

In [5]:
structures = []
for i in range(4555):
    single = perovskites[i]
    structures.append(single.structure)

In [6]:
band_gaps = []
for i in range(4555):
    single = perovskites[i]
    band_gaps.append(single.band_gap)

In [7]:
theory = []
for i in range(4555):
    single = perovskites[i]
    theory.append(single.theoretical)

In [8]:
stable = []
for i in range(4555):
    single = perovskites[i]
    stable.append(single.is_stable)

In [9]:
perov_df = pd.DataFrame(
    {"material ids": ids,
     "structure": structures,
     "band gaps": band_gaps,
     "theoretical": theory,
     "stable": stable
    })

In [10]:
perov_df

Unnamed: 0,material ids,structure,band gaps,theoretical,stable
0,mp-1183115,"[[0. 0. 0.] Ac, [1.92931693 1.92931693 1.92931...",4.1024,True,True
1,mp-1183052,"[[0. 0. 0.] Ac, [1.860834 1.860834 1.860834] B...",0.8071,True,False
2,mp-866101,"[[0. 0. 0.] Ac, [1.97214345 1.97215113 1.97213...",2.0031,True,True
3,mp-864606,"[[0. 0. 0.] Ac, [1.9566578 1.9566578 1.9566578...",0.0000,True,True
4,mp-861502,"[[0. 0. 0.] Ac, [1.97678086 1.9767782 1.97678...",0.9888,True,True
...,...,...,...,...,...
4550,mp-20325,"[[2.11778054 2.11778054 0. ] Ti, [2.117...",0.0000,False,True
4551,mp-21233,"[[0. 2.09394211 2.09394211] Ti, [2.093...",0.0000,False,False
4552,mp-760414,[[1.14486018e+00 3.98291320e+00 1.08399671e-04...,0.0000,True,False
4553,mp-644483,"[[2.08307401 0. 2.08249963] Ti, [0. ...",0.0000,False,True


In [10]:
perov_df

Unnamed: 0,material ids,structure,band gaps,theoretical,stable
0,mp-1183115,"[[0. 0. 0.] Ac, [1.92931693 1.92931693 1.92931...",4.1024,True,True
1,mp-1183052,"[[0. 0. 0.] Ac, [1.860834 1.860834 1.860834] B...",0.8071,True,False
2,mp-866101,"[[0. 0. 0.] Ac, [1.97214345 1.97215113 1.97213...",2.0031,True,True
3,mp-864606,"[[0. 0. 0.] Ac, [1.9566578 1.9566578 1.9566578...",0.0000,True,True
4,mp-861502,"[[0. 0. 0.] Ac, [1.97678086 1.9767782 1.97678...",0.9888,True,True
...,...,...,...,...,...
4550,mp-20852,"[[2.78317726 2.5080982 1.93598463] Nd, [5.375...",1.8980,False,True
4551,mp-1079171,"[[ 2.06204844 -3.65368509 2.08529374] Nd, [2....",0.0000,False,True
4552,mp-22106,"[[0.04398497 5.1712759 5.70338325] Nd, [2.647...",0.0000,False,True
4553,mp-571167,"[[ 2.76233118 8.83858657 13.75898303] Nd, [ 6...",0.0000,False,True


In [11]:
perov_df.to_csv('perov_df.csv', index=False)

## Manipulating data - removing theoretical materials, converting "stable" into 1s and 0s

In [13]:
real_perovs = perov_df[perov_df["theoretical"]==False]

In [14]:
real_perovs

Unnamed: 0,material ids,structure,band gaps,theoretical,stable
14,mp-1105645,"[[7.48401876 2.10527392 7.43781975] Ag, [2.313...",0.2964,False,False
18,mp-22995,"[[ 1.94462322 -1.74755883 1.89344403] Ag, [1....",0.7261,False,False
19,mp-558189,"[[2.504793 2.504793 0. ] Ag, [2.504793 0....",0.6243,False,False
27,mp-558950,"[[ 3.68937808 8.09573714 12.24244424] Ag, [0....",0.4985,False,False
33,mp-23548,[[ 2.01581009e+00 -2.21198671e-17 6.28702944e...,0.0000,False,False
...,...,...,...,...,...
4549,mp-1189476,"[[2.66383663 3.14619771 1.96305558] Nd, [5.413...",0.0000,False,False
4550,mp-20852,"[[2.78317726 2.5080982 1.93598463] Nd, [5.375...",1.8980,False,True
4551,mp-1079171,"[[ 2.06204844 -3.65368509 2.08529374] Nd, [2....",0.0000,False,True
4552,mp-22106,"[[0.04398497 5.1712759 5.70338325] Nd, [2.647...",0.0000,False,True


In [15]:
real_perovs["stable"] = real_perovs["stable"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_perovs["stable"] = real_perovs["stable"].astype(int)


In [16]:
real_perovs

Unnamed: 0,material ids,structure,band gaps,theoretical,stable
14,mp-1105645,"[[7.48401876 2.10527392 7.43781975] Ag, [2.313...",0.2964,False,0
18,mp-22995,"[[ 1.94462322 -1.74755883 1.89344403] Ag, [1....",0.7261,False,0
19,mp-558189,"[[2.504793 2.504793 0. ] Ag, [2.504793 0....",0.6243,False,0
27,mp-558950,"[[ 3.68937808 8.09573714 12.24244424] Ag, [0....",0.4985,False,0
33,mp-23548,[[ 2.01581009e+00 -2.21198671e-17 6.28702944e...,0.0000,False,0
...,...,...,...,...,...
4549,mp-1189476,"[[2.66383663 3.14619771 1.96305558] Nd, [5.413...",0.0000,False,0
4550,mp-20852,"[[2.78317726 2.5080982 1.93598463] Nd, [5.375...",1.8980,False,1
4551,mp-1079171,"[[ 2.06204844 -3.65368509 2.08529374] Nd, [2....",0.0000,False,1
4552,mp-22106,"[[0.04398497 5.1712759 5.70338325] Nd, [2.647...",0.0000,False,1


## Featurizing data using packages from Matminer

I chose a completely random feature that can be generated from structure, will be experimenting with lots of different ones. <br>
Can be found at: https://hackingmaterials.lbl.gov/matminer/featurizer_summary.html

In [17]:
from matminer.featurizers.structure import XRDPowderPattern
xrd = XRDPowderPattern()

In [18]:
real_perovs_xrd = xrd.featurize_dataframe(real_perovs, "structure")

XRDPowderPattern:   0%|          | 0/1759 [00:00<?, ?it/s]

In [12]:
real_perovs_xrd = pd.read_csv('real_perovs_xrd.csv')

In [13]:
real_perovs_xrd

Unnamed: 0,material ids,structure,band gaps,theoretical,stable,xrd_0,xrd_1,xrd_2,xrd_3,xrd_4,...,xrd_118,xrd_119,xrd_120,xrd_121,xrd_122,xrd_123,xrd_124,xrd_125,xrd_126,xrd_127
0,mp-1105645,Full Formula (Ag12 S4 Br4)\nReduced Formula: A...,0.2964,False,0,2.478322e-26,7.762754e-23,1.254368e-19,1.045647e-16,4.496728e-14,...,5.480407e-07,6.226729e-08,0.000001,0.000021,0.000183,0.000939,0.002913,0.005379,0.005676,3.291362e-03
1,mp-22995,Full Formula (Ag3 S1 I1)\nReduced Formula: Ag3...,0.7261,False,0,2.061098e-37,1.139955e-33,3.841217e-30,7.885731e-27,9.862955e-24,...,4.197247e-03,1.812563e-03,0.000485,0.000082,0.000037,0.000210,0.000945,0.002605,0.004397,4.545096e-03
2,mp-558189,Full Formula (Ag3 S1 I1)\nReduced Formula: Ag3...,0.6243,False,0,3.359836e-86,9.774339e-80,1.554275e-73,1.350951e-67,6.418341e-62,...,4.348214e-04,2.016603e-03,0.005112,0.007084,0.005365,0.002221,0.000503,0.000062,0.000004,1.553321e-07
3,mp-558950,Full Formula (Ag24 As24 O72)\nReduced Formula:...,0.4985,False,0,2.928741e-23,5.345314e-20,4.845123e-17,2.181101e-14,4.876252e-12,...,2.167607e-03,2.228740e-03,0.002280,0.002234,0.002503,0.002920,0.002826,0.002212,0.001436,7.567850e-04
4,mp-23548,Full Formula (Ag2 Bi2 O6)\nReduced Formula: Ag...,0.0000,False,0,4.621298e-32,1.246952e-28,2.058293e-25,2.078434e-22,1.283916e-19,...,3.968101e-03,2.866507e-03,0.002318,0.003033,0.004262,0.004524,0.003224,0.001471,0.000420,7.436288e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1754,mp-1189476,Full Formula (Nd4 Mn4 O12)\nReduced Formula: N...,0.0000,False,0,4.024008e-50,1.523183e-45,3.326686e-41,4.192162e-37,3.048112e-33,...,2.720574e-03,2.849249e-03,0.003757,0.004492,0.004112,0.003207,0.002657,0.002230,0.001485,6.783342e-04
1755,mp-20852,Full Formula (Nd4 Mn4 O12)\nReduced Formula: N...,1.8980,False,1,3.017360e-60,2.631816e-55,1.379515e-50,4.345677e-46,8.227486e-42,...,3.807744e-03,3.928954e-03,0.003471,0.002783,0.002942,0.003726,0.004412,0.004628,0.003932,2.427809e-03
1756,mp-1079171,Full Formula (Nd2 Ni2 Ge6)\nReduced Formula: N...,0.0000,False,1,7.766112e-12,5.162759e-10,1.982200e-08,4.395421e-07,5.629125e-06,...,8.509832e-04,6.608937e-04,0.000714,0.001633,0.003422,0.004690,0.003991,0.002062,0.000636,1.155609e-04
1757,mp-22106,Full Formula (Nd4 Ni4 O12)\nReduced Formula: N...,0.0000,False,1,1.314500e-64,2.031345e-59,1.861787e-54,1.012046e-49,3.262827e-45,...,1.041261e-02,1.126488e-02,0.007685,0.003383,0.001215,0.001344,0.003030,0.004591,0.004151,2.227070e-03


In [20]:
real_perovs_xrd.to_csv('real_perovs_xrd.csv', index=False)

In [14]:
y = real_perovs_xrd["stable"].values
X = real_perovs_xrd.drop(["material ids", "structure", "band gaps", "theoretical", "stable"], axis=1)

## Making and testing a basic classifying model

In [15]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1319, 128), (440, 128), (1319,), (440,))

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
stable_classifier = RandomForestClassifier(random_state=42)
stable_classifier.fit(X_train, y_train)

In [22]:
training_accuracy = stable_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy:.1f} %')

Accuracy on training data: 100.0 %


### might be overfitting.

In [23]:
testing_accuracy = stable_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on testing data: 67.3 %


# What next?

Try different features and different models for predicting stability in these materials first. <br>
Can move on to stability in other materials or other properties of perovskites once initial models are optimised. <br>

# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [26]:
predictions = log_reg.predict(X_test)

In [27]:
training_accuracy_log_reg = log_reg.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy_log_reg:.1f} %')

Accuracy on training data: 60.0 %


In [28]:
testing_accuracy = log_reg.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on testing data: 59.5 %


# Decision Tree Classifier

In [29]:
from sklearn.tree import DecisionTreeClassifier

In [30]:
tree_classifier = DecisionTreeClassifier(max_depth=3)
tree_classifier.fit(X_train, y_train)

In [31]:
predictions_dec_tree = tree_classifier.predict(X_test)

In [32]:
training_accuracy_dec_tree = tree_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy_dec_tree:.1f} %')

Accuracy on training data: 68.8 %


In [33]:
testing_accuracy_dec_tree = tree_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on testing data: 59.5 %


# Support Vector Machine (SVM)

In [34]:
from sklearn.svm import SVC

In [35]:
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train, y_train)

In [36]:
predictions_svm = svm_classifier.predict(X_test)

In [37]:
training_accuracy_svm = svm_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy_svm:.1f} %')

Accuracy on training data: 58.8 %


In [38]:
testing_accuracy_svm = svm_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy_svm:.1f} %')

Accuracy on testing data: 58.6 %


# K nearest neighbours

In [39]:
from sklearn.neighbors import KNeighborsClassifier

In [40]:
k = 5
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)

In [41]:
predictions_knn = knn_classifier.predict(X_test)

In [42]:
training_accuracy_knn = knn_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy_knn:.1f} %')

Accuracy on training data: 77.9 %


In [43]:
testing_accuracy_knn = knn_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy_knn:.1f} %')

Accuracy on testing data: 60.9 %


# Gradient Boosting Machines

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

In [45]:
gbm_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm_classifier.fit(X_train, y_train)

In [46]:
predictions_gbm = gbm_classifier.predict(X_test)

In [47]:
training_accuracy_gbm = gbm_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy_gbm:.1f} %')

Accuracy on training data: 91.6 %


In [48]:
testing_accuracy_gbm = gbm_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy_gbm:.1f} %')

Accuracy on testing data: 64.8 %


# AdaBoost

In [49]:
from sklearn.ensemble import AdaBoostClassifier

In [50]:
ada_classifier = AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)
ada_classifier.fit(X_train, y_train)



In [51]:
predictions_ada = ada_classifier.predict(X_test)

In [52]:
training_accuracy_ada = ada_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy_ada:.1f} %')

Accuracy on training data: 83.9 %


In [53]:
testing_accuracy_ada = ada_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy_ada:.1f} %')

Accuracy on testing data: 60.0 %


# XGBoost

In [54]:
from xgboost import XGBClassifier

In [55]:
xgb_classifier = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
xgb_classifier.fit(X_train, y_train)

In [56]:
predictions_xgb = xgb_classifier.predict(X_test)

In [57]:
training_accuracy_xgb = xgb_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy_xgb:.1f} %')

Accuracy on training data: 90.3 %


In [58]:
testing_accuracy_xgb = xgb_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy_xgb:.1f} %')

Accuracy on testing data: 63.2 %


# Deep Learning

In [60]:
import tensorflow as tf
from tensorflow import keras




## F1 score

In [60]:
from sklearn.metrics import f1_score

In [64]:
predictions_random_forest = stable_classifier.predict(X_test)

In [66]:
from sklearn.model_selection import cross_val_predict

In [68]:
y_train_pred = cross_val_predict(stable_classifier, X_train, y_train, cv=3)

In [69]:
f1_score(y_train, y_train_pred)

0.5240761478163494

In [70]:
y_test_pred = cross_val_predict(stable_classifier, X_test, y_test, cv=3)

In [71]:
f1_score(y_test, y_test_pred)

0.44368600682593856

In [72]:
from sklearn.metrics import confusion_matrix

In [73]:
conf_matrix = confusion_matrix(y_test, y_test_pred)

In [74]:
print(conf_matrix)

[[212  46]
 [117  65]]
