In [1]:
import numpy as np
import pandas as pd
from mp_api.client import MPRester
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from matminer.featurizers.structure import DensityFeatures
from matminer.featurizers.structure import StructuralComplexity
from matminer.featurizers.structure import MaximumPackingEfficiency

### Perovskites

In [2]:
with MPRester("EclqWMXn0DY3CAOSELD3xvCjEVhlcEYp") as mpr:
    perovskites = mpr.materials.summary.search(formula=["ABC3"], fields = ["material_id", "structure", "band_gap", "theoretical", "is_stable", "formula_pretty", "composition"])

Retrieving SummaryDoc documents:   0%|          | 0/4555 [00:00<?, ?it/s]

In [3]:
ids = []
for i in range(len(perovskites)):
    single = perovskites[i]
    ids.append(single.material_id)

structures = []
for i in range(len(perovskites)):
    single = perovskites[i]
    structures.append(single.structure)

band_gaps = []
for i in range(len(perovskites)):
    single = perovskites[i]
    band_gaps.append(single.band_gap)

theory = []
for i in range(len(perovskites)):
    single = perovskites[i]
    theory.append(single.theoretical)

stable = []
for i in range(len(perovskites)):
    single = perovskites[i]
    stable.append(single.is_stable)

formula = []
for i in range(len(perovskites)):
    single = perovskites[i]
    formula.append(single.formula_pretty)

composition = []
for i in range(len(perovskites)):
    single = perovskites[i]
    composition.append(single.composition)

perov_df = pd.DataFrame(
    {"material ids": ids,
     "formula": formula,
     "composition": composition,
     "structure": structures,
     "band gaps": band_gaps,
     "theoretical": theory,
     "stable": stable
    })

In [3]:
perov_df = pd.read_csv('perov_df.csv')

In [4]:
perov_df

Unnamed: 0,material ids,structure,band gaps,theoretical,stable
0,mp-1183115,Full Formula (Ac1 Al1 O3)\nReduced Formula: Ac...,4.1024,True,True
1,mp-1183052,Full Formula (Ac1 B1 O3)\nReduced Formula: AcB...,0.8071,True,False
2,mp-866101,Full Formula (Ac1 Cr1 O3)\nReduced Formula: Ac...,2.0031,True,True
3,mp-864606,Full Formula (Ac1 Cu1 O3)\nReduced Formula: Ac...,0.0000,True,True
4,mp-861502,Full Formula (Ac1 Fe1 O3)\nReduced Formula: Ac...,0.9888,True,True
...,...,...,...,...,...
4550,mp-20852,Full Formula (Nd4 Mn4 O12)\nReduced Formula: N...,1.8980,False,True
4551,mp-1079171,Full Formula (Nd2 Ni2 Ge6)\nReduced Formula: N...,0.0000,False,True
4552,mp-22106,Full Formula (Nd4 Ni4 O12)\nReduced Formula: N...,0.0000,False,True
4553,mp-571167,Full Formula (Nd12 Ni12 Sb36)\nReduced Formula...,0.0000,False,True


In [4]:
real_perovs = perov_df[perov_df["theoretical"]==False]

In [5]:
real_perovs["stable"] = real_perovs["stable"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_perovs["stable"] = real_perovs["stable"].astype(int)


In [6]:
perovs_feat = real_perovs

In [7]:
densityf = DensityFeatures()
strcomp = StructuralComplexity()
mpe = MaximumPackingEfficiency()

In [8]:
perovs_feat = densityf.featurize_dataframe(perovs_feat, "structure")

DensityFeatures:   0%|          | 0/1759 [00:00<?, ?it/s]

In [9]:
perovs_feat = strcomp.featurize_dataframe(perovs_feat, "structure")

StructuralComplexity:   0%|          | 0/1759 [00:00<?, ?it/s]

In [10]:
perovs_feat = mpe.featurize_dataframe(perovs_feat, "structure")

MaximumPackingEfficiency:   0%|          | 0/1759 [00:00<?, ?it/s]

In [11]:
perovs_feat.to_csv('perovs_feat.csv')

In [5]:
perovs_feat =  pd.read_csv('perovs_feat.csv')

In [3]:
y = perovs_feat["stable"].values
X = perovs_feat.drop(["Unnamed: 0","material ids", "formula", "composition", "structure", "band gaps", "theoretical", "stable"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

stable_classifier = RandomForestClassifier(random_state=0)
stable_classifier.fit(X_train, y_train)

In [7]:
perovs_feat

Unnamed: 0.1,Unnamed: 0,material ids,formula,composition,structure,band gaps,theoretical,stable,density,vpa,packing fraction,structural complexity per atom,structural complexity per cell,max packing efficiency
0,14,mp-1105645,Ag3SBr,Ag12 S4 Br4,Full Formula (Ag12 S4 Br4)\nReduced Formula: A...,0.2964,False,0,5.967646,24.240276,0.511803,2.521928,50.438562,0.394708
1,18,mp-22995,Ag3SI,Ag3 S1 I1,Full Formula (Ag3 S1 I1)\nReduced Formula: Ag3...,0.7261,False,0,6.576090,24.371110,0.551101,1.370951,6.854753,0.409055
2,19,mp-558189,Ag3SI,Ag3 S1 I1,Full Formula (Ag3 S1 I1)\nReduced Formula: Ag3...,0.6243,False,0,6.373934,25.144066,0.534159,1.370951,6.854753,0.446920
3,27,mp-558950,AgAsO3,Ag24 As24 O72,Full Formula (Ag24 As24 O72)\nReduced Formula:...,0.4985,False,0,5.320520,14.405829,0.364328,4.906891,588.826871,0.244454
4,33,mp-23548,AgBiO3,Ag2 Bi2 O6,Full Formula (Ag2 Bi2 O6)\nReduced Formula: Ag...,0.0000,False,0,7.836022,15.463008,0.478935,1.370951,13.709506,0.365136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1754,4549,mp-1189476,NdMnO3,Nd4 Mn4 O12,Full Formula (Nd4 Mn4 O12)\nReduced Formula: N...,0.0000,False,0,6.679080,12.290589,0.662787,2.521928,50.438562,0.383291
1755,4550,mp-20852,NdMnO3,Nd4 Mn4 O12,Full Formula (Nd4 Mn4 O12)\nReduced Formula: N...,1.8980,False,1,6.966900,11.782834,0.691349,1.921928,38.438562,0.392994
1756,4551,mp-1079171,NdNiGe3,Nd2 Ni2 Ge6,Full Formula (Nd2 Ni2 Ge6)\nReduced Formula: N...,0.0000,False,1,7.486044,18.670657,0.657412,2.321928,23.219281,0.457677
1757,4552,mp-22106,NdNiO3,Nd4 Ni4 O12,Full Formula (Nd4 Ni4 O12)\nReduced Formula: N...,0.0000,False,1,7.555034,11.030659,0.716950,1.921928,38.438562,0.402676


In [8]:
perovs_feat.drop(["Unnamed: 0","material ids", "formula", "composition", "structure", "band gaps", "theoretical", "stable"], axis=1)

Unnamed: 0,density,vpa,packing fraction,structural complexity per atom,structural complexity per cell,max packing efficiency
0,5.967646,24.240276,0.511803,2.521928,50.438562,0.394708
1,6.576090,24.371110,0.551101,1.370951,6.854753,0.409055
2,6.373934,25.144066,0.534159,1.370951,6.854753,0.446920
3,5.320520,14.405829,0.364328,4.906891,588.826871,0.244454
4,7.836022,15.463008,0.478935,1.370951,13.709506,0.365136
...,...,...,...,...,...,...
1754,6.679080,12.290589,0.662787,2.521928,50.438562,0.383291
1755,6.966900,11.782834,0.691349,1.921928,38.438562,0.392994
1756,7.486044,18.670657,0.657412,2.321928,23.219281,0.457677
1757,7.555034,11.030659,0.716950,1.921928,38.438562,0.402676


In [6]:
from sklearn.model_selection import GridSearchCV
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [None, 3, 5],      # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required to be at a leaf node
}

In [7]:
grid_search = GridSearchCV(estimator=stable_classifier, param_grid=param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)
# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and the best F1 score
best_params = grid_search.best_params_
best_f1_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best F1 Score:", best_f1_score)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Best F1 Score: 0.628827372772919


In [8]:
best_rf_model = grid_search.best_estimator_

In [9]:
training_accuracy = best_rf_model.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy:.1f} %')
testing_accuracy = best_rf_model.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on training data: 98.4 %
Accuracy on testing data: 70.9 %


In [12]:
y_train_pred = best_rf_model.predict(X_train)
y_test_pred = best_rf_model.predict(X_test)
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

In [98]:
print(f"F1 score for the training set is: {f1_score(y_train, y_train_pred)}")
print(f"F1 score for the test set is: {f1_score(y_test, y_test_pred)}")
print(f"Confusion matrix for reference for train:\n {conf_matrix_train}")
print(f"Confusion matrix for reference for test:\n {conf_matrix_test}")

F1 score for the training set is: 0.981549815498155
F1 score for the test set is: 0.6350148367952523
Confusion matrix for reference for train:
 [[767   8]
 [ 12 532]]
Confusion matrix for reference for test:
 [[210  48]
 [ 75 107]]


In [4]:
training_accuracy = stable_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy:.1f} %')
testing_accuracy = stable_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on training data: 100.0 %
Accuracy on testing data: 72.2 %


### Using Regularization

In [20]:
from sklearn.preprocessing import StandardScaler
# Feature scaling (important for regularization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

This method is for Logistic Regression, please ignore.

## Evaluating the classifier

In [5]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [15]:
y_train_pred = cross_val_predict(stable_classifier, X_train, y_train, cv=3)
y_test_pred = cross_val_predict(stable_classifier, X_test, y_test, cv=3)
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

In [6]:
y_train_pred = stable_classifier.predict(X_train)
y_test_pred = stable_classifier.predict(X_test)
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

In [7]:
print(f"F1 score for the training set is: {f1_score(y_train, y_train_pred):.3f}")
print(f"F1 score for the test set is: {f1_score(y_test, y_test_pred):.3f}")
print(f"Confusion matrix for reference for train:\n {conf_matrix_train}")
print(f"Confusion matrix for reference for test:\n {conf_matrix_test}")

F1 score for the training set is: 1.000
F1 score for the test set is: 0.623
Confusion matrix for reference for train:
 [[826   0]
 [  0 581]]
Confusion matrix for reference for test:
 [[173  34]
 [ 64  81]]


## Dumping the model for deployment

In [34]:
from joblib import dump, load

In [42]:
dump(stable_classifier, 'perovskites.joblib')

['perovskites.joblib']

### III-V Semiconductors

In [33]:
with MPRester("EclqWMXn0DY3CAOSELD3xvCjEVhlcEYp") as mpr:
    iii_v = mpr.materials.summary.search(chemsys=["Al-N", "Al-P", "Al-As", "Al-Sb",
                                      "Ga-N", "Ga-P", "Ga-As", "Ga-Sb"
                                      "In-N", "In-P", "In-As", "In-Sb"], 
                              fields = ["material_id", "structure", "band_gap", "theoretical", "is_stable", "formula_pretty", "composition"])

Retrieving SummaryDoc documents:   0%|          | 0/117 [00:00<?, ?it/s]

In [34]:
ids = []
for i in range(len(iii_v)):
    single = iii_v[i]
    ids.append(single.material_id)

structures = []
for i in range(len(iii_v)):
    single = iii_v[i]
    structures.append(single.structure)

band_gaps = []
for i in range(len(iii_v)):
    single = iii_v[i]
    band_gaps.append(single.band_gap)

theory = []
for i in range(len(iii_v)):
    single = iii_v[i]
    theory.append(single.theoretical)

stable = []
for i in range(len(iii_v)):
    single = iii_v[i]
    stable.append(single.is_stable)

formula = []
for i in range(len(iii_v)):
    single = iii_v[i]
    formula.append(single.formula_pretty)

composition = []
for i in range(len(iii_v)):
    single = iii_v[i]
    composition.append(single.composition)

iii_v_df = pd.DataFrame(
    {"material ids": ids,
     "formula": formula,
     "composition": composition,
     "structure": structures,
     "band gaps": band_gaps,
     "theoretical": theory,
     "stable": stable
    })

In [35]:
iii_v_feat = iii_v_df

In [36]:
iii_v_feat = densityf.featurize_dataframe(iii_v_feat, "structure")

DensityFeatures:   0%|          | 0/117 [00:00<?, ?it/s]

In [21]:
iii_v_feat = strcomp.featurize_dataframe(iii_v_feat, "structure")

StructuralComplexity:   0%|          | 0/117 [00:00<?, ?it/s]

In [22]:
iii_v_feat = mpe.featurize_dataframe(iii_v_feat, "structure")

MaximumPackingEfficiency:   0%|          | 0/117 [00:00<?, ?it/s]

In [23]:
iii_v_feat.to_csv('iii_v_feat.csv')

In [9]:
iii_v_feat = pd.read_csv('iii_v_feat.csv')

In [26]:
y = iii_v_feat["stable"].values
X = iii_v_feat.drop(["material ids", "formula", "composition", "structure", "band gaps", "theoretical", "stable"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

stable_classifier = RandomForestClassifier(random_state=42)
stable_classifier.fit(X_train, y_train)

In [27]:
training_accuracy = stable_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy:.1f} %')
testing_accuracy = stable_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on training data: 100.0 %
Accuracy on testing data: 96.7 %


## Evaluating again

In [28]:
y_train_pred = cross_val_predict(stable_classifier, X_train, y_train, cv=3)
y_test_pred = cross_val_predict(stable_classifier, X_test, y_test, cv=3)
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

In [29]:
y_train_pred = stable_classifier.predict(X_train)
y_test_pred = stable_classifier.predict(X_test)
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

In [30]:
print(f"F1 score for the training set is: {f1_score(y_train, y_train_pred)}")
print(f"F1 score for the test set is: {f1_score(y_test, y_test_pred)}")
print(f"Confusion matrix for reference for train:\n {conf_matrix_train}")
print(f"Confusion matrix for reference for test:\n {conf_matrix_test}")

F1 score for the training set is: 1.0
F1 score for the test set is: 0.8
Confusion matrix for reference for train:
 [[80  0]
 [ 0  7]]
Confusion matrix for reference for test:
 [[27  0]
 [ 1  2]]


### II-VI Semiconductors

In [38]:
with MPRester("EclqWMXn0DY3CAOSELD3xvCjEVhlcEYp") as mpr:
    ii_vi = mpr.materials.summary.search(chemsys=["Zn-S", "Zn-Se", "Zn-Te",
                                        "Cd-S", "Cd-Se", "Cd-Te"], 
                               fields = ["material_id", "structure", "band_gap", "theoretical", "is_stable", "formula_pretty", "composition"])

Retrieving SummaryDoc documents:   0%|          | 0/185 [00:00<?, ?it/s]

In [39]:
ids = []
for i in range(len(ii_vi)):
    single = ii_vi[i]
    ids.append(single.material_id)

structures = []
for i in range(len(ii_vi)):
    single = ii_vi[i]
    structures.append(single.structure)

band_gaps = []
for i in range(len(ii_vi)):
    single = ii_vi[i]
    band_gaps.append(single.band_gap)

theory = []
for i in range(len(ii_vi)):
    single = ii_vi[i]
    theory.append(single.theoretical)

stable = []
for i in range(len(ii_vi)):
    single = ii_vi[i]
    stable.append(single.is_stable)

formula = []
for i in range(len(ii_vi)):
    single = ii_vi[i]
    formula.append(single.formula_pretty)

composition = []
for i in range(len(ii_vi)):
    single = ii_vi[i]
    composition.append(single.composition)

ii_vi_df = pd.DataFrame(
    {"material ids": ids,
     "formula": formula,
     "composition": composition,
     "structure": structures,
     "band gaps": band_gaps,
     "theoretical": theory,
     "stable": stable
    })

In [40]:
ii_vi_feat = ii_vi_df

In [41]:
ii_vi_feat = densityf.featurize_dataframe(ii_vi_feat, "structure")

DensityFeatures:   0%|          | 0/185 [00:00<?, ?it/s]

In [42]:
ii_vi_feat = strcomp.featurize_dataframe(ii_vi_feat, "structure")

StructuralComplexity:   0%|          | 0/185 [00:00<?, ?it/s]

In [43]:
ii_vi_feat = mpe.featurize_dataframe(ii_vi_feat, "structure")

MaximumPackingEfficiency:   0%|          | 0/185 [00:00<?, ?it/s]

In [45]:
ii_vi_feat.to_csv("ii_vi_feat.csv")

In [14]:
ii_vi_feat =  pd.read_csv('ii_vi_feat.csv')

In [71]:
y = ii_vi_feat["stable"].values
X = ii_vi_feat.drop(["material ids", "formula", "composition", "structure", "band gaps", "theoretical", "stable"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

stable_classifier = RandomForestClassifier(random_state=42,class_weight='balanced')
stable_classifier.fit(X_train, y_train)

In [72]:
from sklearn.ensemble import BaggingClassifier
bagging_rf = BaggingClassifier(estimator=stable_classifier, random_state=42)
bagging_rf.fit(X_train, y_train)

In [73]:
training_accuracy = stable_classifier.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy:.1f} %')
testing_accuracy = stable_classifier.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on training data: 100.0 %
Accuracy on testing data: 93.6 %


In [74]:
training_accuracy = bagging_rf.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy:.1f} %')
testing_accuracy = bagging_rf.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on training data: 98.6 %
Accuracy on testing data: 93.6 %


## Evaluating

In [33]:
y_train_pred = cross_val_predict(stable_classifier, X_train, y_train, cv=3)
y_test_pred = cross_val_predict(stable_classifier, X_test, y_test, cv=2)
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

In [61]:
y_train_pred = bagging_rf.predict(X_train)
y_test_pred = bagging_rf.predict(X_test)
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

In [62]:
print(f"F1 score for the training set is: {f1_score(y_train, y_train_pred)}")
print(f"F1 score for the test set is: {f1_score(y_test, y_test_pred)}")
print(f"Confusion matrix for reference for train:\n {conf_matrix_train}")
print(f"Confusion matrix for reference for test:\n {conf_matrix_test}")

F1 score for the training set is: 0.8
F1 score for the test set is: 0.0
Confusion matrix for reference for train:
 [[132   0]
 [  2   4]]
Confusion matrix for reference for test:
 [[44  1]
 [ 2  0]]


In [84]:
pd.DataFrame(y).shape

(185, 1)

In [83]:
pd.DataFrame(X).value_counts()

Unnamed: 0  density   vpa        packing fraction  structural complexity per atom  structural complexity per cell  max packing efficiency
0           6.408285  17.826874  0.522927          0.811278                        3.245112                        0.740480                  1
127         4.138627  19.554718  0.370621          2.584963                        15.509775                       0.339764                  1
118         4.134738  19.573113  0.370273          5.700440                        296.422865                      0.339804                  1
119         4.140940  19.543795  0.370828          5.584963                        268.078200                      0.339885                  1
120         4.138446  19.555578  0.370605          3.000000                        24.000000                       0.339778                  1
                                                                                                                                            ..
64  

In [85]:
from sklearn.model_selection import GridSearchCV
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [None, 3, 5],      # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required to be at a leaf node
}

In [86]:
grid_search = GridSearchCV(estimator=stable_classifier, param_grid=param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

In [87]:
# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and the best F1 score
best_params = grid_search.best_params_
best_f1_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best F1 Score:", best_f1_score)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best F1 Score: 0.5


In [88]:
best_rf_model = grid_search.best_estimator_

In [89]:
training_accuracy = best_rf_model.score(X_train, y_train) * 100
print(f'Accuracy on training data: {training_accuracy:.1f} %')
testing_accuracy = best_rf_model.score(X_test, y_test) * 100
print(f'Accuracy on testing data: {testing_accuracy:.1f} %')

Accuracy on training data: 94.9 %
Accuracy on testing data: 95.7 %


In [90]:
y_train_pred = best_rf_model.predict(X_train)
y_test_pred = best_rf_model.predict(X_test)
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

In [91]:
print(f"F1 score for the training set is: {f1_score(y_train, y_train_pred)}")
print(f"F1 score for the test set is: {f1_score(y_test, y_test_pred)}")
print(f"Confusion matrix for reference for train:\n {conf_matrix_train}")
print(f"Confusion matrix for reference for test:\n {conf_matrix_test}")

F1 score for the training set is: 0.631578947368421
F1 score for the test set is: 0.5
Confusion matrix for reference for train:
 [[125   7]
 [  0   6]]
Confusion matrix for reference for test:
 [[44  1]
 [ 1  1]]


In [25]:
print(pd.__version__)

1.5.3


In [27]:
import sklearn
print(sklearn.__version__)

1.4.1.post1


In [2]:
perovs_feat = pd.read_csv("perovs_feat.csv")

In [12]:
%%capture
import re

# Function to check for repetitive elements in a chemical formula
def check_repetitive_elements(formula):
    elements = re.findall('[A-Z][a-z]?', formula)  # Extract elements from the formula
    element_count = {}
    for element in elements:
        if element in element_count:
            element_count[element] += 1
        else:
            element_count[element] = 1

    repetitive_elements = [element for element, count in element_count.items() if count > 1]
    return repetitive_elements

# Function to check if each element is matched with other elements
def check_matching_elements(formula):
    elements = re.findall('[A-Z][a-z]?', formula)  # Extract elements from the formula
    unique_elements = set(elements)
    matched = all(len(element) == 1 for element in unique_elements)
    return matched

# Iterate through each material in the dataframe
for index, row in perovs_feat.iterrows():
    chemical_formula = row['formula']

    # Check for repetitive elements
    repetitive_elements = check_repetitive_elements(chemical_formula)
    if repetitive_elements:
        print(f"Material '{chemical_formula}' has repetitive elements: {', '.join(repetitive_elements)}")

    # Check if each element is matched with other elements
    matched_elements = check_matching_elements(chemical_formula)
    if not matched_elements:
        print(f"Material '{chemical_formula}' has unmatched elements.")

    # You can add further processing or logging here as per your requirement

In [13]:
# Function to check for repetitive elements in a chemical formula
def check_repetitive_elements(formula):
    elements = re.findall('[A-Z][a-z]?', formula)  # Extract elements from the formula
    element_count = {}
    for element in elements:
        if element in element_count:
            element_count[element] += 1
        else:
            element_count[element] = 1

    repetitive_elements = [element for element, count in element_count.items() if count > 1]
    return repetitive_elements

# Function to check if each element is matched with other elements
def check_matching_elements(formula):
    elements = re.findall('[A-Z][a-z]?', formula)  # Extract elements from the formula
    unique_elements = set(elements)
    matched = all(len(element) == 1 for element in unique_elements)
    return matched

# Iterate through each material in the dataframe
for index, row in perovs_feat.iterrows():
    chemical_formula = str(row[0])  # Ensure the value is treated as string

    # Remove numbers from the formula
    chemical_formula = re.sub(r'\d+', '', chemical_formula)

    # Check for repetitive elements
    repetitive_elements = check_repetitive_elements(chemical_formula)
    if repetitive_elements:
        print(f"Formula '{chemical_formula}' has repetitive elements: {', '.join(repetitive_elements)}")

    # Check if each element is matched with other elements
    matched_elements = check_matching_elements(chemical_formula)
    if not matched_elements:
        print(f"Formula '{chemical_formula}' has unmatched elements.")

In [15]:
perovs_feat["formula"].unique()

array(['Ag3SBr', 'Ag3SI', 'AgAsO3', ..., 'NdNiGe3', 'NdNiO3', 'NdNiSb3'],
      dtype=object)

In [19]:
from collections import Counter

# Sample list of chemical formulas
chemical_formulas = [
    "H2O",
    "H2O2",
    "CH4",
    "C6H12O6",
    "NaCl",
    "C6H12O6",
    "CH3OH",
    "Na2SO4"
]

# Function to check for repetitive elements across all formulas
def check_repetitive_elements(formulas):
    all_elements = []
    repetitive_elements = []
    
    # Iterate over each formula
    for formula in formulas:
        # Extract elements from the formula using regular expression
        elements = re.findall('[A-Z][a-z]*', formula)
        all_elements.extend(elements)
    
    # Count occurrences of each element
    element_counts = Counter(all_elements)
    
    # Identify which elements are repeated across different formulas
    for element, count in element_counts.items():
        if count > 1:
            repetitive_elements.append(element)
    
    return repetitive_elements

# Check for repetitive elements across all formulas
repetitive_elements = check_repetitive_elements(chemical_formulas)
print("Repetitive Elements:", repetitive_elements)

Repetitive Elements: ['H', 'O', 'C', 'Na']


In [23]:
from collections import Counter

# Sample list of chemical formulas
chemical_formulas = perovs_feat["formula"].unique()

# Function to check for repetitive elements across all formulas
def check_repetitive_elements(formulas):
    all_elements = []
    repetitive_elements = {}
    
    # Iterate over each formula
    for formula in formulas:
        # Extract elements from the formula using regular expression
        elements = re.findall('[A-Z][a-z]*', formula)
        all_elements.extend(elements)
    
    # Count occurrences of each element
    element_counts = Counter(all_elements)
    
    # Identify which elements are repeated across different formulas
    for element, count in element_counts.items():
        if count > 1:
            repetitive_elements[element] = [formula for formula in formulas if element in formula]
    
    return repetitive_elements

# Check for repetitive elements across all formulas
repetitive_elements = check_repetitive_elements(chemical_formulas)
print("Repetitive Elements and their matching formulas:")
for element, formulas in repetitive_elements.items():
    print(f"{element}: {formulas}")
    print(f'{element} has {len(formulas)} repetitions')
    print('\n\n')

Repetitive Elements and their matching formulas:
Ag: ['Ag3SBr', 'Ag3SI', 'AgAsO3', 'AgBiO3', 'AgClO3', 'AgIO3', 'AgNO3', 'AgPO3', 'AgPS3', 'AgPd3Se', 'AgSbO3', 'AgSe3I', 'AgTe3I', 'AgTeO3', 'CsAgCl3', 'CsAgF3', 'CuAgF3', 'KAgF3', 'VAgO3', 'ZnAgF3', 'RbAgF3', 'TaAgO3', 'TaAgS3', 'Mn3AgN', 'NbAgO3']
Ag has 25 repetitions



S: ['Ag3SBr', 'Ag3SI', 'AgPS3', 'AgPd3Se', 'AgSbO3', 'AgSe3I', 'AlSBr3', 'AlSiP3', 'Ba3SbN', 'Ba3SnO', 'BaBiSe3', 'BaCaSn3', 'BaGeS3', 'BaHfS3', 'BaNiSn3', 'BaPS3', 'BaPSe3', 'BaSbTe3', 'BaSeO3', 'BaSi3Pd', 'BaSi3Pt', 'BaSiO3', 'BaSn3Au', 'BaSn3Pd', 'BaSn3Pt', 'BaSnO3', 'BaSnS3', 'BaTaS3', 'BaTaSe3', 'BaTeS3', 'BaTiS3', 'BaTiSe3', 'BaUS3', 'BaVS3', 'BaVSe3', 'BaZrS3', 'BaZrSe3', 'CS3N', 'Ca3SbN', 'Ca3SiO', 'Ca3SnN', 'Ca3SnO', 'CaHfS3', 'CaPS3', 'CaPSe3', 'CaSeO3', 'CaSi3Ir', 'CaSi3Pt', 'CaSiO3', 'CaSnO3', 'CaZrS3', 'CdPS3', 'CdPSe3', 'CdSO3', 'CdSeO3', 'CdSiO3', 'CdSnO3', 'Ce3SnC', 'CeBS3', 'CeCoSb3', 'CeCrS3', 'CeCrSe3', 'CeDyS3', 'CeErS3', 'CeLuS3', 'CeNiSb3', 'CeSb

In [21]:
len(repetitive_elements)

80

In [26]:
# Function to find elements in A, B, and C
def find_elements(formulas):
    elements_A = set()
    elements_B = set()
    elements_C = set()
    
    # Iterate over each formula
    for formula in formulas:
        # Extract elements from the formula using regular expression
        elements = re.findall('[A-Z][a-z]*', formula)
        if len(elements) == 3:
            elements_A.add(elements[0])
            elements_B.add(elements[1])
            elements_C.add(elements[2])
    
    print("Elements in A:")
    print(elements_A)
    print("Elements in B:")
    print(elements_B)
    print("Elements in C:")
    print(elements_C)

# Find and print elements in A, B, and C
find_elements(chemical_formulas)

Elements in A:
{'Y', 'Zr', 'Pa', 'Ho', 'Cd', 'Li', 'Ce', 'Pd', 'Tl', 'Pb', 'Ga', 'Mn', 'Be', 'Ge', 'Th', 'In', 'Co', 'Au', 'P', 'Ba', 'Br', 'Tc', 'H', 'Eu', 'Hg', 'Sm', 'Mg', 'Pu', 'Te', 'W', 'Ca', 'Zn', 'Ni', 'Nb', 'K', 'Pt', 'Al', 'Gd', 'U', 'Sc', 'Bi', 'As', 'Cs', 'Os', 'Sn', 'Cu', 'Nd', 'Ta', 'Er', 'V', 'Cr', 'Re', 'Tb', 'Si', 'C', 'Tm', 'Fe', 'N', 'Sr', 'I', 'Na', 'Pr', 'Dy', 'Ti', 'Rb', 'Sb', 'Mo', 'La', 'Lu', 'Ag', 'Hf', 'S'}
Elements in B:
{'Pa', 'Zr', 'Y', 'Ho', 'Cd', 'Ce', 'Ru', 'Pd', 'Tl', 'Pb', 'Mn', 'Ga', 'Be', 'Ge', 'Th', 'In', 'Co', 'Au', 'P', 'Br', 'O', 'Tc', 'H', 'Eu', 'Hg', 'Sm', 'Pu', 'Te', 'W', 'Ca', 'Nb', 'Ni', 'Zn', 'Pt', 'Li', 'Al', 'Sc', 'Rh', 'U', 'Bi', 'As', 'Ta', 'Sn', 'Os', 'Cu', 'Hf', 'Tb', 'V', 'Cr', 'Er', 'Re', 'Si', 'Np', 'C', 'Tm', 'Fe', 'S', 'Cl', 'N', 'Sr', 'I', 'Na', 'Ir', 'Pr', 'Dy', 'Ti', 'Sb', 'Mo', 'Mg', 'Lu', 'Ag', 'Se', 'B'}
Elements in C:
{'Ru', 'Pd', 'Pb', 'Ge', 'In', 'F', 'Au', 'P', 'Br', 'H', 'Te', 'Ni', 'Pt', 'O', 'Al', 'Rh', 'As', 'Sn', '

In [60]:
# Function to find elements in A, B, and C and print their intersections
def find_elements(formulas):
    elements_A = set()
    elements_B = set()
    elements_C = set()
    
    # Iterate over each formula
    for formula in formulas:
        # Extract elements from the formula using regular expression
        elements = re.findall('[A-Z][a-z]*', formula)
        if len(elements) == 3:
            elements_A.add(elements[0])
            elements_B.add(elements[1])
            elements_C.add(elements[2])
    
    print("Elements in A:")
    print(elements_A)
    print(len(elements_A))
    print("Elements in B:")
    print(elements_B)
    print(len(elements_B))
    print("Elements in C:")
    print(elements_C)
    print(len(elements_C))
    
    print("\nMatching elements:")
    print(f"{len(elements_A.intersection(elements_B))} Elements in A that match with B:")
    print(elements_A.intersection(elements_B))
    print(f"{len(elements_A.intersection(elements_C))} Elements in A that match with C:")
    print(elements_A.intersection(elements_C))
    print(f"{len(elements_B.intersection(elements_C))} Elements in B that match with C:")
    print(elements_B.intersection(elements_C))

    print("\nElements that don't match:")
    print(f"{len(elements_A.difference(elements_B))} Elements in A that don't match with B:")
    print(elements_A.difference(elements_B))
    print(f"{len(elements_A.difference(elements_C))} Elements in A that don't match with C:")
    print(elements_A.difference(elements_C))
    print(f"{len(elements_B.difference(elements_C))} Elements in B that don't match with C:")
    print(elements_B.difference(elements_C))

    print(f"{len(elements_C.difference(elements_A))} Elements in C that don't match with A:")
    print(elements_C.difference(elements_A))
    print(f"{len(elements_C.difference(elements_B))} Elements in C that don't match with B:")
    print(elements_C.difference(elements_B))

# Find and print elements in A, B, and C and their intersections
find_elements(chemical_formulas)

Elements in A:
{'Y', 'Zr', 'Pa', 'Ho', 'Cd', 'Li', 'Ce', 'Pd', 'Tl', 'Pb', 'Ga', 'Mn', 'Be', 'Ge', 'Th', 'In', 'Co', 'Au', 'P', 'Ba', 'Br', 'Tc', 'H', 'Eu', 'Hg', 'Sm', 'Mg', 'Pu', 'Te', 'W', 'Ca', 'Zn', 'Ni', 'Nb', 'K', 'Pt', 'Al', 'Gd', 'U', 'Sc', 'Bi', 'As', 'Cs', 'Os', 'Sn', 'Cu', 'Nd', 'Ta', 'Er', 'V', 'Cr', 'Re', 'Tb', 'Si', 'C', 'Tm', 'Fe', 'N', 'Sr', 'I', 'Na', 'Pr', 'Dy', 'Ti', 'Rb', 'Sb', 'Mo', 'La', 'Lu', 'Ag', 'Hf', 'S'}
72
Elements in B:
{'Pa', 'Zr', 'Y', 'Ho', 'Cd', 'Ce', 'Ru', 'Pd', 'Tl', 'Pb', 'Mn', 'Ga', 'Be', 'Ge', 'Th', 'In', 'Co', 'Au', 'P', 'Br', 'O', 'Tc', 'H', 'Eu', 'Hg', 'Sm', 'Pu', 'Te', 'W', 'Ca', 'Nb', 'Ni', 'Zn', 'Pt', 'Li', 'Al', 'Sc', 'Rh', 'U', 'Bi', 'As', 'Ta', 'Sn', 'Os', 'Cu', 'Hf', 'Tb', 'V', 'Cr', 'Er', 'Re', 'Si', 'Np', 'C', 'Tm', 'Fe', 'S', 'Cl', 'N', 'Sr', 'I', 'Na', 'Ir', 'Pr', 'Dy', 'Ti', 'Sb', 'Mo', 'Mg', 'Lu', 'Ag', 'Se', 'B'}
73
Elements in C:
{'Ru', 'Pd', 'Pb', 'Ge', 'In', 'F', 'Au', 'P', 'Br', 'H', 'Te', 'Ni', 'Pt', 'O', 'Al', 'Rh', 'As', '

In [52]:
chemical_formulas

array(['Ag3SBr', 'Ag3SI', 'AgAsO3', ..., 'NdNiGe3', 'NdNiO3', 'NdNiSb3'],
      dtype=object)

In [61]:
import re
def remove_numbers(input_string):
    return re.sub(r'\d+', '', input_string)

a = (input("Input 1st element:")).capitalize()
b = (input("Input 2nd element:")).capitalize()
c = (input("Input 3rd element:")).capitalize()
input_combination = a+b+c

def check_element_exist(input_combination):
    for formula in perovs_feat['formula']:
        if remove_numbers(formula) == input_combination:
            return formula
    return False
status = check_element_exist(input_combination)
if status == False:
    print("Combination not found.")
elif status != False:
    print(status)

Input 1st element: ag
Input 2nd element: s
Input 3rd element: br


Ag3SBr
