In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import xgboost as xgb


Combine the features and the labels

In [2]:
# Attach the "checker" label to the feature table.
#   checker == 1 -> this is a good structure
#   checker == 0 -> this structure was removed

# Label table; the three energy columns are discarded right after loading.
df = pd.read_csv('Bare_MOF_PSED_Labels_infeasible_Oxo.csv')
df = df.drop(columns=['E_MOF (eV)', 'E_MOF_O (eV)', 'E_MOF_OH (eV)'])

df_features = pd.read_csv('Features_RACS.csv')

# Inner join on (MOF Name, Metal_index): only rows whose key pair occurs in
# both tables are kept, and the 'checker' column is brought in from the labels.
merge_keys = ['MOF Name', 'Metal_index']
merged_df = df_features.merge(df[merge_keys + ['checker']], on=merge_keys, how='inner')

# Persist the merged table. to_csv also writes the row index as an extra
# leading column, so positional column offsets in the saved file are shifted
# by one relative to merged_df.
merged_df.to_csv('merged_df_out.csv')

XGBoost

In [8]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Baseline XGBoost classifier: mean-impute the feature columns, fit on a
# train split, report hold-out accuracy and 5-fold stratified CV accuracy.

# Reload the merged table from disk.
df = pd.read_csv('merged_df_out.csv')

# Feature columns are selected by position (column 5 up to, but excluding, the
# last two columns); the label is the last column. Defined once so the imputer
# and X always refer to the same columns.
feature_cols = df.columns[5:-2]

# Replace missing feature values by the column mean.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so held-out rows contribute to the imputed means. Consider fitting it on the
# training rows only (e.g. inside a sklearn Pipeline). Left unchanged here
# because later cells reuse X / X_train as produced below.
imputer = SimpleImputer(strategy='mean')
df[feature_cols] = imputer.fit_transform(df[feature_cols].values)

X = df[feature_cols]
y = df.iloc[:, -1]

# Prepare data.
# NOTE(review): test_size=0.8 holds out 80 % of the rows and trains on only
# 20 %. Confirm this is intended and not a swapped 0.2.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

# Initialize and train classifier.
# random_state is the documented scikit-learn-API seed parameter; the previous
# `seed=` was forwarded through **kwargs, which the xgboost sklearn wrapper
# documents as unsupported with scikit-learn (this estimator is cloned by
# cross_val_score below). The seed value itself is unchanged.
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    subsample=1.0,
)
xgb_classifier.fit(X_train, y_train)

# Evaluate classifier on the held-out rows.
y_pred = xgb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

# Cross-validation over the full (X, y): 5 shuffled, stratified folds.
# The reported spread is two standard deviations of the fold accuracies.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(xgb_classifier, X, y, cv=kfold, scoring='accuracy')
print(f"CV Accuracy: {cv_results.mean():.2f} (+/- {cv_results.std() * 2:.2f})")

Test Accuracy: 0.80
CV Accuracy: 0.81 (+/- 0.03)


In [9]:
# Exhaustive hyper-parameter search for XGBoost on the current training split.
xgb_model = xgb.XGBClassifier()

# Search space: 3 values for each of 4 parameters -> 81 combinations.
params = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 0.9, 1.0],
)

# 3-fold cross-validated accuracy is the selection criterion.
grid_search = GridSearchCV(xgb_model, params, scoring='accuracy', cv=3, verbose=1)

# fit() returns the fitted GridSearchCV object itself, so best_model is the
# search object (exposing best_params_ / best_score_), not a bare classifier.
grid_search.fit(X_train, y_train)
best_model = grid_search

# Report the winning combination and its mean CV accuracy.
print(f"Best parameters found: {best_model.best_params_}")
print(f"Best accuracy found: {best_model.best_score_:.2f}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters found: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best accuracy found: 0.79


Random Forest Model

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Random Forest baseline on a fresh split of X / y.
# NOTE(review): test_size=0.8 trains on 20 % of the rows and evaluates on the
# remaining 80 % -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.8
)

# 200 trees, seeded for reproducibility; fit() returns the estimator itself.
rf_classifier = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)

# Hold-out accuracy.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.81


Random Forest Grid Search

In [11]:
# Hyper-parameter grid search for the Random Forest on the current training split.

# Define the model. The forest is seeded (same value as rf_classifier) so the
# grid-search result is reproducible; an unseeded forest can yield a different
# "best" combination on every run.
RF_model = RandomForestClassifier(random_state=42)

# Define parameters to search: 3 depths x 3 forest sizes -> 9 combinations.
params = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
}

# Set up the grid search; 3-fold cross-validated accuracy is the criterion.
grid_search = GridSearchCV(estimator=RF_model, param_grid=params, scoring='accuracy', cv=3, verbose=1)

# Fit grid search. fit() returns the fitted GridSearchCV object, so best_model
# is the search object itself (exposing best_params_ / best_score_).
best_model = grid_search.fit(X_train, y_train)

# Print best parameters and best score
print(f"Best parameters found: {best_model.best_params_}")
print(f"Best accuracy found: {best_model.best_score_:.2f}")

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best parameters found: {'max_depth': 3, 'n_estimators': 100}
Best accuracy found: 0.80
