-----------------------------------------------------------------------------

<h1> MOFs CO2 working capacity prediction with a Keras neural network </h1>

------------------------------------------------------------

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow.keras.backend as KBack
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

<h2>Loading the dataset</h2>

In [None]:
# Read the training table and preview its first three rows.
df = pd.read_csv('Datasets/cif_xyz_nonfg_train.csv')
df.head(3)

Separate the label column

In [None]:
# Keep the regression target aside, then remove it (and the MOF name) from the
# feature frame.
df_label = df['CO2_working_capacity']
df = df.drop(columns=['CO2_working_capacity', 'MOFname'])
print(df.shape,' : ',df_label.shape)
df.head(5)

Normalizing functional groups

In [None]:
def check_in(pattern : str, loop : "list | np.ndarray | pd.Series | None" = None):
    """Flag which entries contain `pattern` as one of their '-'-separated tokens.

    Parameters
    ----------
    pattern : str
        Token to look for (exact match against the pieces of `str(x).split('-')`).
    loop : iterable, optional
        Values to scan. When omitted, `df.functional_groups` is looked up at
        call time (not at definition time), so the function can be defined
        before `df` exists and never holds on to a stale column.

    Returns
    -------
    list of int
        1 where `pattern` is one of the tokens of the entry, otherwise 0.
    """
    if loop is None:
        loop = df.functional_groups
    # str(x) lets non-string entries (e.g. NaN) pass through as a 0 flag.
    return [int(pattern in str(x).split('-')) for x in loop]

def count_in(pattern : str, loop : "list | np.ndarray | pd.Series | None" = None):
    """Count the non-overlapping occurrences of `pattern` inside each entry.

    Parameters
    ----------
    pattern : str
        Substring to count in `str(x)`.
    loop : iterable, optional
        Values to scan. When omitted, `df.functional_groups` is looked up at
        call time (not at definition time).

    Returns
    -------
    list of int
        One substring count per entry of `loop`.
    """
    if loop is None:
        loop = df.functional_groups
    # NOTE(review): this is a plain substring count, so 'C' is also counted
    # inside tokens such as 'Cl' - confirm that is the intended feature.
    return [str(x).count(pattern) for x in loop]

# Ordered tuples instead of sets: iterating a set of strings can yield a
# different order in every kernel session (string hashing is randomised), which
# would reorder the generated feature columns between runs.
compounds = (
    'SO3H', 'COOH', 'NH2', 'OH', 'CN', 'F', 'OMe', 'NHMe', 'NO2', 'Pr',
    'Cl', 'OEt', 'Ph', 'Br', 'OPr', 'HCO', 'Et', 'Me', 'H', 'I',
)
molecules = ('N', 'O', 'C')

# One 0/1 presence column per functional-group token ...
func_data = {
    f'funccheck_{compound}': check_in(compound)
    for compound in compounds
}
# ... one substring-count column per element symbol ...
func_data.update({
    f'funccount_{molecule}': count_in(molecule)
    for molecule in molecules
})
# ... and a 0/1 flag that is set when the entry contains a '-' separator.
# NOTE(review): despite its name, num_func is a flag, not a count.
func_data['num_func'] = [int('-' in str(x)) for x in df.functional_groups]

# The string column is replaced by integer category codes only after every
# string-based feature above has been derived from it.
df.functional_groups = df.functional_groups.astype("category").cat.codes
func_df = pd.DataFrame(func_data)
func_df


Add more features

In [None]:
# Each engineered column is pushed to the front of the frame (position 0).
# NOTE: DataFrame.insert raises if the column already exists, so this cell
# cannot be re-run on the same frame.

# Atom count left over after subtracting the 'C+O+H' column.
df.insert(0, 'difatom', df['sumatom'] - df['C+O+H'])

# Ratio of the surface_area column to the volume column.
df.insert(0, 'surface_to_volume', df['surface_area'] / df['volume'])

# 6 * volume^(2/3): surface area of a cube with the same volume.
df.insert(0, 'cubic_surface_area', ((df['volume'] ** (1/3)) ** 2) * 6)

df

One-hotting metal/organic linkers and topology

In [None]:
# One-hot encode the categorical structure columns of the training frame.
# The "- 1" shift maps the linker ids onto 0-based class indices; the column
# labels below are numbered from 1 accordingly.
metal_linker_int = df['metal_linker'] - 1
metal_one_hot = to_categorical(metal_linker_int, num_classes=12, dtype='int8')
metal_onehot_df = pd.DataFrame(metal_one_hot, columns=['ml_' + str(num) for num in range(1, 13)])
print(metal_onehot_df.shape)
display(metal_onehot_df.head(2))
#---------------------------------------
org1_int = df['organic_linker1'] - 1
org1_one_hot = to_categorical(org1_int, num_classes=59, dtype='int8')
org1_onehot_df = pd.DataFrame(org1_one_hot, columns=['ol1_' + str(num) for num in range(1, 60)])
print(org1_onehot_df.shape)
display(org1_onehot_df.head(2))
#--------------------------------------
org2_int = df['organic_linker2'] - 1
org2_one_hot = to_categorical(org2_int, num_classes=59, dtype='int8')
org2_onehot_df = pd.DataFrame(org2_one_hot, columns=['ol2_' + str(num) for num in range(1, 60)])
print(org2_onehot_df.shape)
display(org2_onehot_df.head(2))
#--------------------------------------
# Topology is used unshifted (labels start at 0).  num_classes is pinned to 11
# so the encoded width always matches the 11 column labels, instead of being
# inferred from the largest value that happens to be present in the data.
top_int = df['topology']
top_one_hot = to_categorical(top_int, num_classes=11, dtype='int8')
top_onehot_df = pd.DataFrame(top_one_hot, columns=['top_' + str(num) for num in range(0, 11)])
print(top_onehot_df.shape)
display(top_onehot_df.head(2))

In [None]:
def onehot_inplace(
    field : str,
    one_hot : pd.DataFrame,
    dataframe : "pd.DataFrame | None" = None):
    """Replace column `field` with one one-hot vector (numpy array) per row.

    Parameters
    ----------
    field : str
        Name of the column of `dataframe` to overwrite.
    one_hot : pd.DataFrame
        One-hot encoded rows; row i becomes the value stored at row i of
        `field`. Must have as many rows as `dataframe`.
    dataframe : pd.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level `df` is
        looked up at call time (not at definition time), so re-binding `df`
        in a later cell cannot leave this function pointing at a stale frame.

    Returns
    -------
    None
        `dataframe` is modified in place.
    """
    if dataframe is None:
        dataframe = df
    # The column must be object-typed before it can hold one array per cell.
    dataframe[field] = dataframe[field].astype(object)
    dataframe[field] = [np.array(row) for row in np.asarray(one_hot)]
    

In [None]:
# Overwrite each categorical column of df with its per-row one-hot vectors.
onehot_inplace('metal_linker', metal_onehot_df)
onehot_inplace('organic_linker1', org1_onehot_df)
onehot_inplace('organic_linker2', org2_onehot_df)
onehot_inplace('topology', top_onehot_df)
df

<h1>Feature selection</h1>

In [None]:
# Feature groups used for the model.  Commented-out entries are excluded from
# the selection.
df_col = {
    'df_geometry_col': [
        'surface_area',
        'void_fraction',
        'density',
        'void_volume',
        'weight',
        'volume',
    ],
    'df_function_col': [
        'functional_groups',
        'metal_linker',
        'organic_linker1',
        'organic_linker2',
        'topology',
        'CO2/N2_selectivity',
        'heat_adsorption_CO2_P0.15bar_T298K',
    ],
    'df_cif_col': [
        'cell_length_a',
        'cell_length_b',
        'cell_length_c',
        'cell_angle_alpha',
        'cell_angle_beta',
        'cell_angle_gamma',
        'sum_charge',
        # 'mean_charge',
    ],
    'df_xyz_col': [
        'Lattice1',
        # 'Lattice2',
        # 'Lattice3',
        'Lattice4',
        'Lattice5',
        # 'Lattice6',
        'Lattice7',
        'Lattice8',
        'Lattice9',
        # 'C+O+H',
        'C',
        'O',
        'H',
        'sumatom',
    ],
    'df_add_col': [
        'surface_to_volume',
        'difatom',
        'cubic_surface_area',
    ],
}

# Flatten the groups (in dict order) into one column list and select them.
df_selected = df[[column for group in df_col.values() for column in group]]
print(df_selected.shape)

# Append the functional-group features column-wise.
# NOTE(review): the one-hot frames (metal_onehot_df, org1_onehot_df,
# org2_onehot_df, top_onehot_df) are not concatenated here, while the test-set
# cell further down does concatenate its one-hot frames - confirm which layout
# the model is meant to see.
df_selected = pd.concat([df_selected, func_df], axis=1)
print(df_selected.shape)
df_selected.head()

<h2>convert to array</h2>

In [None]:
# Standardise every column except the four that hold one-hot vectors.
scaler = StandardScaler()

onehotted_columns = ['topology','metal_linker','organic_linker1','organic_linker2']
raw_columns = [
    column
    for column in df_selected.columns
    if column not in onehotted_columns
]

scaled_values = scaler.fit_transform(df_selected[raw_columns])
df_selected[raw_columns] = scaled_values
df_selected.head()

In [None]:
# Cast every column to object dtype, then show the resulting dtypes.
df_selected = df_selected.astype(object)
df_selected.dtypes

In [None]:
# Replace every cell of df_selected with a backend constant built from its
# current value, walking the frame row by row.
ij = df_selected.shape
n_rows, n_cols = ij
for i in range(n_rows):
    for j in range(n_cols):
        cell_value = df_selected.iat[i, j]
        df_selected.iat[i, j] = KBack.constant(cell_value)
df_selected.head()

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the imports cell.
import joblib
# Replaces the df_selected built by the cells above with an object loaded from
# a local file, so everything downstream depends on that file's contents.
# NOTE(review): joblib.load unpickles the file - only load files you trust, and
# confirm that 'df_tensor.pkl' matches the features produced in this notebook.
df_selected = joblib.load('df_tensor.pkl')
df_selected

In [None]:
# Plain numpy copies of the feature frame and of the target column.
df_array = np.array(df_selected)
df_label_array = np.array(df_label)
f'{df_label_array.shape} : {df_array.shape}'

In [None]:
# Build a backend constant from the feature array.
# NOTE(review): df_tensor is not referenced by any later cell visible in this
# notebook (the split below uses df_array) - confirm it is still needed.
df_tensor = KBack.constant(df_array)

<h2>train-test split</h2>

In [None]:
# First hold out 10% of all rows as the test split ...
X_train, X_test, y_train, y_test = train_test_split(
    df_array,
    df_label_array,
    test_size=0.1,
    random_state=123,
)
# ... then carve 15% of the remaining rows off as the validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=123,
)
print(f'train : {X_train.shape}\nval : {X_val.shape}\ntest : {X_test.shape}')

In [None]:
# Train/test split only (90% / 10%), with the same seed as the cell above.
# NOTE(review): this rebinds X_train / y_train to the full 90% split, while
# X_val / y_val from the previous cell stay defined - their rows are then part
# of X_train again.
X_train, X_test, y_train, y_test = train_test_split(
    df_array,
    df_label_array,
    test_size=0.1,
    random_state=123,
)
print(f'train : {X_train.shape}\ntest : {X_test.shape}')

<h1> Model training </h1>

In [None]:
# Fully connected regression network: four ReLU layers of growing width, each
# followed by 30% dropout, and a single linear output unit.
# The input width is taken from the training matrix instead of being
# hard-coded, so the network always matches the features it is fitted on.
model = Sequential([
    Dense(64, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(1),
])

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Adam optimiser; mean absolute error is both the training loss and the
# reported metric.
model.compile(optimizer='adam', loss='mae', metrics=['mae'])

In [None]:
# Train for 500 epochs with batches of 1000 rows.
# NOTE(review): no validation data is passed to fit, so the run is not
# monitored on held-out rows.
model.fit(X_train, y_train, epochs=500, batch_size=1000)

In [None]:
# Predict the held-out test split and flatten the predictions to 1-D.
# reshape(-1) infers the length, so this no longer depends on the test split
# having exactly 6859 rows.
pred = model.predict(X_test).reshape(-1)
# log10 of the mean absolute error on the test split.
np.log10(mean_absolute_error(y_test, pred))

In [None]:
# Two-column frame: column 0 holds the predictions, column 1 the true values.
pred_df = pd.DataFrame([pred, y_test]).T
pred_df

In [None]:
%matplotlib inline
plt.scatter(pred_df[0],pred_df[1]-(pred_df[0]));plt.show()

In [None]:
# Predicted (column 0) against true (column 1) values on the test split.
# Axis labels and a title are set so the figure can be read on its own.
plt.scatter(pred_df[0], pred_df[1])
plt.xlabel('predicted CO2 working capacity')
plt.ylabel('actual CO2 working capacity')
plt.title('Predicted vs. actual on the test split')
plt.show()

<h1>Comparing</h1>

In [None]:
# Only rows whose prediction exceeds 400 are compared.
pred_df_plot = pred_df[pred_df[0]>400]

# Error (true - predicted) truncated to an integer, so "equal" means the
# truncated difference is 0.
truncated_error = (pred_df_plot[1] - pred_df_plot[0]).astype(int)

tomuch = len(pred_df_plot[truncated_error < 0])     # prediction above the true value
tolittle = len(pred_df_plot[truncated_error > 0])   # prediction below the true value
equal = len(pred_df_plot[truncated_error == 0])

plt.bar(['tomuch','tolittle','equal'], [tomuch, tolittle, equal])

<h1>Evaluation</h1>

In [None]:
# Read the unlabeled test table, print its shape and preview the first rows.
testset = pd.read_csv('Datasets/cif_xyz_nonfg_test.csv')
print(testset.shape)
testset.head()

In [None]:
# Functional-group features for the test set, built exactly like the training
# ones: a 0/1 presence column per token, a substring count per element symbol
# and a 0/1 flag for entries that contain a '-' separator.
test_func_data = {
    f'funccheck_{compound}': check_in(compound, loop=testset.functional_groups)
    for compound in compounds
}
test_func_data.update({
    f'funccount_{molecule}': count_in(molecule, loop=testset.functional_groups)
    for molecule in molecules
})
test_func_data['num_func'] = [int('-' in str(x)) for x in testset.functional_groups]

# Encode the string column with the category list of the TRAINING file, so
# that a given functional-group string gets the same integer code here as in
# training.  Fitting the categories on the test set alone would only agree
# with the training codes if both files held exactly the same set of strings.
# Strings that do not occur in the training file are encoded as -1.
train_functional_groups = pd.read_csv(
    'Datasets/cif_xyz_nonfg_train.csv', usecols=['functional_groups']
)['functional_groups']
functional_group_categories = train_functional_groups.astype("category").cat.categories
testset.functional_groups = pd.Categorical(
    testset.functional_groups, categories=functional_group_categories
).codes

test_func_df = pd.DataFrame(test_func_data)
test_func_df.head(5)

In [None]:
# One-hot encode the categorical structure columns of the test set with the
# same class counts and column labels as the training encoding.
test_metal_linker_int = testset['metal_linker'] - 1
test_metal_one_hot = to_categorical(test_metal_linker_int, num_classes=12, dtype='int8')
test_metal_onehot_df = pd.DataFrame(test_metal_one_hot, columns=['ml_' + str(num) for num in range(1, 13)])
display(test_metal_onehot_df.head(3))
#-------------------------------------------------
test_org1_int = testset['organic_linker1'] - 1
test_org1_one_hot = to_categorical(test_org1_int, num_classes=59, dtype='int8')
test_org1_onehot_df = pd.DataFrame(test_org1_one_hot, columns=['ol1_' + str(num) for num in range(1, 60)])
display(test_org1_onehot_df.head(3))
#-------------------------------------------------
test_org2_int = testset['organic_linker2'] - 1
test_org2_one_hot = to_categorical(test_org2_int, num_classes=59, dtype='int8')
test_org2_onehot_df = pd.DataFrame(test_org2_one_hot, columns=['ol2_' + str(num) for num in range(1, 60)])
display(test_org2_onehot_df.head(3))
#-------------------------------------------------
# num_classes is pinned to 11 so the encoded width always matches the 11
# column labels, even if the largest topology id is absent from the test set.
test_top_int = testset['topology']
test_top_one_hot = to_categorical(test_top_int, num_classes=11, dtype='int8')
test_top_onehot_df = pd.DataFrame(test_top_one_hot, columns=['top_' + str(num) for num in range(0, 11)])
display(test_top_onehot_df.head(3))

In [None]:
# Engineered columns for the test set (same formulas as for the training frame).
# 'difatom' goes to position 33, the other two to the front of the frame.
testset.insert(33, 'difatom', testset['sumatom'] - testset['C+O+H'])
testset.insert(0, 'surface_to_volume', testset['surface_area'] / testset['volume'])
testset.insert(0, 'cubic_surface_area', ((testset['volume'] ** (1/3)) ** 2) * 6)

# Square two geometry columns in place (after the ratios above were computed
# from the unsquared values).
# NOTE(review): the training-side feature cell in this notebook has no matching
# squaring step - confirm this transform is intended for the test set only.
testset['void_volume'] = testset['void_volume'] ** 2
testset['surface_area'] = testset['surface_area'] ** 2

In [None]:
# Keep the columns listed in df_col (flattened in dict order), then append the
# one-hot frames and the functional-group features column-wise.
testset = testset[[column for group in df_col.values() for column in group]]
testset = pd.concat(
    [
        testset,
        test_metal_onehot_df,
        test_org1_onehot_df,
        test_org2_onehot_df,
        test_top_onehot_df,
        test_func_df,
    ],
    axis=1,
)
testset.head(3)

Check that the test-set columns and dtypes match the training features

In [None]:
# Sanity check: the test frame must have the same columns, in the same order
# and with the same dtypes, as the training frame.
print(f'{len(testset.columns)} : {len(df_selected.columns)}')
if len(testset.columns) == len(df_selected.columns):
    print(all(testset.columns == df_selected.columns))
    # Compare dtypes by position.  Comparing the two dtype Series directly
    # raises when their labels differ, which is exactly the situation this
    # check is meant to report.
    print(list(testset.dtypes) == list(df_selected.dtypes))
else:
    print(False)

In [None]:
# Numpy copy of the test features; the frame's shape is shown as a check.
testset_array = np.array(testset)
testset.shape

predict

In [None]:
# Predict with the network trained above.  The previous name `xg_reg1` is not
# defined anywhere in this notebook, so the cell failed on a fresh kernel.
# The predictions are flattened to 1-D so they can be used as a single
# DataFrame column.
# NOTE(review): the training features built in this notebook are standardised
# with `scaler`, but no scaling is applied to `testset` - confirm.
test_pred = model.predict(testset_array).reshape(-1)
test_pred

convert to dataframe

In [None]:
# Submission ids are consecutive and start at 68614.  The end of the range is
# derived from the number of predictions so the two columns always have the
# same length (17000 predictions reproduce the range 68614..85613).
first_submission_id = 68614
submission = pd.DataFrame({
    "id": [
        str(i)
        for i in range(first_submission_id, first_submission_id + len(test_pred))
    ],
    "CO2_working_capacity [mL/g]": test_pred
    })
submission.head()

save csv

In [None]:
# Write the submission without the index column; floats are formatted with
# seven decimal places.
submission.to_csv('submission.csv',index=False,float_format='%.7f')

--------------------------------------------------------------------------------------------