# Battery Electrolyte Formulation 

![image.png](attachment:b5252d9f-88f1-4bfa-8443-1e509ec4e19a.png)

### Import libraries

In [1]:
import sys

# Put the sibling "models" directory and the parent directory on the import
# path (presumably so that `models.fm4m` can be imported below -- confirm
# against the repository layout).
sys.path.extend(["../models", "../"])

In [2]:
import models.fm4m as fm4m
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

### Load data 

![image.png](attachment:a4a1be5e-081b-48c5-96d6-d02699a0780f.png)

In [3]:
# Read the train / test splits and discard every row that has a missing value.
train_df = (
    pd.read_csv("../data/lce/train.csv")
    .dropna()
)
test_df = (
    pd.read_csv("../data/lce/test.csv")
    .dropna()
)

In [4]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
0,C1COC(=O)O1,0.327,O=C(OCC)OCC,0.594,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.079,O,0.0,O,0.0,O,0.0,1.155
1,C1COC(=O)O1,0.356,COC(=O)OC,0.566,FC(F)(F)COB(OCC(F)(F)F)OCC(F)(F)F,0.007,[Li+].F[P-](F)(F)(F)(F)F,0.072,O,0.0,O,0.0,1.046
2,O=S1(=O)CCCC1,0.25,FS([N-]S(F)(=O)=O)(=O)=O.[Li+],0.75,O,0.0,O,0.0,O,0.0,O,0.0,1.569
3,C1COC(=O)O1,0.331,O=C(OCC)OCC,0.577,[Li+].F[P-](F)(F)(F)(F)F,0.092,O,0.0,O,0.0,O,0.0,0.886
4,COCCOC,0.763,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.237,O,0.0,O,0.0,O,0.0,O,0.0,1.367


In [5]:
# Collect every distinct SMILES string used in any of the six component
# columns (smi1..smi6), in order of first appearance when the columns are
# stacked one after another.
train_smiles_list = list(
    pd.concat([train_df[f"smi{k}"] for k in range(1, 7)]).drop_duplicates()
)
test_smiles_list = list(
    pd.concat([test_df[f"smi{k}"] for k in range(1, 7)]).drop_duplicates()
)

### List of available models

In [6]:
# Display the table of models offered by fm4m (model name and description).
fm4m.avail_models()

Unnamed: 0,Model Name,Description
0,SMI-TED,SMILES based encoder decoder model
1,SELFIES-TED,BART model for string based SELFIES modality
2,Molformer,MolFormer model for string based SMILES modality
3,MHG-GNN,Molecular hypergraph model


### Get embeddings 

![image.png](attachment:b3fdaf5f-f7d9-4253-a19d-07576c3db702.png)

In [7]:
# Name of the model used to embed each SMILES string; the available names are
# the ones listed by fm4m.avail_models().
model_type = "SELFIES-TED"

# One embedding per unique SMILES string, for the train and test lists.
# NOTE(review): with return_tensor=False the results are consumed via
# .iterrows() in the next cell, so they are presumably DataFrames -- confirm
# against fm4m.get_representation.
train_emb, test_emb = fm4m.get_representation(
    train_smiles_list,
    test_smiles_list,
    model_type,
    return_tensor=False,
)

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Cannot encode [LI+].F[B-](F)(F)OC(C(F)(F)(F))(C(F)(F)(F))C(F)(F)(F) to selfies and embedding replaced by NaN
Cannot encode [Li+].F[P-](F)(F)(F)(F)F to selfies and embedding replaced by NaN


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Cannot encode [Li+].F[P-](F)(F)(F)(F)F to selfies and embedding replaced by NaN
Cannot encode CSi(C)(C)([N+]).C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F to selfies and embedding replaced by NaN


In [8]:
# Turn each embedding row into a plain Python list. A row that holds no
# non-NaN value at all (the model could not encode that molecule) becomes a
# single NaN marker instead.
# NOTE: this cell rebinds train_emb / test_emb from row-iterable frames to
# lists, so it cannot be re-run without first re-running the embedding cell.
train_emb = [
    row.dropna().tolist() if row.notna().any() else np.nan
    for _, row in train_emb.iterrows()
]
test_emb = [
    row.dropna().tolist() if row.notna().any() else np.nan
    for _, row in test_emb.iterrows()
]

In [9]:
# Lookup tables: SMILES string -> its embedding (or NaN when encoding failed).
train_dict = {
    smiles: embedding
    for smiles, embedding in zip(train_smiles_list, train_emb)
}
test_dict = {
    smiles: embedding
    for smiles, embedding in zip(test_smiles_list, test_emb)
}

In [10]:
def replace_with_list(value, my_dict):
    """Look ``value`` up in ``my_dict``.

    Returns the mapped entry when ``value`` is a key of ``my_dict``;
    otherwise returns ``value`` itself unchanged.
    """
    if value in my_dict:
        return my_dict[value]
    return value

In [11]:
# Replace each SMILES string with its embedding; any value that is not a key
# of the lookup table (concentrations, LCE, ...) is left unchanged.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated since pandas 2.1, whereas Series.map works on every version.
df_train_emb = train_df.apply(
    lambda column: column.map(lambda cell: replace_with_list(cell, train_dict))
)
df_test_emb = test_df.apply(
    lambda column: column.map(lambda cell: replace_with_list(cell, test_dict))
)

In [12]:
# Keep only the rows in which every cell is non-missing, then renumber the
# rows from 0.
df_train_emb = df_train_emb[df_train_emb.notna().all(axis=1)].reset_index(drop=True)
df_test_emb = df_test_emb[df_test_emb.notna().all(axis=1)].reset_index(drop=True)

In [13]:
# Preview the first five rows of the embedded training frame
df_train_emb.head(n=5)

Unnamed: 0,smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
0,"[1.0849331617355347, 0.1931672841310501, 0.670...",0.327,"[0.7829928994178772, -0.09458523243665695, 0.1...",0.594,"[0.3032549023628235, 0.49088549613952637, 0.53...",0.079,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,1.155
1,"[0.895878255367279, 0.5856225490570068, 0.3563...",0.25,"[0.3032549023628235, 0.49088549613952637, 0.53...",0.75,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,1.569
2,"[1.1227184534072876, 0.05302262306213379, 0.28...",0.763,"[-0.24963371455669403, 0.28777679800987244, 0....",0.237,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,1.367
3,"[1.1227184534072876, 0.05302262306213379, 0.28...",0.2,"[0.3811344802379608, 0.2315860092639923, -0.02...",0.6,"[0.3032549023628235, 0.49088549613952637, 0.53...",0.2,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,2.301
4,"[0.8845665454864502, 0.3425617814064026, 0.212...",0.873,"[0.3032549023628235, 0.49088549613952637, 0.53...",0.127,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,1.489


In [14]:
# Preview the first five rows of the embedded test frame
df_test_emb.head(n=5)

Unnamed: 0,smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,LCE
0,"[0.8845665454864502, 0.3425617814064026, 0.212...",0.733,"[0.3032549023628235, 0.49088549613952637, 0.53...",0.267,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,1.629
1,"[1.1299402713775635, 0.20541700720787048, 0.44...",0.299,"[0.3025802671909332, 0.26763051748275757, 0.13...",0.598,"[0.3032549023628235, 0.49088549613952637, 0.53...",0.103,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,2.056
2,"[1.1227184534072876, 0.05302262306213379, 0.28...",0.507,"[0.4529895782470703, 0.3391157388687134, 0.015...",0.399,"[0.3032549023628235, 0.49088549613952637, 0.53...",0.095,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,2.268
3,"[1.0849331617355347, 0.1931672841310501, 0.670...",0.425,"[0.5938773155212402, 0.1412597894668579, 0.010...",0.481,"[0.3032549023628235, 0.49088549613952637, 0.53...",0.094,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,1.602
4,"[0.895878255367279, 0.5856225490570068, 0.3563...",0.359,"[0.1744002252817154, 0.2921236753463745, 0.105...",0.504,"[-0.2496337890625, 0.28777679800987244, 0.4349...",0.133,"[-0.2958524525165558, 0.6863532066345215, 0.73...",0.004,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,"[0.9481077194213867, -0.008840720169246197, 0....",0.0,2.0


### Construct feature vector for the formulation

![image.png](attachment:770a77a4-f5de-4ba2-b34b-49d8a2e5fdd5.png)

In [15]:
# Construct the formulation feature vector: scale each component's embedding
# by its concentration, then add the scaled embeddings together.
def build_feature_vector(df, smi_cols, conc_cols):
    """Build one concentration-weighted embedding per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``smi_cols`` cells each hold a sequence of numbers
        (an embedding) and whose ``conc_cols`` cells hold a number.
    smi_cols : iterable of str
        Names of the embedding columns.
    conc_cols : iterable of str
        Names of the weight columns, paired positionally with ``smi_cols``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``; column ``j`` holds the sum over all pairs of
        ``embedding[j] * concentration``.

    Raises
    ------
    ValueError
        If no columns are given, or if ``smi_cols`` and ``conc_cols`` differ
        in length (``zip`` would otherwise silently ignore the surplus
        columns and produce wrong features).
    """
    smi_cols = list(smi_cols)
    conc_cols = list(conc_cols)
    if len(smi_cols) != len(conc_cols):
        raise ValueError(
            f"smi_cols and conc_cols must have the same length, "
            f"got {len(smi_cols)} and {len(conc_cols)}"
        )
    if not smi_cols:
        raise ValueError("at least one (smi, conc) column pair is required")

    # Expand every embedding into one numeric column per dimension, then
    # multiply each row by that row's concentration.
    components = [
        df[smi].apply(pd.Series).mul(df[conc], axis=0)
        for smi, conc in zip(smi_cols, conc_cols)
    ]
    # Element-wise sum of the weighted component frames.
    return sum(components)

In [16]:
# Names of the six SMILES-embedding columns and of their matching
# concentration columns (smi1..smi6 / conc1..conc6)
smi_cols = list(map("smi{}".format, range(1, 7)))
conc_cols = list(map("conc{}".format, range(1, 7)))

In [17]:
# Training features (concentration-weighted embeddings) and the LCE target,
# kept as a one-column DataFrame
x_train = build_feature_vector(df_train_emb, smi_cols, conc_cols)
y_train = df_train_emb["LCE"].to_frame(name="LCE")

In [18]:
# Test features (concentration-weighted embeddings) and the LCE target,
# kept as a one-column DataFrame
X_test = build_feature_vector(df_test_emb, smi_cols, conc_cols)
y_test = df_test_emb["LCE"].to_frame(name="LCE")

### Downstream Model : SVR Regressor

In [19]:
# RBF-kernel support vector regressor. `degree` is not passed: SVR only uses
# it for the polynomial kernel, so it has no effect here.
regressor = SVR(kernel="rbf", C=5, gamma="scale", epsilon=0.01)

# Fit on a target rescaled to [-1, 1]; TransformedTargetRegressor applies the
# inverse scaling to the predictions, so they come back in the original units.
model = TransformedTargetRegressor(
    regressor=regressor,
    transformer=MinMaxScaler(feature_range=(-1, 1)),
).fit(x_train, y_train)

# Regression predictions for the test set (the name `y_prob` is kept as-is;
# the values are predicted LCE, not probabilities).
y_prob = model.predict(X_test)

# RMSE as the square root of the MSE. The `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
RMSE_score = np.sqrt(mean_squared_error(y_test, y_prob))
print(f"RMSE score : {RMSE_score}" )

RMSE score : 0.27618728218690297
