## Demo code for PCE prediction based on text embeddings

### Code Structure
- Data load
- Neural network
- Load pre-trained model
- Prediction
- Compare with similar recipe data found in the dataset

### 1. Define functions for data load

In [1]:
import numpy as np
import os
import pickle

from itertools import combinations

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate

from sklearn.decomposition import PCA

class SolarData(Dataset):
    """Map-style dataset that yields ``(x1, x2, y)`` tensor triples.

    ``X1`` and ``X2`` are indexed per sample and converted with
    ``torch.Tensor``; ``Y`` supplies one target value per sample and its
    length defines the dataset length.
    """

    def __init__(self, X1, X2, Y):
        # Only references are stored; tensors are built lazily per item.
        self.X1, self.X2, self.Y = X1, X2, Y

    def __len__(self):
        """Return the number of samples, i.e. ``len(Y)``."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(x1, x2, y)``; ``y`` has shape (1, 1)."""
        first_input = torch.Tensor(self.X1[idx])
        second_input = torch.Tensor(self.X2[idx])
        # Wrap the scalar target in a list, then expose it as a 1x1 tensor.
        target = torch.Tensor([self.Y[idx]]).reshape(1, 1)
        return first_input, second_input, target

class GetData():
    """Load embedding and PCE pickles from ``embeddings/`` into aligned arrays.

    Three pickle files are read, each expected to hold a mapping keyed by
    device ID:

    * ``NatEgyRecipe_{emb_type}.emb.pkl``   -> ``self.emb``
    * ``NatEgyRecipeJV_{emb_type}.emb.pkl`` -> ``self.embJV``
    * ``NatEgy_PCE.pkl``                    -> ``self.y_PCE``

    The key order of ``y_PCE`` defines ``self.devIDs``; ``X1``, ``X2`` and
    ``Y`` are NumPy arrays built in that same order.

    Args:
        emb_type: Suffix inserted into the embedding file names.

    Raises:
        FileNotFoundError: If one of the pickle files is missing.
        KeyError: If a device ID in ``y_PCE`` is absent from an embedding map.
    """
    def __init__(self,emb_type):
        self.pref = 'embeddings/'
        self.name_emb = f'NatEgyRecipe_{emb_type}.emb.pkl'
        self.name_embJV = f'NatEgyRecipeJV_{emb_type}.emb.pkl'
        self.name_Y = 'NatEgy_PCE.pkl'

        self.emb = self._load_pickle(self.name_emb)
        self.embJV = self._load_pickle(self.name_embJV)
        self.y_PCE = self._load_pickle(self.name_Y)
        self.devIDs = list(self.y_PCE.keys())

        # All three arrays share the row order given by devIDs.
        self.X1 = np.array([self.emb[k] for k in self.devIDs])
        self.X2 = np.array([self.embJV[k] for k in self.devIDs])
        self.Y = np.array([self.y_PCE[k] for k in self.devIDs])

    def _load_pickle(self,fname):
        """Unpickle ``fname`` from ``self.pref``, closing the file afterwards."""
        # NOTE: pickle can execute arbitrary code on load; only use trusted files.
        with open(os.path.join(self.pref,fname),'rb') as fh:
            return pickle.load(fh)
     
class PlatformData():
    """Load SolarChemDX embedding and PCE pickles into aligned arrays.

    Files are read from ``embeddings/`` and are each expected to hold a
    mapping keyed by device ID:

    * ``SolarChemDX_{emb_type}.emb.pkl`` -> ``self.emb`` and ``self.embJV``
    * ``SolarChemDX_PCE_Max.pkl``        -> ``self.y_PCE``

    The key order of ``y_PCE`` defines ``self.devIDs``; ``X1``, ``X2`` and
    ``Y`` are NumPy arrays built in that same order.

    Args:
        emb_type: Suffix inserted into the embedding file name.

    Raises:
        FileNotFoundError: If one of the pickle files is missing.
        KeyError: If a device ID in ``y_PCE`` is absent from an embedding map.
    """
    def __init__(self,emb_type):
        self.pref = 'embeddings/'
        self.name_emb = f'SolarChemDX_{emb_type}.emb.pkl'
        # NOTE(review): identical to name_emb, so X2 duplicates X1. Possibly a
        # copy/paste slip for a separate JV embedding file -- confirm.
        self.name_embJV = f'SolarChemDX_{emb_type}.emb.pkl'
        self.name_Y = 'SolarChemDX_PCE_Max.pkl'

        self.emb = self._load_pickle(self.name_emb)
        self.embJV = self._load_pickle(self.name_embJV)
        self.y_PCE = self._load_pickle(self.name_Y)
        self.devIDs = list(self.y_PCE.keys())

        # All three arrays share the row order given by devIDs.
        self.X1 = np.array([self.emb[k] for k in self.devIDs])
        self.X2 = np.array([self.embJV[k] for k in self.devIDs])
        self.Y = np.array([self.y_PCE[k] for k in self.devIDs])

    def _load_pickle(self,fname):
        """Unpickle ``fname`` from ``self.pref``, closing the file afterwards."""
        # NOTE: pickle can execute arbitrary code on load; only use trusted files.
        with open(os.path.join(self.pref,fname),'rb') as fh:
            return pickle.load(fh)

### 2. Define neural network
- Simple fully connected layer
- ReLU is used as a non-linear activation function

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

    
class FCNN(nn.Module):
    """Fully connected network with ``n_h`` ReLU hidden layers and one output.

    Args:
        dim_x: Size of the input feature vector.
        dim_h: Width of every hidden layer.
        n_h: Number of hidden layers; values above 1 add ``n_h - 1`` extra
            ``dim_h -> dim_h`` layers after the first one.
    """

    def __init__(self, dim_x, dim_h, n_h):
        super().__init__()

        # First hidden layer and its activation.
        self.fc1 = nn.Linear(dim_x, dim_h)
        self.f1 = nn.ReLU()

        # The extra hidden stack is only created when it would be non-empty,
        # so shallow models carry no ``fcs``/``fs`` attributes at all.
        if n_h > 1:
            extra_layers = n_h - 1
            self.fcs = nn.ModuleList(
                [nn.Linear(dim_h, dim_h) for _ in range(extra_layers)]
            )
            self.fs = nn.ModuleList([nn.ReLU() for _ in range(extra_layers)])

        # Final projection to a single value.
        self.fc_out = nn.Linear(dim_h, 1)

    def forward(self, x):
        """Run ``x`` through all hidden layers and return the scalar head output."""
        hidden = self.fc1(x)
        hidden = self.f1(hidden)

        # Missing attributes fall back to empty tuples, so the loop is skipped.
        linears = getattr(self, 'fcs', ())
        activations = getattr(self, 'fs', ())
        for linear, activation in zip(linears, activations):
            hidden = activation(linear(hidden))

        return self.fc_out(hidden)

### 3. Define functions to load pre-trained model before predictions
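- Load the training dataset to fit the PCA transform for the input embeddings and to compute the mean/standard deviation used to normalize PCEs
- Each input embedding vector must be projected to a low-dimensional vector with this PCA before it is passed to the model
- The pre-trained model outputs a normalized PCE; convert it back with `PCE = mean + std * output`, using the mean/standard deviation of PCE in the training dataset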

In [3]:
import numpy as np
import pickle
import os
import sys

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim

from sklearn.decomposition import PCA

# Rebuild the preprocessing (PCA + PCE normalization) from the training split,
# then restore the pre-trained FCNN weights.
PerovData = PlatformData('small')
Ntot = len(PerovData.devIDs)

# Seeded shuffle of sample indices -- presumably reproduces the split used
# when the checkpoint was trained (TODO confirm).
IDs = np.arange(Ntot)
np.random.seed(1)
np.random.shuffle(IDs)

Ntr = int(Ntot*0.8)
Nv = int(Ntot*0.1)  # not used below; kept in case other cells rely on it

# First 80% of the shuffled indices form the training split.
id1 = IDs[:Ntr]
Emb_Rec1 = PerovData.X1[id1]
PCE1 = PerovData.Y[id1]

# Fit PCA on the training embeddings; the same fitted object must be used to
# transform any new embedding before prediction.
pca1 = PCA(n_components=512)
PC_Rec1 = pca1.fit_transform(Emb_Rec1)

# Mean / standard deviation of training PCEs, used to de-normalize outputs.
m = np.mean(PCE1)
s = np.std(PCE1)

dim_x = PC_Rec1.shape[1]
dim_h = 128
n_h = 3

# NOTE: pickle can execute arbitrary code on load; only use trusted checkpoints.
with open('ChmDX_ValTest_1_Params.FCNN.pkl','rb') as chkpt_file:
    chkpt = pickle.load(chkpt_file)
model = FCNN(dim_x,dim_h,n_h).cuda()
model.load_state_dict(chkpt['state_dict'])

<All keys matched successfully>

### 4. Prediction
- Convert the recipe text to a text embedding vector using the OpenAI API

In [4]:
import os
import json
from langchain_openai import OpenAIEmbeddings

def GetData(dat):
    """Return ``dat`` rendered as a string with every ``note`` value blanked.

    Each entry that contains a ``"note"`` key gets ``note['value']`` replaced
    by an empty string; the resulting list is formatted with an f-string
    (i.e. its Python ``repr``). The entries of ``dat`` are left unmodified:
    blanking is done on copies so the caller's data keeps its notes.

    NOTE: this function rebinds the name ``GetData``, which is also used by
    the data-loading class defined earlier in this file.

    Args:
        dat: Iterable of recipe entries (mappings).

    Returns:
        The string form of the list of (note-blanked) entries.
    """
    inp = []
    for dd in dat:
        if "note" in dd:
            # Rebuild the entry and its note; updating an existing key via
            # unpacking keeps the original key order, so the output string is
            # the same as with in-place assignment.
            dd = {**dd, 'note': {**dd['note'], 'value': ''}}
        inp.append(dd)
    return f'{inp}'

# Read the API key from the first line of the key file. Strip surrounding
# whitespace: the trailing newline kept by readlines() would otherwise become
# part of the key.
with open('GPT_API_Perov_Juhwan.txt') as key_file:
    my_api = key_file.readlines()[0].strip()
os.environ['OPENAI_API_KEY']=my_api

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

with open('testdata/KAIST-SC-00549.json') as pv_file:
    pv_data = json.load(pv_file)
# Highest efficiency among the JV analysis entries of this device.
jv_max = np.max([tt['efficiency']['value'] for tt in pv_data['analysis']['JV']])
inp = GetData(pv_data['input'])
# Embed the recipe string and shape it as a single-row matrix for PCA.
emb = np.array(embeddings.embed_query(inp)).reshape(1,-1)

# Project with the PCA fitted on the training embeddings, then predict.
x = pca1.transform(emb)
x = torch.Tensor(x).cuda()
model.eval()
with torch.no_grad():
    p = model(x).detach().cpu().numpy()
    # Undo the PCE normalization (m, s are the training mean / std).
    p = m + s*p
    print(f'Predicted PCE: {p[0][0]:.2f}%')

Predicted PCE: 18.84%


### 5. Compare with similar recipe data found in dataset

In [5]:
# Device IDs of the training split, row-aligned with Emb_Rec1.
DevID = [PerovData.devIDs[row] for row in id1]
# Row-wise dot product between each training embedding and the query
# embedding (equals cosine similarity if the vectors are unit-norm --
# TODO confirm for the stored embeddings).
cos_sim = (Emb_Rec1*emb).sum(axis=1)
# Indices ordered from highest to lowest score.
sort_sim = (-cos_sim).argsort()

# Report the five best-scoring devices.
for i in sort_sim[:5]:
    print(f'ID in dataset: {DevID[i]}, Score: {cos_sim[i]:.4f}')

ID in dataset: KAIST-SC-00498, Score: 0.9992
ID in dataset: KAIST-SC-00273, Score: 0.9992
ID in dataset: KAIST-SC-00274, Score: 0.9992
ID in dataset: KAIST-SC-00501, Score: 0.9992
ID in dataset: KAIST-SC-00500, Score: 0.9992


In [6]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt template sent to the chat model; {recipe1} / {recipe2} are filled in
# at invocation time. The text is runtime input to the LLM -- keep verbatim.
MyPrompt = """
You are an PhD-level materials scientist on perovskite solar cell. 
I'll give you two recipe on perovskite solar cell device, and please analyze two recipes by comparing each layer of perovskite solar cell.

Recipe1: {recipe1}
Recipe2: {recipe2}
"""

input_var = ["recipe1","recipe2"]

llm = ChatOpenAI(model='gpt-4o',temperature=0.2)
prompt = PromptTemplate(template=MyPrompt, input_variables=input_var)
output_parser = StrOutputParser()

# Pipeline: fill the template -> query the chat model -> extract plain text.
chain = prompt | llm | output_parser

# Load the dataset recipe to compare against, closing the file afterwards.
with open('12/KAIST-SC-00498.json') as recipe_file:
    recipe2 = json.load(recipe_file)['input']
inp2 = GetData(recipe2)

res = chain.invoke({"recipe1":inp,"recipe2":inp2})
print(res)

To analyze and compare the two recipes for perovskite solar cell devices, let's break down each layer and process step by step:

### Transparent Conductive Oxide (TCO)
- **Recipe 1 & Recipe 2**: Both recipes use Indium Tin Oxide (ITO) as the TCO layer. This is a common choice due to its high transparency and good conductivity.

### Electron Transport Layer (ETL)
- **Material**: Both recipes use SnO₂ with a concentration of 5 wt% in DI water.
- **Spin Coating**: Both recipes use a "drop&run" method at 4000 rpm for 30 seconds.
- **Annealing**: Both recipes anneal at 150°C for 30 minutes.

### Perovskite Layer
- **Method**: Both recipes use a 1-step deposition method.
- **Molarity**: Recipe 1 uses a higher molarity (1.8 M) compared to Recipe 2 (1.4 M).
- **A-Site Composition**: 
  - Recipe 1: FA (95%) and MA (5%)
  - Recipe 2: FA (97.5%) and MA (2.5%)
- **X-Site Composition**:
  - Recipe 1: I (95%) and Br (5%)
  - Recipe 2: I (97.5%) and Br (2.5%)
- **Solvent**:
  - Recipe 1: DMF (0.8 ml)