In [1]:
import json
import numpy as np
import pandas as pd
import string
import re
import collections
from sklearn.metrics import f1_score
import Levenshtein
import os
from tqdm import tqdm


def most_similar_answer(a,answer_set):
    a = a.strip().replace(' ', '')
    if(a in answer_set):
        return a
    dis = [Levenshtein.distance(a,x) for x in answer_set]
    idx = np.argmin(dis)
    return answer_set[idx]

def eval_atom_cnt(task, pred):
    # task = atom count
    return pred==task['output']

def eval_dimensions(task, pred):
    # task = dimensions, pred is a list
    if "lengths of the lattice vectors" in task['input']:
        lengths = [float(x) for x in task['output'].split(',')]
        mse = np.mean(abs(np.array(lengths) - np.array(pred))/np.array(lengths))
        return mse
    else:
        angles = [float(x) for x in task['output'].split(',')]
        mae = np.mean(abs(np.array(angles) - np.array(pred))/np.array(angles))
        return mae

def eval_atom_name(task, pred):
    # task = atom name
    return pred==task['output']

def eval_spacegroup(task, pred):
    # task = space group
    return pred==task['output']

def eval_cell_volume(task, pred):
    # task = cell_volume
    return abs(float(task['output'])-pred)/float(task['output'])

def eval_formula(task, pred):
    # task = formula
    return task['output']==pred

def eval_replace(task, pred):
    # task = replace
    answer = most_similar_answer(pred, ["Yes", "No"])
    return task['output']==answer

def eval_dimensions_sem(task, pred):
    lengths = [float(x) for x in task['output'].split('\n')[0].split()]
    mse = np.mean(abs(np.array(lengths) - np.array(pred[0]))/np.array(lengths))
    angles = [float(x) for x in task['output'].split('\n')[1].split()]
    mae = np.mean(abs(np.array(angles) - np.array(pred[1]))/np.array(angles))

    return mse, mae

def eval_infill_task(task, pred):
    return pred==task['output']

def eval_gen_format(pred):
    def trim_list(l):
        return [x.strip().replace('\n', '') for x in l if x]
    l = trim_list(pred.split('\n'))
    l1 = trim_list(l[0].split())
    l2 = trim_list(l[1].split())
    if len(l1)!=3 or len(l2)!=3:
        return 0
    matrix = l[2:]
    if len(matrix)%2:
        return 0
    for i in range(0, len(matrix), 2):
        l1 = trim_list(matrix[i].split())
        l2 = trim_list(matrix[i+1].split())
        if len(l1)!=1 or len(l2)!=3:
            return 0
        for x in l2:
            try:
                y = float(x)
            except:
                return 0
    return 1



In [2]:
valfile = []
with open("/scratch/cse/btech/cs1200448/MatLlama/ift_cif_large/val.jsonl", 'r') as f:
    valfile = [json.loads(line) for line in f.readlines()]

out_dict = dict()
tasks = ["atom count", "dimensions_synt", "atom name", "replace", "space group", "cell_volume", "formula", "dimensions_sem", "vol_calc"]
# "infill", "formula_compute", "conditional_generation", "element_generation"
for task in tasks:
    out_dict[task] = []

idxs = []
for _, sample in enumerate(valfile):
    task = sample['task']
    system = sample['system']
    idxs.append(_)
    with open(f'/home/cse/btech/cs1200448/MatLlama/cif_infer_outputs_cs/{_}.txt', 'r') as f:
        output = f.read()

    if task in tasks:
        out_dict[task].append([output, sample])
    elif task=="dimensions":
        if "predict" not in system and "forecast" not in system:
            out_dict["dimensions_synt"].append([output, sample])
        else:
            out_dict["dimensions_sem"].append([output, sample])  

In [3]:
len(idxs)

27183

In [4]:
valfile = []
with open("/scratch/cse/btech/cs1200448/MatLlama/ift_cif_large/val.jsonl", 'r') as f:
    valfile = [json.loads(line) for line in f.readlines()]

out_dict = dict()
# pred_dict = dict()

tasks = ["atom count", "dimensions_synt", "atom name", "replace", "space group", "cell_volume", "formula", "dimensions_sem", "vol_calc"]
# "infill", "formula_compute", "conditional_generation", "element_generation"
for task in tasks:
    out_dict[task] = []
    # pred_dict[task] = []
    
for _, sample in enumerate(valfile):
    task = sample['task']
    system = sample['system']

    with open(f'/home/cse/btech/cs1200448/MatLlama/cif_infer_outputs_cs/{_}.txt', 'r') as f:
        output = f.read()

    if task in tasks:
        out_dict[task].append([output, sample])
        # out_dict[task].append([output, sample])
        
    elif task=="dimensions":
        if "predict" not in system and "forecast" not in system:
            out_dict["dimensions_synt"].append([output, sample])
        else:
            out_dict["dimensions_sem"].append([output, sample])  

In [5]:
          

scores = dict()
for task in ['atom count']: #['dimensions_sem']:
    print(task, end=':')
    scores[task] = 0
    if task=="dimensions_sem":
        scores[task] = [0, 0]
    true_cnt = 0 if ("dimensions" in task or "volume" in task) else len(out_dict[task])
    for output, sample in out_dict[task]:
        if task=="atom count":
            output = output.strip().replace('\n', '').replace(' ', '')
            if output.isdigit():
                scores[task] += eval_atom_cnt(sample, int(output))
        if task=="dimensions_synt":
            l = output.split(',')
            l = [x for x in l if x!='']
            l = list(map(lambda x: x.strip().replace('\n', '').replace(' ', ''), l))
            if len(l)==3:
                try:
                    scores[task] += eval_dimensions(sample, [float(l[0]), float(l[1]),float(l[2])])
                    true_cnt += 1
                except:
                    continue
        if task=="atom name":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_atom_name(sample, output)
        if task=="replace":
            scores[task] += eval_replace(sample, output)
        if task=="space group":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_spacegroup(sample, output)
        if task=="cell_volume":
            output = output.strip().replace('\n', '').replace(' ', '')
            try:
                answer = float(output)
                scores[task] += eval_cell_volume(sample, answer)
                true_cnt += 1
            except:
                continue
        if task=="formula":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_formula(sample, output)
        if task=="infill":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_infill_task(sample, output)
        if task=="dimensions_sem":
            l1 = output.split('\n')
            l1 = [x for x in l1 if x!='']
            if len(l1)!=2:
                continue
            l21 = l1[0].split()
            l22 = l1[1].split()
            l21 = [x.strip() for x in l21 if x!='']
            l22 = [x.strip() for x in l22 if x!='']
            if len(l21)==3 and len(l22)==3:
                mse, mae = eval_dimensions_sem(sample, [[float(l21[0]), float(l21[1]),float(l21[2])], [float(l22[0]), float(l22[1]), float(l22[2])]])
                true_cnt += 1
                scores[task][0] += mse
                scores[task][1] += mae
        if task=="vol_calc":
            output = output.strip().replace('\n', '').replace(' ', '')
            try:
                answer = float(output)
                scores[task] += eval_cell_volume(sample, answer)
                true_cnt += 1
            except:
                continue 
        if task=="formula_compute":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_formula(sample, output)
        if "generation" in task:
            scores[task] += eval_gen_format(output)
        
    if "dimensions_sem" not in task:
        scores[task] /= true_cnt
    else:
        scores[task][0] /= true_cnt
        scores[task][1] /= true_cnt

    print(scores[task], true_cnt, len(out_dict[task]))

atom count:0.9980870396939263 2091 2091


In [6]:
actual = [sample[1]['output'] for sample in out_dict[task]]
preds = [sample[0] for sample in out_dict[task]]

In [7]:
# np.unique(actual)

In [8]:
scores, true_cnt

({'atom count': 0.9980870396939263}, 2091)

In [9]:
df_form = pd.DataFrame(preds, columns=['pred_cs'])
df_form['actual'] = actual

In [10]:
df_form.pred_cs.value_counts()

pred_cs
4     287
2     202
12    196
8     175
3     169
6     165
1     135
16    129
10     86
24     80
5      63
20     60
7      58
14     56
18     45
9      36
11     28
15     25
17     24
13     16
19     12
28      9
22      8
23      8
32      5
21      3
        3
27      2
29      2
40      1
26      1
35      1
30      1
Name: count, dtype: int64

In [11]:
df_form

Unnamed: 0,pred_cs,actual
0,8,8
1,24,24
2,27,27
3,12,12
4,8,8
...,...,...
2086,12,12
2087,4,4
2088,8,8
2089,22,22


In [12]:
import os

In [13]:
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [14]:
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer

# tokenizer = LlamaTokenizer.from_pretrained("/home/cse/btech/cs1200389/MatLlama/MatLLaMA/src/tokenizer_l2.model")
# model = LlamaForCausalLM.from_pretrained("/scratch/civil/phd/cez188393/zaki_epcc/checkpoints_llamat_cit/checkpoint_1000_to_hf/")

ckpt = '1000'
# model_dir = f"/scratch/civil/phd/cez188393/zaki_epcc/checkpoints_llamat3_cif/checkpoint_{ckpt}_to_hf/"
model_dir = f"/scratch/civil/phd/cez188393/zaki_epcc/checkpoints_llamat_cit/checkpoint_{ckpt}_to_hf/"
# tokenizer = LlamaTokenizer.from_pretrained(model_dir)
# model = LlamaForCausalLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:13<00:00,  3.46s/it]


In [15]:
model.to("cuda")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [16]:
idx  = 2
print(task)
sample = out_dict[task][idx][1]

atom count


In [17]:
sample['output']

27

In [18]:
sample_nv = [o[1] for o in out_dict[task]]

In [19]:

eval_prompt = f"{sample['system']} "+f"input-{sample['input']}" + "output-"

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")


In [20]:
import torch

In [21]:
# dfch[~mask].head(20)#.value_counts()

In [22]:
# help(model.generate)

In [23]:
generation_config = {
  "bos_token_id": 128010,
  "do_sample": True,
  "eos_token_id": 128011,
  # "max_length": 2048,
  # "temperature": 0.6,
  # "top_p": 0.9,
  # "transformers_version": "4.41.0"
}


In [24]:
model.eval()
with torch.no_grad():

    # for 
    print(tokenizer.decode(model.generate(**model_input, top_p=0.95, max_length=int(model_input['input_ids'].shape[1]+3),do_sample=False)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


As a specialist in Material Science, employ CIF file analysis to gather insights into the unit cell structure. input-Below is a CIF file.
# generated using pymatgen
data_CsRb14In27
_symmetry_space_group_name_H-M   'P 1'
_cell_length_a   10.31018508
_cell_length_b   10.31018504
_cell_length_c   17.97079100
_cell_angle_alpha   90.00000000
_cell_angle_beta   90.00000000
_cell_angle_gamma   119.98585247
_symmetry_Int_Tables_number   1
_chemical_formula_structural   CsRb14In27
_chemical_formula_sum   'Cs1 Rb14 In27'
_cell_volume   1654.59855919
_cell_formula_units_Z   1
loop_
 _symmetry_equiv_pos_site_id
 _symmetry_equiv_pos_as_xyz
  1  'x, y, z'
loop_
 _atom_site_type_symbol
 _atom_site_label
 _atom_site_symmetry_multiplicity
 _atom_site_fract_x
 _atom_site_fract_y
 _atom_site_fract_z
 _atom_site_occupancy
  Cs  Cs0  1  0.999883  0.999883  0.500000  1
  Rb  Rb1  1  0.666700  0.333382  0.000000  1
  Rb  Rb2  1  0.333382  0.666700  0.000000  1
  Rb  Rb3  1  0.610418  0.999940  0.185387  1
  

In [25]:
sample['output']

27

In [26]:
from tqdm import tqdm

In [27]:
task

'atom count'

In [28]:
task

'atom count'

In [29]:
from warnings import  filterwarnings
filterwarnings("ignore")

In [30]:
outputcs = []
for sample in tqdm(sample_nv[:]):
    eval_prompt = f"{sample['system']} "+f"input-{sample['input']}" + "output-"

    model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
    output = model.generate(
            **model_input, 
            top_p=0.95, 
            max_length=model_input['input_ids'].shape[1] + 3, 
        # max_length=model_input['input_ids'].shape[1] + 16, 
            do_sample=False
        )
    generated_tokens = output[0, model_input['input_ids'].shape[1]:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    outputcs.append(generated_text)

  0%|                                                                                                                                                                                        | 0/2091 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|                                                                                                                                                                                | 1/2091 [00:00<29:39,  1.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|▏                                                                                                                                                                               | 2/2091 [00:02<43:43,  1.26s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|▎                                                                                                                                                    

In [50]:
len(outputcs)

2091

In [32]:
# outputcs

In [33]:
len(out_dict[task])

2091

In [34]:
len(outputcs)

2091

In [35]:
'90Title'.replace('Title','')

'90'

In [36]:
import re

def extract_numbers(s):
    # Use regex to find floating-point numbers
    result = re.findall(r'\d+\.\d+|\d+', s)
    # Join the result list into a single string
    return float(''.join(result))

# Example usage
# example_string = l22[2]
# number = extract_numbers(example_string)
# print(number)  # Output: 12.1

In [37]:
task

'atom count'

In [38]:
scores = dict()

if task==task:
    scores[task] = 0
# outputs_cs =[]
# outputs_nv = []

# sample_nv = []
idxss = []
for task in [task]:#['formula']: #["atom name"]:#, "dimensions_synt", "atom name", "replace", "space group", "cell_volume", "formula", "dimensions_sem", "vol_calc"]:
    print(task, end=':')
    scores[task] = 0#[0,0]

    true_cnt = 0 if ("dimensions" in task or "volume" in task) else len(out_dict[task])

    idx = 0
    for output, sample in out_dict[task]:
        idx+=1

        if task=="dimensions_sem":
            l1 = outputcs[idx-1].split('\n') #output.split('\n')
            l1 = [x for x in l1 if x!='']
            if len(l1)!=2:
                continue
            l21 = l1[0].split()
            l22 = l1[1].split()
            l21 = [x.strip() for x in l21 if x!='']
            l22 = [x.strip() for x in l22 if x!='']
            
            if len(l21)==3 and len(l22)==3:
                # mse, mae = eval_dimensions_sem(sample, [[float(l21[0]), float(l21[1]),float(l21[2])], [float(l22[0]), float(l22[1]), float(l22[2].replace('Title',''))]])
                mse, mae = eval_dimensions_sem(sample, [[float(l21[0]), float(l21[1]),float(l21[2])], [float(l22[0]), float(l22[1]), extract_numbers(l22[2])]])
                true_cnt += 1
                scores[task][0] += mse
                scores[task][1] += mae
        if task=="atom count":
            output = outputcs[idx-1].strip().replace('\n', '').replace(' ', '')
            if output.isdigit():
                scores[task] += eval_atom_cnt(sample, int(output))
        if task=="atom name":
        #     # idxs
        #     idxss.append(idxs[idx-1])
        #     # sample_nv.append(sample)

        #     output = outputcs[idx-1].strip().replace('\n', '').replace(' ', '')#output.strip().replace('\n', '').replace(' ', '')
        #     scores[task] += eval_formula(sample, output)
            # outputs_nv.append(output)
            # print(
            # output = output.strip().replace('\n', '').replace(' ', '')
            # scores[task] += eval_atom_name(sample, output)
            # print(output, outputcs[idx-1].strip().replace('\n', '').replace(' ', ''))
            output = outputcs[idx-1].strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_atom_name(sample, output)
            # print(output)
            # print(sample)

atom count:

In [39]:
scores[task]

2085

In [40]:
scores[task]/2091

0.9971305595408895

In [41]:
eval_atom_cnt(sample, int(output))

True

In [37]:
scores[task][0]/2091, scores[task][1]/2091

(0.02144423487769472, 0.02254629804076367)

In [49]:
true_cnt

2091

In [42]:
dfch = pd.DataFrame(outputcs, columns=['pred_cs'])

In [43]:
dfch

Unnamed: 0,pred_cs
0,8
1,24
2,27
3,12
4,8
...,...
2086,12
2087,4
2088,8
2089,22


In [44]:
dfch['actual'] = [sample['output'] for sample in sample_nv]

In [45]:
dfch

Unnamed: 0,pred_cs,actual
0,8,8
1,24,24
2,27,27
3,12,12
4,8,8
...,...,...
2086,12,12
2087,4,4
2088,8,8
2089,22,22


In [46]:
# dfch.astype(float)

In [47]:
task, ckpt

('atom count', '1000')

In [48]:
dfch['input'] = [sample['input'] for output, sample in out_dict[task]]

In [49]:
dfch.to_csv(f'{ckpt}_{task}_llama2.csv',index=None)

In [54]:
dfch['status'] = dfch.pred_cs == dfch.actual
mask = dfch.pred_cs == dfch.actual

In [55]:
dfch[dfch.pred_cs.apply(lambda x: x.startswith('2'))]#.sum()

Unnamed: 0,pred_cs,actual,status
1563,2.9 2.9 11.5\n90 90 120Title,2.9 2.9 12.9\n90 90 120,False


In [63]:
sum(dfch.pred_cs == dfch.actual)/2091

0.13438546150167385

In [62]:
dfch.pred_cs = dfch.pred_cs.apply(lambda x: x.replace(x[-5:],''))

In [248]:
dfch['pred_cs_1k'] = dfch.pred_cs.apply(lambda x:  x.strip().replace('\n', '').replace(' ',''))

In [263]:
mask = dfch2['actual'] == dfch2['pred_cs_1k']

In [None]:
redo = list(dfch[dfch['pred_cs_1k'] == ''].index)

In [249]:
redoo = list(dfch[~mask].index)#.value_counts()

In [250]:
len(redoo)

901

In [None]:
# redo = list(dfch[dfch['pred_cs_1k'] == ''].index)

In [252]:
for idxx in  tqdm(redoo):
    sample = sample_nv[idxx]
    # eval_prompt = f"{sample['system']} "+f"input-{sample['input']}" #+ "output- " #  for atom name
    
    eval_prompt = f"{sample['system']} "+f"input-{sample['input']}" + "output-"
    
    model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
    # output = model.generate(
    #         **model_input, 
    #         top_p=0.95, 
    #         max_length=model_input['input_ids'].shape[1] + 3, 
    #         do_sample=True
    #     )
    output = model.generate(
            **model_input, 
            top_p=0.95, 
            max_length=model_input['input_ids'].shape[1] + 20, 
            do_sample=False
        )
    generated_tokens = output[0, model_input['input_ids'].shape[1]:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    outputcs[idxx] =  generated_text

  0%|                                                                                              | 0/901 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|                                                                                      | 1/901 [00:01<15:16,  1.02s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|▏                                                                                     | 2/901 [00:02<22:35,  1.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|▎                                                                                     | 3/901 [00:04<23:58,  1.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|▍                                                                                     | 4/901 [00:06<24:12,  1.62s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|▍                                       

In [253]:
dfch2 = pd.DataFrame(outputcs, columns=['pred_cs'])

In [254]:
dfch2['actual'] = [sample['output'] for sample in sample_nv]

In [255]:
dfch2['pred_cs_1k'] = dfch2.pred_cs.apply(lambda x:  x.strip().replace('\n', '').replace(' ',''))

In [256]:
mask = dfch2['actual'] == dfch2['pred_cs_1k']

In [257]:
sum(~mask)

3

In [258]:
# redoo= list(dfch4[dfch4['pred_cs_1k'] == ''].index)

In [261]:
dfch2[~mask]

Unnamed: 0,pred_cs,actual,pred_cs_1k
896,100.00.00.000\n0.09\n0,K3Mo3(PO4)4,100.00.00.0000.090
1192,0\n0\n0.00000000000000,Tb6Al43W4,000.00000000000000
1962,0\n0.0\n0\n0\n0\n000\n0\n00,Ba14TbSb11,00.0000000000


In [260]:
sum(mask)/len(dfch2)

0.9985652797704447

In [303]:
redoo = list(dfch3[~mask].index)#['pred_cs_1k'].value_counts()

In [314]:
1639,  1650, 1758, 1894

(1639, 1650, 1758, 1894)

In [317]:
dfch5[~mask]['pred_cs_1k'].value_counts()

pred_cs_1k
Whatis        70
              57
T             11
R              7
b.             5
H              5
P              5
S              4
a.             3
A              3
Howmany        3
I              3
b)             2
a              2
b              2
Theelement     2
38             1
Coo            1
Howmuch        1
0.             1
Whowas         1
Z              1
20             1
Whatare        1
B              1
A:             1
21             1
Title:         1
Nd             1
Name: count, dtype: int64

In [312]:
len(dfch)-197

1894

In [313]:
import pickle

In [79]:
with open('atom_name_1000.pkl','wb') as f:
    pickle.dump(outputcs, f)
f.close()

In [126]:
sample

{'system': 'Utilize your expertise in Material Science to extract data regarding the unit cell structure from CIF files, drawing upon your comprehension of the file format.',
 'input': "Below is a CIF file.\n# generated using pymatgen\ndata_Er12Tm8CoBi8Ir3\n_symmetry_space_group_name_H-M   'P 1'\n_cell_length_a   7.91070137\n_cell_length_b   8.94366100\n_cell_length_c   12.39808406\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   90.19768215\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   1\n_chemical_formula_structural   Er12Tm8CoBi8Ir3\n_chemical_formula_sum   'Er12 Tm8 Co1 Bi8 Ir3'\n_cell_volume   877.16705339\n_cell_formula_units_Z   1\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Er  Er0  1  0.657359  0.750000  0.699761  1\n  Er  Er1  1  0.338731  0.2500

In [263]:
dfch2.to_csv('formula_cs_1k_r2.csv',index=None)

In [265]:
# dfch2

In [320]:
# dfch[~mask]['pred_cs_1k']#.value_counts()/

In [134]:
scores

{'atom name': 1639}

In [75]:
1639/true_cnt

0.7838354854136776

In [None]:
# outputcs

In [62]:
# Decode only the generated tokens (excluding the input tokens)
generated_tokens = output[0, model_input['input_ids'].shape[1]:]
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

# Print the generated text
print(generated_text)




N


In [56]:
generated_text

'\nRh'

In [32]:
# len(outputs_cs),
# len(outputs_nv), 
len(sample_nv)

2091

In [65]:
dfout= pd.DataFrame(data=outputs_cs, columns=['cerebras'])
dfout['nvidia'] = outputs_nv
dfout['idx'] = idxss

In [66]:
dfout

Unnamed: 0,cerebras,nvidia,idx
0,2,N \n,1
1,0.21,O \n,2
2,29,In \n,3
3,7.9,Ni \n,4
4,14,Sb \n,5
...,...,...,...
2086,350233,Dy \n,2087
2087,1.00,O \n,2088
2088,2,Rh \n,2089
2089,1,Ta \n,2090


In [21]:
len(output)

2

In [22]:
true_cnt

2091

In [17]:
2065/true_cnt

0.9875657580105213

In [12]:
output

'Ag'

In [3]:
          

scores = dict()
for task in tasks:
    print(task, end=':')
    scores[task] = 0
    if task=="dimensions_sem":
        scores[task] = [0, 0]
    true_cnt = 0 if ("dimensions" in task or "volume" in task) else len(out_dict[task])
    for output, sample in out_dict[task]:
        if task=="atom count":
            output = output.strip().replace('\n', '').replace(' ', '')
            if output.isdigit():
                scores[task] += eval_atom_cnt(sample, int(output))
        if task=="dimensions_synt":
            l = output.split(',')
            l = [x for x in l if x!='']
            l = list(map(lambda x: x.strip().replace('\n', '').replace(' ', ''), l))
            if len(l)==3:
                try:
                    scores[task] += eval_dimensions(sample, [float(l[0]), float(l[1]),float(l[2])])
                    true_cnt += 1
                except:
                    continue
        if task=="atom name":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_atom_name(sample, output)
        if task=="replace":
            scores[task] += eval_replace(sample, output)
        if task=="space group":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_spacegroup(sample, output)
        if task=="cell_volume":
            output = output.strip().replace('\n', '').replace(' ', '')
            try:
                answer = float(output)
                scores[task] += eval_cell_volume(sample, answer)
                true_cnt += 1
            except:
                continue
        if task=="formula":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_formula(sample, output)
        if task=="infill":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_infill_task(sample, output)
        if task=="dimensions_sem":
            l1 = output.split('\n')
            l1 = [x for x in l1 if x!='']
            if len(l1)!=2:
                continue
            l21 = l1[0].split()
            l22 = l1[1].split()
            l21 = [x.strip() for x in l21 if x!='']
            l22 = [x.strip() for x in l22 if x!='']
            if len(l21)==3 and len(l22)==3:
                mse, mae = eval_dimensions_sem(sample, [[float(l21[0]), float(l21[1]),float(l21[2])], [float(l22[0]), float(l22[1]), float(l22[2])]])
                true_cnt += 1
                scores[task][0] += mse
                scores[task][1] += mae
        if task=="vol_calc":
            output = output.strip().replace('\n', '').replace(' ', '')
            try:
                answer = float(output)
                scores[task] += eval_cell_volume(sample, answer)
                true_cnt += 1
            except:
                continue 
        if task=="formula_compute":
            output = output.strip().replace('\n', '').replace(' ', '')
            scores[task] += eval_formula(sample, output)
        if "generation" in task:
            scores[task] += eval_gen_format(output)
        
    if "dimensions_sem" not in task:
        scores[task] /= true_cnt
    else:
        scores[task][0] /= true_cnt
        scores[task][1] /= true_cnt

    print(scores[task], true_cnt, len(out_dict[task]))



atom count:0.9904351984696318 2091 2091
dimensions_synt:0.00375855521982749 2072 2091
atom name:0.9875657580105213 2091 2091
replace:0.9775227164036346 2091 2091
space group:0.9909134385461502 2091 2091
cell_volume:0.0 2072 2091
formula:0.9899569583931134 2091 2091
dimensions_sem:[0.0385536850168354, 0.03726850029814904] 2090 2091
vol_calc:0.014284558263578882 4182 2091


In [4]:
scores

{'atom count': 0.9904351984696318,
 'dimensions_synt': 0.00375855521982749,
 'atom name': 0.9875657580105213,
 'replace': 0.9775227164036346,
 'space group': 0.9909134385461502,
 'cell_volume': 0.0,
 'formula': 0.9899569583931134,
 'dimensions_sem': [0.0385536850168354, 0.03726850029814904],
 'vol_calc': 0.014284558263578882}

In [5]:
out_dict['atom name']

[['N \n',
  {'system': 'Draw upon your Material Science expertise to extract unit cell structure information from CIF files, utilizing your understanding of the file format.',
   'input': "Below is a CIF file.\n# generated using pymatgen\ndata_Ce2NbMoN4\n_symmetry_space_group_name_H-M   'P 1'\n_cell_length_a   3.27117500\n_cell_length_b   5.64984035\n_cell_length_c   10.83020100\n_cell_angle_alpha   89.96040902\n_cell_angle_beta   90.00000000\n_cell_angle_gamma   90.00000000\n_symmetry_Int_Tables_number   1\n_chemical_formula_structural   Ce2NbMoN4\n_chemical_formula_sum   'Ce4 Nb2 Mo2 N8'\n_cell_volume   200.15957375\n_cell_formula_units_Z   2\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Ce  Ce0  1  0.250000  0.580420  0.855977  1\n  Ce  Ce1  1  0.750000  0.419580  0.144023  1

In [None]:
with open('results.txt', 'w') as f:
    for task in out_dict:
        f.write(task + ": " + str(scores[task]) + '\n')

with open('samples.txt', 'w') as f:
    for task in out_dict:
        f.write(task + ":" + '\n')
        f.write(out_dict[task][0][1]['system']+out_dict[task][0][1]['input'] +'\n')
        f.write("Expected output:" +'\n')
        f.write(str(out_dict[task][0][1]['output']) +'\n')
        f.write("Model output:" +'\n')
        f.write(out_dict[task][0][0] +'\n')