In [3]:
import random, json
import numpy as np
from tqdm import tqdm
from scipy import stats
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     E:\Anaconda3_2024\envs\pytorch\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import os
from openai import AzureOpenAI
# Credentials are read from the environment. setdefault() only fills in the
# empty placeholder when the variable is not already set, so a key/endpoint
# exported outside the notebook is no longer overwritten with "".
os.environ.setdefault("AZURE_OPENAI_KEY", "")
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "")
# four models available: gpt-35-0125, gpt-4-0613, gpt-4-1106, gpt-4-32k-0613
model = "gpt-4-1106"
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-02-01",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)
# Each non-blank line of the file is split on tabs: column 0 goes to `names`,
# column 1 to `zt_scores`. Blank lines (e.g. a trailing newline at the end of
# the file) are skipped instead of raising IndexError.
with open('../data/zt_ori_84.txt', 'r') as f1:
    lines = f1.readlines()
rows = [line.strip().split('\t') for line in lines if line.strip()]
names = [row[0] for row in rows]
# Kept as strings, exactly as read from the file, so that any other cell using
# this global sees the same values as before.
zt_scores = [row[1] for row in rows]
# {"role": "user", "content": "Who were the founders of Microsoft?"}
# {"role": "system", "content": "Assistant is a large language model trained by OpenAI."}
# print(response)
# print(response.model_dump_json(indent=4))
# print(response.choices[0].message.content)

def get_descriptions(prompt_material, file_name, expected_count=84):
    '''Generate descriptions with the chat model and save them as sentence lists.

    One description is requested for the key word "thermoelectric" (fixed
    prompt) and one for every entry of the module-level ``names`` list. Each
    reply is split into sentences with ``sent_tokenize`` and the resulting
    ``{key: [sentence, ...]}`` mapping is written as JSON to
    ``../data/<file_name>.txt``.

    :param prompt_material: prompt template; it is filled with the compound
        name through ``prompt_material.format(name)``
    :param file_name: name (without the ``.txt`` suffix) of the file to be
        saved under ``../data``
    :param expected_count: required number of distinct entries in ``names``;
        pass ``None`` to skip the check. Defaults to 84, the value the check
        was previously hard-coded to.
    :return: dict mapping 'thermoelectric' and every name to a list of sentences
    :raises AssertionError: if ``names`` does not hold ``expected_count``
        distinct entries
    :raises TypeError: if the service returns a message without text content
    '''
    # Same condition the function used to assert after the loop (the number of
    # result keys found in `names` equals the number of distinct names), but
    # checked before any request is sent so a wrong name list fails fast.
    if expected_count is not None:
        assert len(set(names)) == expected_count, (
            'expected {} distinct names, got {}'.format(expected_count, len(set(names))))

    def ask(prompt):
        '''Send one single-turn user prompt; return the reply as a sentence list.'''
        response = client.chat.completions.create(
            model=model, messages=[{"role": "user", "content": prompt}])
        content = response.choices[0].message.content
        if content is None:
            # sent_tokenize(None) would fail with an opaque TypeError; name the
            # prompt that produced the empty reply instead.
            raise TypeError('no text content returned for prompt: {!r}'.format(prompt))
        return sent_tokenize(content)

    gpt4_descriptions85 = {}
    # prompt = 'Give me one paragragh of general description of the key word thermoelectric.'
    # The prompt text (including its spelling) is sent to the model verbatim.
    prompt = 'Give me several paragraghs of general description of the key word thermoelectric.'
    gpt4_descriptions85['thermoelectric'] = ask(prompt)
    for name in tqdm(names):
        gpt4_descriptions85[name] = ask(prompt_material.format(name))

    with open('../data/{}.txt'.format(file_name), 'w') as file:
        file.write(json.dumps(gpt4_descriptions85))
    return gpt4_descriptions85

# Three prompt / output-file pairs, one per description style. Each pair
# overwrites the previous one, and every get_descriptions(...) call is left
# commented out, so running this cell only leaves the last pair bound.
# The prompt strings are the exact text handed to get_descriptions, so their
# wording and spelling are kept verbatim.

# 1) one-paragraph general description
prompt_material, file_name = (
    'Give me one paragragh of general description of of the chemical compound {}.',
    'gpt4_descriptions85_sents',
)
# gpt4_descriptions85_sents = get_descriptions(prompt_material, file_name)

# 2) multi-paragraph general description
prompt_material, file_name = (
    'Give me several paragraphs of general description of the chemical compound {}.',
    'gpt4_descriptions85_more_sents',
)
# gpt4_descriptions85_more_sents = get_descriptions(prompt_material, file_name)

# 3) structured, detailed description (four lines joined by newlines)
prompt_material = (
    'Please provide information about the material compound {}, including:\n'
    'General description: Chemical formula and composition, Crystal structure, Key physical properties (e.g., melting point, density, hardness), Typical synthesis methods, Common applications;\n'
    'Thermoelectric properties: Seebeck coefficient (S), Electrical conductivity (σ), Thermal conductivity (κ), Figure of merit (ZT), Optimal temperature range for thermoelectric performance, Strategies for enhancing thermoelectric properties (e.g., doping, nanostructuring); \n'
    'And please provide a concise summary focusing on the most important aspects of the material compound and its thermoelectric properties.'
)
file_name = 'gpt4_descriptions85_detailed'
# gpt4_descriptions85_detailed = get_descriptions(prompt_material, file_name)

In [22]:
# with open('../data/gpt4_descriptions85.txt', 'r') as file:
#     gpt4_descriptions85 = json.loads(file.read())
# gpt4_descriptions85_sents = {}
# for key,value in gpt4_descriptions85.items():
#     gpt4_descriptions85_sents[key] = sent_tokenize(value)   # nltk.tokenize.sent_tokenize(paragragh_string)
# with open('../data/gpt4_descriptions85_sents.txt', 'w') as file:
#     file.write(json.dumps(gpt4_descriptions85_sents))   # each description paragragh segmented into a list of sentences

## average number of description sentences
def _load_and_count(path):
    '''Read a JSON mapping from `path`; return it with the length of every list value.'''
    with open(path, 'r') as handle:
        descriptions = json.loads(handle.read())
    # Only list-valued entries are counted; other value types are ignored.
    lengths = [len(entry) for entry in descriptions.values() if type(entry) is list]
    return descriptions, lengths

gpt4_descriptions85_sents, num_sents = _load_and_count('../data/gpt4_descriptions85_sents.txt')
print(np.mean(num_sents))

gpt4_descriptions85_more_sents, num_sents = _load_and_count('../data/gpt4_descriptions85_more_sents.txt')
print(np.mean(num_sents))

gpt4_descriptions85_detailed, num_sents = _load_and_count('../data/gpt4_descriptions85_detailed.txt')
print(np.mean(num_sents))

4.964705882352941
15.435294117647059
25.188235294117646


In [None]:
def evaluation(file_name):
    '''Return the Spearman correlation between description similarity and ZT score.

    Loads the JSON mapping stored in ``../data/<file_name>.txt`` and the
    tab-separated name / score table in ``../data/zt_ori_84.txt``. Every
    description is embedded with the SentenceTransformer saved at
    ``../outputs/SentMatBERT_MNR``; a list of sentences is encoded and averaged
    over axis 0, a plain string is encoded as is. The cosine similarity between
    each named entry and the 'thermoelectric' entry is then rank-correlated
    with the scores.

    :param file_name: file stem under ``../data``, e.g. context_sents85,
        rag_description85, gpt4_descriptions85, gpt4_descriptions85_sents
    :return: the Spearman correlation coefficient
    :raises TypeError: if a description is neither a list nor a str
    :raises ValueError: if a score in the table cannot be parsed as a float
    '''
    file_path = '../data/{}.txt'.format(file_name)
    with open(file_path, 'r') as file:
        dic = json.loads(file.read())

    names, zt_scores = [], []
    with open('../data/zt_ori_84.txt', 'r') as f1:
        for line in f1:
            fields = line.strip().split('\t')
            if fields == ['']:
                # blank line (e.g. trailing newline at end of file)
                continue
            names.append(fields[0])
            # Parse the score: handing spearmanr the raw strings does not rank
            # them numerically. NOTE(review): assumes column 1 is a plain
            # decimal number on every row -- confirm against the data file.
            zt_scores.append(float(fields[1]))

    # Named `encoder` so it does not shadow the module-level `model` string.
    encoder = SentenceTransformer('../outputs/SentMatBERT_MNR')

    def embed(description):
        '''Encode one description: lists are averaged over axis 0, strings encoded directly.'''
        if isinstance(description, list):
            return np.mean(encoder.encode(description), axis=0)
        if isinstance(description, str):
            return encoder.encode(description)
        # Previously this case left the embedding variable unbound.
        raise TypeError('description must be a list or str, got {}'.format(
            type(description).__name__))

    center_embedding = embed(dic['thermoelectric'])

    # scipy's `cosine` is a distance, so 1 - distance gives the similarity.
    cos_sims = [1 - cosine(center_embedding, embed(dic[name])) for name in tqdm(names)]

    corr, _ = stats.spearmanr(cos_sims, zt_scores)
    return corr

# print(evaluation('context_sents85'))   # 0.5924
# print(evaluation('gpt4_descriptions85'))   # 0.4409
# Print the correlation for each cached description file, in the same order as
# before; the trailing comments carry over the scores noted next to each call.
for description_set in (
    'gpt4_descriptions85_sents',        # 0.4968 (5 sents per material on average, general description)
    'gpt4_descriptions85_more_sents',   # 0.5553 (15 sents per material on average, general description)
    'gpt4_descriptions85_detailed',     # 0.0779 (25 sents per material on average, detailed description)
):
    print(evaluation(description_set))