In [60]:
# Inference
import os
import sys
import math
import json
import torch
import argparse
import textwrap
import transformers
from peft import PeftModel
from transformers import GenerationConfig, TextStreamer
from llama_attn_replace import replace_llama_attn
from tqdm.notebook import tqdm
import time

# Prompt templates keyed by name. Each template has a single ``{instruction}``
# placeholder that is filled in before the text is sent to the model.
PROMPT_DICT = {
    # Generic "### Instruction / ### Response" template.
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
    # [INST] template that embeds a safety-oriented system prompt inside <<SYS>> tags.
    "prompt_no_input_llama2": (
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} [/INST]"
    ),
    # Bare [INST] wrapper without a system prompt; this is the template main() uses.
    "prompt_llama2": "[INST]{instruction}[/INST]"
}

def _str2bool(value):
    """Convert a command-line token such as "true", "0" or "no" into a bool.

    ``argparse`` with ``type=bool`` calls ``bool("False")``, which is ``True``
    because every non-empty string is truthy. This converter interprets the
    text instead.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string is kept falsy to match the previous bool("") behaviour.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_config():
    """Parse the command-line options of the inference script.

    Returns:
        argparse.Namespace with ``file_path``, ``base_model``, ``cache_dir``,
        ``context_size``, ``flash_attn``, ``temperature``, ``top_p`` and
        ``max_gen_len``.
    """
    parser = argparse.ArgumentParser(description='arg parser')
    parser.add_argument('--file_path', type=str, default="")
    parser.add_argument('--base_model', type=str, default="/data1/pretrained-models/llama-7b-hf")
    parser.add_argument('--cache_dir', type=str, default="./cache")
    parser.add_argument('--context_size', type=int, default=-1, help='context size during fine-tuning')
    # Parsed with _str2bool so that "--flash_attn False" really disables it.
    parser.add_argument('--flash_attn', type=_str2bool, default=False, help='')
    parser.add_argument('--temperature', type=float, default=0.6, help='')
    parser.add_argument('--top_p', type=float, default=0.9, help='')
    parser.add_argument('--max_gen_len', type=int, default=512, help='')
    args = parser.parse_args()
    return args

def read_json_file(file_path):
    """Load the JSON document stored at ``file_path`` and return the parsed object."""
    with open(file_path, 'r') as handle:
        return json.load(handle)
    

def build_generator(
    model, tokenizer, temperature=0.6, top_p=0.9, max_gen_len=4096, use_cache=True
):
    """Build a ``response(prompt)`` closure that generates text with ``model``.

    Args:
        model: causal language model exposing ``device`` and ``generate``.
        tokenizer: tokenizer matching ``model``; called on the prompt and used
            to decode the generated ids.
        temperature: forwarded to ``model.generate``.
        top_p: forwarded to ``model.generate``.
        max_gen_len: maximum number of new tokens to generate.
        use_cache: forwarded to ``model.generate``.

    Returns:
        A function that takes ``prompt`` (str) and returns the generated
        continuation with surrounding whitespace stripped.
    """
    # NOTE(review): do_sample is not passed here, so whether temperature/top_p
    # take effect depends on the model's own generation config -- confirm.
    def response(prompt):
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        # To watch the text as it is generated, create
        # `streamer = TextStreamer(tokenizer)` and pass `streamer=streamer`
        # to model.generate below.
        output = model.generate(
            **inputs,
            max_new_tokens=max_gen_len,
            temperature=temperature,
            top_p=top_p,
            use_cache=use_cache,
        )

        # A causal LM returns the prompt ids followed by the new ids. Slicing
        # off the prompt by token count avoids searching the decoded text for
        # the prompt string, which fails with IndexError whenever decoding
        # does not reproduce the prompt verbatim.
        prompt_len = inputs["input_ids"].shape[-1]
        new_tokens = output[0][prompt_len:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return response

# Format the input so that it matches the format of the training data (input part)
def input_format(data):
    """Build the Q&A-generation prompt for one paper record.

    Args:
        data: mapping with ``'txt'`` (the paper text) and ``'keywords'``
            (an iterable of strings, joined with commas in the prompt).

    Returns:
        The prompt string embedding the paper text and the keyword list.

    Raises:
        KeyError: if ``'txt'`` or ``'keywords'`` is missing from ``data``.
    """
    paper = data['txt']
    keywords = ','.join(data['keywords'])
    prompt = f"Attached is a detailed scientific paper.\n\n{paper}\n\nYour task is to formulate 10 sophisticated Q&A pairs that delve into the underlying scientific principles and knowledge presented in this paper, focusing specifically on {keywords}. Steer clear of questions that are purely section-specific (e.g., 'What does Figure 5 represent?') or basic or definitional questions (e.g., 'What is XXX?'). Instead, focus on questions that require a deeper understanding of the subject matter, especially those relating to complex chemical compounds (like Al2O3, C2H5OH, TNT). Ensure diversity in your Q&A pairs, avoiding any duplication. Answers should be rich in detail, drawing on specific data, chemical properties, and contextual insights from the paper. Strive for clarity and depth in your responses, aiming to enhance the reader's comprehension of the intricate concepts discussed."

    return prompt

def main(args):
    """Run Q&A generation over every paper listed in ``args.file_path``.

    Loads the model and tokenizer from ``args.base_model``, optionally patches
    attention via ``replace_llama_attn``, applies linear RoPE scaling when
    ``args.context_size`` exceeds the model's ``max_position_embeddings``,
    then generates one output per record of the input JSON file.

    Args:
        args: namespace with ``file_path``, ``base_model``, ``cache_dir``,
            ``context_size``, ``flash_attn``, ``temperature``, ``top_p`` and
            ``max_gen_len``.

    Returns:
        A list of dicts with keys ``'doi'``, ``'input'`` (the full prompt) and
        ``'output'`` (the generated text), in input order.
    """
    if args.flash_attn:
        replace_llama_attn(inference=True)

    # Set RoPE scaling factor
    config = transformers.AutoConfig.from_pretrained(
        args.base_model,
        cache_dir=args.cache_dir,
    )

    orig_ctx_len = getattr(config, "max_position_embeddings", None)
    if orig_ctx_len and args.context_size > orig_ctx_len:
        scaling_factor = float(math.ceil(args.context_size / orig_ctx_len))
        config.rope_scaling = {"type": "linear", "factor": scaling_factor}

    # Load model and tokenizer
    model = transformers.AutoModelForCausalLM.from_pretrained(
        args.base_model,
        config=config,
        cache_dir=args.cache_dir,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    # NOTE(review): 32001 is hard-coded; presumably the base vocabulary plus
    # one added special token -- confirm against the fine-tuned checkpoint.
    model.resize_token_embeddings(32001)

    tokenizer_kwargs = {
        "cache_dir": args.cache_dir,
        "padding_side": "right",
        "use_fast": False,
    }
    # The config may not define max_position_embeddings; treat that as 0 so
    # the comparison cannot fail on None, and only pass model_max_length when
    # a positive limit is actually known.
    model_max_length = max(args.context_size, orig_ctx_len or 0)
    if model_max_length > 0:
        tokenizer_kwargs["model_max_length"] = model_max_length
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.base_model,
        **tokenizer_kwargs,
    )

    # Compare the major version numerically; a plain string comparison would
    # order a future "10.x" before "2".
    torch_major = int(torch.__version__.split(".")[0])
    if torch_major >= 2 and sys.platform != "win32":
        model = torch.compile(model)
    model.eval()

    respond = build_generator(model, tokenizer, temperature=args.temperature, top_p=args.top_p,
                              max_gen_len=args.max_gen_len, use_cache=True)

    datas = read_json_file(args.file_path)
    prompt_template = PROMPT_DICT["prompt_llama2"]

    instruction = "As a material science expert, utilize your expertise to analyze the provided scientific paper."
    results = []
    window_start = time.time()
    for idx, data in enumerate(datas):
        prompt = prompt_template.format_map({"instruction": instruction + "\n" + input_format(data)})
        output = respond(prompt=prompt)
        results.append({'doi': data['doi'], 'input': prompt, 'output': output})
        # Report the wall-clock time spent since the previous report
        # every 10 records, then restart the timer.
        if idx % 10 == 0:
            now = time.time()
            print(f'{idx}: {now-window_start}')
            window_start = time.time()
    return results

In [61]:
import argparse

# Notebook stand-in for the command line: the same option names that
# parse_config() defines, packed into a Namespace and handed to main().
args_dict = dict(
    file_path='/fl/data/acl/original/acl_eval_update.json',
    base_model='/fl/model/vicuna/llama2-7b-12k',
    cache_dir='./cache',
    context_size=12888,
    flash_attn=True,
    temperature=0.6,
    top_p=0.9,
    max_gen_len=12888,
)
args = argparse.Namespace(**args_dict)

res = main(args)
res

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0: 36.78651976585388
10: 373.960896730423
20: 351.62992668151855
30: 349.70090532302856
40: 343.12568521499634


Token indices sequence length is longer than the specified maximum sequence length for this model (15416 > 12888). Running this sequence through the model will result in indexing errors


50: 389.2591211795807
60: 361.09788727760315
70: 360.734482049942
80: 1432.6345703601837
90: 351.71143865585327
100: 344.2150914669037
110: 349.33943700790405
120: 366.43300914764404
130: 326.4101474285126
140: 333.6437246799469
150: 328.5180959701538
160: 339.9255440235138
170: 385.0532178878784
180: 322.3320896625519
190: 363.4489061832428


In [69]:
# Persist the raw (unprocessed) inference results as indented JSON
file_path = '/fl/data/acl/original/acl_eval_result.json'
with open(file_path, 'w') as out_file:
    out_file.write(json.dumps(res, indent=4))

### Process original inference results

In [1]:
import json
def read(file_path):
    """Return the parsed contents of the JSON file at ``file_path``."""
    with open(file_path, 'r') as source:
        parsed = json.load(source)
    return parsed

# Reload the raw inference results that were written to disk earlier
res = read('/fl/data/acl/original/acl_eval_result.json')
# Show the keys of the first record and the total number of records
res[0].keys(), len(res)

(dict_keys(['doi', 'input', 'output']), 195)

In [38]:
import re
# Leading list numbering such as "3. " at the very start of a Q&A chunk.
pattern = r"^\d+\.\s"


def process_qa(qa):
    """Split one raw "question / answer" chunk into its two parts.

    The leading numbering (e.g. ``"1. "``) is removed and the text is split on
    ``'\\nA:'``; the first piece is the question and the second the answer.

    Args:
        qa: one chunk of model output, expected to contain ``'\\nA:'``.

    Returns:
        ``{'Q': <question>, 'A': 'A: <answer>'}``, or ``None`` when ``qa`` is
        not a string or contains no ``'\\nA:'`` separator.
    """
    # Explicit checks replace the former bare ``except:`` so that unrelated
    # errors are no longer silently hidden.
    if not isinstance(qa, str):
        return None
    parts = re.sub(pattern, "", qa).split('\nA:')
    if len(parts) < 2:
        return None
    return {'Q': parts[0].strip(), 'A': f'A: {parts[1].strip()}'}

def deal_qa(qas):
    """Parse a raw model output into a list of unique Q&A pairs.

    The text is split on blank lines, each chunk is parsed with
    ``process_qa``, chunks that fail to parse are dropped, and only the first
    pair for each distinct question text is kept.

    Args:
        qas: the raw generated text containing the Q&A pairs.

    Returns:
        A list of ``{'Q': ..., 'A': ...}`` dicts in order of first appearance.
    """
    # A dict keeps insertion order, so setdefault both de-duplicates on the
    # question text and preserves the first-seen ordering.
    first_by_question = {}
    for qa in map(process_qa, qas.split('\n\n')):
        if qa is not None:
            first_by_question.setdefault(qa['Q'], qa)
    return list(first_by_question.values())

In [39]:
# Paper metadata, plus the raw model outputs indexed by DOI
info = read('/fl/data/acl/original/acl_eval_update.json')
res_doi = {entry['doi']: entry for entry in res}

# Attach the parsed Q&A pairs (and the raw text) to every paper record
final = []
for paper in info:
    raw_output = res_doi[paper['doi']]['output']
    parsed_pairs = deal_qa(raw_output)
    paper.update({
        'num_Q&A': len(parsed_pairs),
        'Q&A': parsed_pairs,
        'raw_output': raw_output,
    })
    final.append(paper)

In [40]:
final[0]

{'source': 'C:\\Users\\yixliu1\\Desktop\\txt_output_three_filtered_rep_skipped_240103\\Chemistry, Analytical\\10.1038-nmat1035.txt',
 'doi': '10.1038/nmat1035',
 'abstract': "Molecular-dynamics simulations have recently been used to elucidate the transition with decreasing grain size from a dislocation-based to a grain-boundary-based deformation mechanism in nanocrystalline f.c.c. metals. This transition in the deformation mechanism results in a maximum yield strength at a grain size (the 'strongest size') that depends strongly on the stacking-fault energy, the elastic properties of the metal, and the magnitude of the applied stress. Here, by exploring the role of the stacking-fault energy in this crossover, we elucidate how the size of the extended dislocations nucleated from the grain boundaries affects the mechanical behaviour. Building on the fundamental physics of deformation as exposed by these simulations, we propose a two-dimensional stress-grain size deformation-mechanism map 

In [41]:
# Keep only the papers whose output parsed into exactly 10 unique Q&A pairs
process = [i for i in final if i['num_Q&A']==10]
len(process)

184

In [45]:
# Write the filtered evaluation set to disk as indented JSON
file_path = '/fl/data/acl/acl_eval_final.json'
with open(file_path, 'w') as sink:
    json.dump(process, sink, indent=4)