In [30]:
%pip install --upgrade --quiet sagemaker

[0mNote: you may need to restart the kernel to use updated packages.


# 用微调好的模型去部署

In [5]:
# lasson
from sagemaker.jumpstart.estimator import JumpStartEstimator
training_job_name = "jumpstart-london-0712-lasson-llama2-7b-f-ui-2"
model_id = "meta-textgeneration-llama-2-7b-f"

model = JumpStartEstimator.attach(training_job_name, model_id)
# model.logs()


In [18]:
instance_type="ml.g5.2xlarge"
predictor = model.deploy(instance_type=instance_type)

---------!

In [16]:
!pip install nltk sacrebleu rouge

In [34]:
# lasson do
with open("./letter2.txt", "r", encoding="utf-8") as f:
    letter = f.read()
# print(letter)

response = predictor.predict({'inputs': letter,
                             'parameters': {'max_new_tokens': 128}})

print("Output:\n", response[0]["generated_text"].strip(), end="\n\n\n")

Output:
 Role : You are a experienced doctor who have memory of electronic medical records related to many diseases.

Instruction : please extract the referral reason from the following referral letter  separeted by ###. output your result

Rule For referral : this content should be a whole paragraph which tells Patient need referral. If the referral_letter contains this content, you should include it. If the letter doesn't contain related information, then it should be null.

output your result directly

###
Dr. Anthony Smith
Ophthalmology Department
Springfield Eye Clinic
123 Health Street
Springfield, XY 78910

October 12, 2023

Dear Dr. Smith,

I am writing to refer Ms. Jane Doe, a 52-year-old female, for further evaluation of suspected Papilledema.

Patient Information:
Name: Ms. Jane Doe
Age: 52
Medical History: Hypertension, Type 2 Diabetes
Visual Acuity: Right Eye: 20/30, Left Eye: 20/25

Referral Reason: During a routine eye examination, I observed swollen discs and indistinct

In [20]:
import sacrebleu
from rouge import Rouge

import json
import pandas as pd


def evaluate_jsonl_with_llama2(predictor, path, output_jsonl_path, csv_file_path):
    test_data_json = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            test_data_json.append(json.loads(line.strip()))
    rouge_score_list = []
    bleu_score_list = []

    rouge = Rouge()

    evaluate_list = []

    for single_test in test_data_json:
        instruction = single_test["instruction"]
        whole_letter = single_test["whole_letter"]
        referral_content = single_test["referral_content"]
        prompt = f"{instruction}\n\n###\n\n{whole_letter}\n\n###"
        response = predictor.predict({'inputs': prompt,
                                 'parameters': {'max_new_tokens': 256}})
        # print(prompt)
        reference_text = referral_content
        try:
            tmp = json.loads(response[0]["generated_text"].strip())
            candidate_text = tmp["referral_content"]
        except Exception as err:
            print(single_test["id"])
            print(response[0]["generated_text"].strip())
            print()
            candidate_text = "extract failure"
        finally:

            evaluate_list.append(candidate_text)
            # print("predict: " + candidate_text)
            # print("real: " + reference_text)


            bleu = sacrebleu.corpus_bleu([candidate_text], [[reference_text]])
            bleu_score_list.append(bleu.score)
            # print(bleu.score)
            single_test["bleu"] = bleu.score
            single_test["predict_referral_content"] = candidate_text
            # print()
            # 计算ROUGE分数
            scores = rouge.get_scores(candidate_text, reference_text) # 由于只有一个 reference，所以 avg没有影响
            rouge_score_list.append(scores)
            
#     with open(output_jsonl_path, mode='w', encoding='utf-8') as f:
#         for single_test in test_data_json:
#             f.write(json.dumps(single_test, ensure_ascii=False) + '\n')

#     print(f"predicted data has been saved to {output_path}.")
    
    # 创建 CSV 文件
    csv_data = []

    for single_test in test_data_json:
        csv_data.append({
            "id": single_test["id"],
            "name": single_test["name"],
            "instruction": single_test["instruction"],
            "whole_letter": single_test["whole_letter"],
            "referral_content": single_test["referral_content"],
            "predict_referral_content": single_test["predict_referral_content"],
            "bleu": single_test["bleu"],
        })

    # 创建 DataFrame
    df = pd.DataFrame(csv_data)

    # 保存为 CSV 文件
    df.to_csv(csv_file_path, index=False, encoding='utf-8')

    print(f"CSV file has been saved to {csv_file_path}")
    
    return evaluate_list, bleu_score_list, rouge_score_list

In [41]:
import pandas as pd

test_data_json = []

with open("./test_dir/test.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        test_data_json.append(json.loads(line.strip()))

# 创建 CSV 文件
csv_data = []
for i, single_test in enumerate(test_data_json):
    single_test["bleu"] = test_bleu_score_list[i]
    single_test["predict_referral_content"] = test_evaluate_list[i]
    
    csv_data.append({
        "id": single_test["id"],
        "name": single_test["name"],
        "instruction": single_test["instruction"],
        "whole_letter": single_test["whole_letter"],
        "referral_content": single_test["referral_content"],
        "predict_referral_content": single_test["predict_referral_content"],
        "bleu": single_test["bleu"],
    })

# 创建 DataFrame
df = pd.DataFrame(csv_data)

# 保存为 CSV 文件
csv_file_path = "./test_dir/predict_0712_finetuned_llama2_7b_f_test.csv"
df.to_csv(csv_file_path, index=False, encoding='utf-8')

print(f"CSV file has been saved to {csv_file_path}")
    

CSV file has been saved to ./test_dir/predict_0712_finetuned_llama2_7b_f_test.csv


In [21]:
test_evaluate_list, test_bleu_score_list,test_rouge_score_list = evaluate_jsonl_with_llama2(predictor, "./test_dir/test.jsonl")

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [22]:
print(test_evaluate_list)

['extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure', 'extract failure']


In [35]:
predictor.delete_endpoint()

# 用原始llama2-7b-f去部署

In [33]:
from sagemaker.jumpstart.model import JumpStartModel

pretrain_model_id, pretrain_model_version = "meta-textgeneration-llama-2-7b-f", "3.*"

pretrain_model = JumpStartModel(model_id=pretrain_model_id, model_version=pretrain_model_version, instance_type="ml.g5.2xlarge" )

pretrain_predictor = pretrain_model.deploy(accept_eula=True)

ImportError: cannot import name 'HubContentType' from 'sagemaker.jumpstart.types' (/opt/conda/lib/python3.10/site-packages/sagemaker/jumpstart/types.py)