In [1]:
def get_generated_tokens(completion):
    """Return the generated token strings of the first choice, in order.

    Reads ``completion.choices[0].logprobs.content`` and collects the
    ``token`` attribute of every entry.
    """
    first_choice = completion.choices[0]
    generated = []
    for entry in first_choice.logprobs.content:
        generated.append(entry.token)
    return generated

def split_lines(tokens):
    """Group a token sequence into lines.

    A line is closed by the first token that contains a newline character;
    that token stays in the line it terminates. Trailing tokens that are not
    followed by a newline form a final line.

    Returns:
        A ``(line_split, token_split)`` pair. ``token_split`` is a list of
        token lists, one per line. ``line_split`` holds the joined text of
        each line with every newline character removed.
    """
    token_split = [[]]
    for token in tokens:
        token_split[-1].append(token)
        if '\n' in token:
            # The newline token closes the current line; open the next one.
            token_split.append([])

    # Drop the trailing group when the last token ended its line (or when
    # there were no tokens at all).
    if not token_split[-1]:
        token_split.pop()

    line_split = [''.join(group).replace('\n', '') for group in token_split]
    return line_split, token_split

In [2]:
def token_indices_by_line(tokens):
    """Return the indices of ``tokens`` grouped by line.

    A line is closed by the first token that contains a newline character
    (that token belongs to the line it ends); trailing tokens without a
    newline form a final line. The groups partition ``range(len(tokens))``
    in order and are never empty.

    Example: ``["a", "b\\n", "c"]`` -> ``[[0, 1], [2]]``.

    The grouping is done in a single pass over the tokens instead of
    re-summing the lengths of all previous lines for every line.
    """
    token_indices_split = []
    current_line = []
    for index, token in enumerate(tokens):
        current_line.append(index)
        if '\n' in token:
            token_indices_split.append(current_line)
            current_line = []

    # Tokens after the last newline still form a line.
    if current_line:
        token_indices_split.append(current_line)
    return token_indices_split


def code_token_indices_by_line(tokens, lang="python"):
    """
    Extract the code lines of the first fenced code block for ``lang`` and
    return the token indices of each of those lines.

    Suppose there are two code lines, the first made of the tokens indexed
    3 to 7 and the second of the tokens indexed 8 to 10. The following list
    is returned:
    [[3,4,5,6,7],[8,9,10]]

    The block starts after a line beginning with "```" + lang and ends at the
    next line beginning with "```"; neither fence line is included. Fence
    lines that appear before the block has started (for example a block in
    another language) are ignored rather than ending the scan, so a leading
    non-matching block no longer produces an empty result.

    Args:
    tokens: The generated tokens, in order
    lang: The language tag expected right after the opening fence

    Returns:
    A list with one list of token indices per code line
    """
    line_split, token_split = split_lines(tokens)
    opening_fence = "```" + lang

    code_token_indices_split = []
    in_code = False
    next_index = 0  # index of the first token of the current line
    for line, line_tokens in zip(line_split, token_split):
        line_indices = list(range(next_index, next_index + len(line_tokens)))
        next_index += len(line_tokens)

        if line.startswith(opening_fence):
            in_code = True
        elif not in_code:
            # Anything before the opening fence, including other fences.
            continue
        elif line.startswith("```"):
            # Closing fence of the code block: stop scanning.
            break
        else:
            code_token_indices_split.append(line_indices)
    return code_token_indices_split

In [3]:
import numpy as np
from scipy.stats import entropy as cal_entropy


def get_token_logprobs(completion):
    """Return ``(token, logprob)`` pairs for every generated token of the first choice."""
    entries = completion.choices[0].logprobs.content
    return [(entry.token, entry.logprob) for entry in entries]

def get_sequence_logprob(completion):
    """Return the sum of the log-probabilities of all generated tokens."""
    logprobs = []
    for _token, logprob in get_token_logprobs(completion):
        logprobs.append(logprob)
    return np.sum(logprobs)


def get_token_top_logprobs(completion):
    """
    Return one ``(token, top_logprobs)`` tuple per generated token.

    ``token`` is the token that was actually generated and ``top_logprobs``
    is a list of ``(candidate_token, logprob)`` pairs taken from the entry's
    ``top_logprobs``. Example tuple:
    ('```', [('```', -0.28346914), ('Here', -1.8964262), ('Below', -2.5428724), ('The', -4.9661055), ('Certainly', -5.1264677)])
    """
    def candidates(entry):
        # (token, logprob) pairs of the alternatives recorded for one position.
        return [(alt.token, alt.logprob) for alt in entry.top_logprobs]

    entries = completion.choices[0].logprobs.content
    return [(entry.token, candidates(entry)) for entry in entries]


def normalize_probs(probs):
    """Scale ``probs`` by the reciprocal of their sum.

    A ``list`` input yields a new list; any other input is multiplied by the
    scale factor directly (e.g. a NumPy array is scaled element-wise).
    """
    scale = 1 / sum(probs)
    if not isinstance(probs, list):
        return scale * probs
    return [scale * value for value in probs]


def print_token_probs(completion):
    """Print every generated token with its probability and top alternatives.

    First prints the raw ``(token, top_logprobs)`` list, then one table row
    per generated token: the token's repr, its probability (exp of its
    log-probability) and the alternatives with their probabilities rounded
    to 6 decimals.

    The previously computed but unused maximum token length was removed; it
    made the function raise on a completion without tokens.
    """
    token_logprobs = get_token_logprobs(completion)
    token_top_logprobs = get_token_top_logprobs(completion)
    print(token_top_logprobs)
    print("=" * 25)
    for (token, logprob), (_, top_logprobs) in zip(token_logprobs, token_top_logprobs):
        # Convert the log-probabilities of the alternatives to probabilities.
        top_probs = [(alt, round(np.exp(alt_logprob), 6)) for alt, alt_logprob in top_logprobs]
        print("| %12s | %.6f | %s" % (repr(token), np.exp(logprob), str(top_probs)))
    print("=" * 25)

### MTE

In [4]:
def get_token_entropies(completion):
    """Return ``(token, entropy)`` pairs for every generated token.

    For each position, the log-probabilities of the recorded top candidates
    are exponentiated, rescaled with ``normalize_probs`` so they sum to one,
    and passed to ``cal_entropy``.
    """
    def entropy_of(top_logprobs):
        # Entropy of the renormalized candidate distribution of one position.
        candidate_probs = [np.exp(logprob) for _, logprob in top_logprobs]
        return cal_entropy(normalize_probs(candidate_probs))

    return [
        (token, entropy_of(top_logprobs))
        for token, top_logprobs in get_token_top_logprobs(completion)
    ]


def cal_MTE_global(completion, print_all=False):
    """Return the mean token entropy over all generated tokens.

    Args:
    completion: The completion object containing logprobs and content
    print_all: When true, also print the resulting mean

    Returns:
    The mean of the per-token entropies from ``get_token_entropies``
    """
    entropy_values = []
    for _token, entropy in get_token_entropies(completion):
        entropy_values.append(entropy)
    mean_token_entropy = np.mean(entropy_values)

    if not print_all:
        return mean_token_entropy

    print("Mean Token Entropy: %.6f" % (mean_token_entropy))
    return mean_token_entropy

def cal_MTE(completion, print_all=False):
    """Compute the mean token entropy (MTE) of every generated line.

    Args:
    completion: The completion object containing logprobs and content
    print_all: When true, print a table of lines and their MTE scores

    Returns:
    A list of (line, mte_score) tuples in generation order, where ``line`` is
    the concatenation of the line's tokens (newline included)
    """
    tokens = get_generated_tokens(completion)
    token_entropies = get_token_entropies(completion)

    lines_info = []
    for line_indices in token_indices_by_line(tokens):
        if line_indices:  # lines without tokens are skipped
            line = ''.join(tokens[i] for i in line_indices)
            mte_score = np.mean([token_entropies[i][1] for i in line_indices])
            lines_info.append((line, mte_score))

    if print_all:
        print("<Mean Token Entropy by line>")
        print("=" * 25)
        # Pad the line column to the widest repr so the scores line up.
        max_length = max(len(repr(line)) for line, _ in lines_info)
        for line, line_entropy in lines_info:
            print(f"| {repr(line):<{max_length}} | {line_entropy:.6f}")
        print("=" * 25)
    return lines_info


### MSP

In [5]:
def cal_MSP_global(completion, print_all=False):
    """
    Calculate the Maximum Softmax Probability (MSP) uncertainty score for the whole completion.

    The score is 1 - P(y | x), where P(y | x) is the product of the
    probabilities of all generated tokens, computed as exp(sum of the token
    log-probabilities).

    Args:
    completion: The completion object containing logprobs and content
    print_all: Accepted for parity with the other scorers; not used here

    Returns:
    The MSP score of the full response (a single number)
    """
    token_logprobs = np.array(
        [logprob for _, logprob in get_token_logprobs(completion)],
        dtype=float,
    )
    # Summing in log space and exponentiating once gives the sequence probability.
    sequence_prob = np.exp(np.sum(token_logprobs))
    return 1 - sequence_prob

def cal_MSP(completion, print_all=False):
    """
    Calculate the Maximum Softmax Probability (MSP) uncertainty score for each line of the completion.

    Suppose the prompt is x and the response y contains M lines
    y(1), y(2), ... y(M). For a line y(m) with L tokens:
    MSP(y(m) | y(<m), x) = 1 - P(y(m) | y(<m), x)
    P(y(m) | y(<m), x) = prod(l=1 to L) P(y_l(m) | y_<l(m), y(<m), x)
    The product is evaluated as exp(sum of the line's token log-probabilities).

    Args:
    completion: The completion object containing logprobs and content
    print_all: Boolean flag to print a table of lines and their MSP scores

    Returns:
    A list of (line, msp_score) tuples, one per line in generation order,
    where ``line`` is the concatenation of the line's tokens (newline included)
    """
    tokens = get_generated_tokens(completion)
    token_logprobs = get_token_logprobs(completion)

    lines_info = []
    for line_indices in token_indices_by_line(tokens):
        if not line_indices:  # Skip empty lines
            continue

        line = ''.join(tokens[i] for i in line_indices)
        line_logprob = sum((token_logprobs[i][1] for i in line_indices), 0.0)
        msp_score = 1 - np.exp(line_logprob)
        lines_info.append((line, msp_score))

    if print_all:
        print("<Lines, Line probabilities, and MSP scores>")
        print("=" * 60)
        # default=0 keeps the table printable when there are no lines.
        max_length = max((len(repr(line)) for line, _ in lines_info), default=0)
        for (line, msp_score) in lines_info:
            print(f"| {repr(line):<{max_length}} | {msp_score:.6f}")
        print("=" * 60)
    return lines_info

### PP(Perplexity)

In [6]:
import numpy as np

def cal_PP_global(completion, print_all=False):
    """
    Calculate the Perplexity (PP) uncertainty score for the whole completion.

    With N generated tokens y_1 ... y_N and prompt x:
    PP(y | x) = 2^(-1/N * sum(n=1 to N) log2[P(y_n | y_<n, x)])
    The token log-probabilities are treated as natural logs and converted to
    base 2 by dividing their mean by ln(2).

    Args:
    completion: The completion object containing logprobs and content
    print_all: Accepted for parity with the other scorers; not used here

    Returns:
    The perplexity of the full response (a single number)
    """
    # Every token belongs to exactly one line, so averaging over all tokens
    # directly is the same as walking them line by line.
    logprobs = [logprob for _, logprob in get_token_logprobs(completion)]
    avg_log_prob = np.mean(logprobs)
    avg_log2_prob = avg_log_prob / np.log(2)
    pp_score = 2 ** (-avg_log2_prob)

    return pp_score

def cal_PP(completion, print_all=False):
    """
    Calculate the Perplexity (PP) uncertainty score for each line of the completion.

    Suppose the prompt is x and the response y contains M lines
    y(1), y(2), ... y(M). For a line y(m) with L tokens:
    PP(y(m) | y(<m), x) = 2^(-1/L * sum(l=1 to L) log2[P(y_l(m) | y_<l(m), y(<m), x)])
    The token log-probabilities are treated as natural logs and converted to
    base 2 by dividing their mean by ln(2).

    Args:
    completion: The completion object containing logprobs and content
    print_all: Boolean flag to print a table of lines and their PP scores

    Returns:
    A list of (line, pp_score) tuples, one per line in generation order,
    where ``line`` is the concatenation of the line's tokens (newline included)
    """
    tokens = get_generated_tokens(completion)
    token_logprobs = get_token_logprobs(completion)

    lines_info = []
    for line_indices in token_indices_by_line(tokens):
        if not line_indices:  # Skip empty lines
            continue

        line = ''.join(tokens[i] for i in line_indices)
        # tokens and token_logprobs come from the same entries, so every
        # line index is valid for both.
        avg_log_prob = np.mean([token_logprobs[i][1] for i in line_indices])
        # Convert from natural log to log base 2
        avg_log2_prob = avg_log_prob / np.log(2)
        pp_score = 2 ** (-avg_log2_prob)
        lines_info.append((line, pp_score))

    if print_all:
        print("<Lines, Average log2 probabilities, and PP scores>")
        print("=" * 70)
        # default=0 keeps the table printable when there are no lines.
        max_length = max((len(repr(line)) for line, _ in lines_info), default=0)
        for (line, pp_score) in lines_info:
            print(f"| {repr(line):<{max_length}} | {pp_score:.6f}")
        print("=" * 70)

    return lines_info

### pTrue

In [24]:
import re

def gen_verification_prompt(original_user_massage, original_response, sentence, task_type='sentence'):
    """Build the single-message chat that asks whether ``sentence`` is correct.

    The message shows the original question, the full proposed answer and the
    quoted ``sentence`` (described as a ``task_type``), followed by a
    True/False multiple-choice instruction.

    Returns:
        A list with one ``{"role": "user", "content": ...}`` message.
    """
    prompt_lines = [
        f"Question: {original_user_massage}",
        f"Proposed Answer: {original_response}",
        f'Is the {task_type} "{sentence}" in the proposed answer correct?',
        "(A) True",
        "(B) False",
        "Answer True or False without explanation:",
    ]
    return [{"role": "user", "content": "\n".join(prompt_lines)}]


def cal_pTrue_local(completion):
    """Extract the probability of a "True" answer from a verification completion.

    Lookup order:
    1. The first generated token whose lowercased text contains "true":
       return exp(logprob) of that token.
    2. Otherwise, the first position whose top candidates include such a
       token: return exp(logprob) of the first matching candidate.
    3. Otherwise return 0.
    """
    for primary_token, log_prob in get_token_logprobs(completion):
        if "true" in primary_token.lower():
            return np.exp(log_prob)

    # The candidate lists are only needed when "True" was not generated itself.
    for _, top_logprobs in get_token_top_logprobs(completion):
        for token, log_prob in top_logprobs:
            if "true" in token.lower():
                return np.exp(log_prob)  # Convert log probability to probability

    return 0

def cal_pTrue_global(client, original_prompt, completion, task_type='sentence', print_all=False):
    """
    Calculate the P(True) score for the whole response.

    The full response is submitted as the statement to verify, with one chat
    completion request sent through ``client``.

    Args:
    client: Object exposing ``chat.completions.create`` used for the verification request
    original_prompt: The original prompt given to the model
    completion: The primary response completion object
    task_type: The type of task ('sentence', 'line', or 'code')
    print_all: Accepted for parity with the other scorers; not used here

    Returns:
    The value ``cal_pTrue_local`` extracts from the verification completion
    """
    original_response = ''.join(get_generated_tokens(completion))
    request = {
        "model": "gpt-4-turbo-2024-04-09",
        # The whole response is both the proposed answer and the quoted statement.
        "messages": gen_verification_prompt(original_prompt, original_response, original_response, task_type),
        "temperature": 0,
        "logprobs": True,
        "top_logprobs": 5,
        "max_tokens": 512,
    }
    return cal_pTrue_local(client.chat.completions.create(**request))

def cal_pTrue(client, original_prompt, completion, task_type='sentence', print_all=False):
    """
    Calculate the P(True) score for each line in the response.

    Every line is verified with its own chat completion request sent through
    ``client``; the prompt shows the full response and asks whether that one
    line is correct.

    Args:
    client: Object exposing ``chat.completions.create`` used for the verification requests
    original_prompt: The original prompt given to the model
    completion: The primary response completion object
    task_type: The type of task ('sentence', 'line', or 'code')
    print_all: Boolean flag to print a table of lines and their P(True) scores

    Returns:
    A list of (line, pTrue) tuples, one per line in generation order,
    where ``line`` is the concatenation of the line's tokens (newline included)
    """
    tokens = get_generated_tokens(completion)
    original_response = ''.join(tokens)

    lines_info = []
    for line_indices in token_indices_by_line(tokens):
        if not line_indices:  # Skip empty lines
            continue

        line = ''.join(tokens[i] for i in line_indices)
        verification_prompt = gen_verification_prompt(original_prompt, original_response, line, task_type)
        verification_completion = client.chat.completions.create(
          model="gpt-4-turbo-2024-04-09",
          messages= verification_prompt,
          temperature=0,
          logprobs=True,
          top_logprobs=5,
          max_tokens=512
        )
        lines_info.append((line, cal_pTrue_local(verification_completion)))

    if print_all:
        print("<Sentences and P(True) scores>")
        print("=" * 70)
        # default=0 keeps the table printable when there are no lines.
        max_length = max((len(repr(line_sentence)) for line_sentence, _ in lines_info), default=0)
        for (line_sentence, pTrue) in lines_info:
            print(f"| {repr(line_sentence):<{max_length}} | {pTrue:.6f}")
        print("=" * 70)

    return lines_info

### Similarity (Code vector)

In [25]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

def code_vectorized_similarity(code1, code2, method="Cosine", embedding_type="CodeBert"):
    """
    Calculate the similarity between two code segments using the specified method and embedding type.

    Each segment is tokenized (truncated to 512 tokens), run through the
    selected model, and represented by the mean of the last hidden state over
    the token dimension.

    Args:
    code1, code2: the two code segments, each is a string
    method: "Cosine" or "Euclidean", the method to calculate the code similarity in vector space
    embedding_type: "CodeBert" or "GraphCodeBert", the embedding method

    Returns:
    A scalar that represents the similarity score of code1 and code2:
    the cosine similarity of the two embeddings, or 1 / (1 + Euclidean distance)

    Raises:
    ValueError: if ``embedding_type`` or ``method`` is not one of the supported values
    """
    # Choose the appropriate model based on the embedding type
    if embedding_type == "CodeBert":
        model_name = "microsoft/codebert-base"
    elif embedding_type == "GraphCodeBert":
        model_name = "microsoft/graphcodebert-base"
    else:
        raise ValueError("Invalid embedding type. Choose 'CodeBert' or 'GraphCodeBert'.")

    # Reject an unsupported method before loading the model and embedding
    # both inputs, instead of after all that work has been done.
    if method not in ("Cosine", "Euclidean"):
        raise ValueError("Invalid method. Choose 'Cosine' or 'Euclidean'.")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Tokenize and generate embeddings
    def get_embedding(code):
        inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean over the token dimension, then drop the batch dimension.
        return outputs.last_hidden_state.mean(dim=1).squeeze()

    embedding1 = get_embedding(code1)
    embedding2 = get_embedding(code2)

    # Calculate similarity based on the specified method
    if method == "Cosine":
        similarity = torch.nn.functional.cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0)).item()
    else:  # "Euclidean": map the distance into (0, 1]
        similarity = 1 / (1 + torch.norm(embedding1 - embedding2)).item()

    return similarity

# # Example code snippets
# code1 = "def sum(a, b): return a + b"
# code2 = "def add(x, y): return x + y"

# # Calculate cosine similarity
# cosine_sim_codebert = code_vectorized_similarity(code1, code2, method="Cosine", embedding_type="CodeBert")
# cosine_sim_graphcodebert = code_vectorized_similarity(code1, code2, method="Cosine", embedding_type="GraphCodeBert")
# euclidean_sim_codebert = code_vectorized_similarity(code1, code2, method="Euclidean", embedding_type="CodeBert")
# euclidean_sim_graphcodebert = code_vectorized_similarity(code1, code2, method="Euclidean", embedding_type="GraphCodeBert")
# print("Cosine Similarity (CodeBert):", cosine_sim_codebert)
# print("Cosine Similarity (GraphCodeBERT):", cosine_sim_graphcodebert) # .item()
# print("Euclidean Similarity (CodeBert):", euclidean_sim_codebert)
# print("Euclidean Similarity (GraphCodeBERT):", euclidean_sim_graphcodebert)

### Similarity (LLM)

In [9]:
import re

def generate_semantic_similarity_prompt(code_1, code_2):
    """Build the prompt asking for a 0-1 similarity score based on functionality and semantics only.

    The prompt starts and ends with a newline and asks for the answer in the
    form ``semantic similarity score =``.
    """
    instructions = (
        "I am comparing two pieces of Python code and need to determine how similar they are. "
        "Consider only the functionality and semantics. "
        "Provide a similarity score between 0 and 1, "
        "where 0 means completely different and 1 means exactly the same."
    )
    answer_format = (
        "Please analyze the similarity and provide a score without explanation, "
        "use the following format: semantic similarity score ="
    )
    sections = [
        "",
        instructions,
        "",
        "Code 1:",
        f"{code_1}",
        "",
        "Code 2:",
        f"{code_2}",
        "",
        answer_format,
        "",
    ]
    return "\n".join(sections)


def generate_general_similarity_prompt(code_1, code_2):
    """Build the prompt asking for a 0-1 similarity score based on functionality and style.

    The prompt starts and ends with a newline and asks for the answer in the
    form ``similarity score =``.
    """
    instructions = (
        "I am comparing two pieces of Python code and need to determine how similar they are. "
        "Consider both their functionality and style. "
        "Provide a similarity score between 0 and 1, "
        "where 0 means completely different and 1 means exactly the same."
    )
    answer_format = (
        "Please analyze the similarity and provide a score without explanation, "
        "use the following format: similarity score ="
    )
    sections = [
        "",
        instructions,
        "",
        "Code 1:",
        f"{code_1}",
        "",
        "Code 2:",
        f"{code_2}",
        "",
        answer_format,
        "",
    ]
    return "\n".join(sections)

def extract_similarity_score(response):
    """Parse the number that follows "similarity score =" in ``response``.

    The label is matched case-insensitively, so answers such as
    "Similarity Score = 0.8" are accepted as well as the lowercase form.

    Args:
    response: The model's reply text; may be None or empty

    Returns:
    The score as a float, or None when no score can be found
    """
    if not response:
        # A missing or empty reply has no score to extract.
        return None

    # Use regex to find the similarity score in the response
    match = re.search(r'similarity score\s*=\s*(\d+(\.\d+)?)', response, re.IGNORECASE)
    if match is None:
        return None
    return float(match.group(1))

def code_LLM_similarity(code1, code2, type='semantic'):
    """Ask a chat model to rate the similarity of two code snippets.

    Uses the semantic prompt when ``type`` is 'semantic' and the general
    (functionality and style) prompt for any other value. The request is sent
    through a module-level ``client`` object, which must exist when this
    function is called. The prompt, the reply and the parsed score are printed.

    Returns:
        The score parsed by ``extract_similarity_score`` (None if no score is found).
    """
    build_prompt = (
        generate_semantic_similarity_prompt
        if type == 'semantic'
        else generate_general_similarity_prompt
    )
    prompt = build_prompt(code1, code2)

    request = {
        "model": "gpt-4-turbo-2024-04-09",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "logprobs": True,
        "top_logprobs": 5,
        "max_tokens": 512,
    }
    completion = client.chat.completions.create(**request)

    response = completion.choices[0].message.content
    similarity_score = extract_similarity_score(response)

    for tag, value in (
        ("type", type),
        ("user", prompt),
        ("assistant", response),
        ("similarity_score", similarity_score),
    ):
        print(f"<{tag}>\n{value}")

    return similarity_score

# Example usage
from openai import OpenAI

# code_1 = '''def add(a, b):
#     return a + b'''
# code_2 = '''def sum_numbers(x, y):
#     return x + y'''

# print('===========================')
# code_LLM_similarity(code_1, code_2, type='semantic')
# print('===========================')
# code_LLM_similarity(code_1, code_2, type='general')


## Task July-22 (Jia): Generate the uncertainty scores for each line for the first 40 problems in the HumanEval Plus dataset, with gpt-4-turbo and gpt-4o-mini

In [10]:
def add_score_to_dict(scores):
    """Regroup per-method line scores into a dictionary keyed by line text.

    Args:
    scores: Mapping of method name to a list of (line, score) tuples

    Returns:
    A dict mapping each line to ``{method: score rounded to 3 decimals}``.
    Lines with identical text share one key, so a later score for the same
    method replaces an earlier one.
    """
    line_uncertainty_dict = {}
    for method, data in scores.items():
        for line, score in data:
            per_method = line_uncertainty_dict.setdefault(line, {})
            per_method[method] = round(score, 3)
    return line_uncertainty_dict

def add_score_to_list(scores):
    """Regroup per-method line scores into an ordered list of line entries.

    Lines are matched across methods by their text AND by how many times
    that text has already occurred within the method, so repeated lines
    (for example several blank lines) keep separate entries instead of
    collapsing into one entry whose scores get overwritten.

    Args:
    scores: Mapping of method name to a list of (line, score) tuples

    Returns:
    A list of ``(line, {method: score rounded to 3 decimals})`` tuples, in
    the order in which each line occurrence is first seen
    """
    line_uncertainty_list = []
    # (line text, occurrence number) -> entry; replaces a linear scan of the list.
    entry_by_occurrence = {}

    for method, data in scores.items():
        seen_in_method = {}  # line text -> occurrences seen so far for this method
        for (line, score) in data:
            occurrence = seen_in_method.get(line, 0)
            seen_in_method[line] = occurrence + 1

            key = (line, occurrence)
            entry = entry_by_occurrence.get(key)
            if entry is None:
                # First time this occurrence of the line shows up: create its entry.
                entry = (line, {})
                entry_by_occurrence[key] = entry
                line_uncertainty_list.append(entry)
            # Add or update the score for the current method
            entry[1][method] = round(score, 3)  # Round the score to 3 decimal places

    return line_uncertainty_list

In [11]:
def display_line_unc_scores(line_unc_scores):
    """Print a table of per-line uncertainty scores.

    Args:
    line_unc_scores: List of (line, {method: score}) tuples. The score
        columns are taken from the keys of the first entry, so every entry
        must provide those keys.

    Prints nothing when the list is empty.
    """
    if not line_unc_scores:
        return

    # Width of the line-content column: widest repr plus two spaces of padding.
    line_content_width = 2 + max(len(repr(line)) for line, _ in line_unc_scores)

    score_keys = line_unc_scores[0][1].keys()
    print('score_keys', score_keys)

    # Each score column is as wide as its header or its widest formatted value.
    score_widths = {}
    for key in score_keys:
        widest_value = max(len(f"{scores[key]:.3f}") for _, scores in line_unc_scores)
        score_widths[key] = max(len(key), widest_value)

    n_keys = len(score_keys)
    total_width = line_content_width + sum(score_widths.values()) + 3 * n_keys + 2 * (n_keys - 1)
    rule = "=" * total_width

    header_cells = [f"| {key:^{score_widths[key]}} " for key in score_keys]
    print(rule)
    print(f"{'Line Content':<{line_content_width}}" + ''.join(header_cells))
    print(rule)

    for line, scores in line_unc_scores:
        score_cells = [f"| {scores[key]:>{score_widths[key]}.3f} " for key in score_keys]
        print(f"{repr(line):<{line_content_width}}" + ''.join(score_cells))

    print(rule)

def save_to_txt(user_message, task_id, line_unc_scores, file_path):
    """Write the task header and the per-line uncertainty table to a text file.

    Args:
    user_message: The prompt text written below the task id
    task_id: Identifier written on the first line of the file
    line_unc_scores: List of (line, {method: score}) tuples. The score
        columns are taken from the keys of the first entry, so every entry
        must provide those keys.
    file_path: Destination path; missing parent directories are created

    Nothing is written when ``line_unc_scores`` is empty.
    """
    import os

    if not line_unc_scores:
        return  # If the list is empty, return without doing anything

    # Determine the width for the line content column
    line_content_width = max(len(repr(line)) for line, _ in line_unc_scores) + 2

    # Determine the width for each uncertainty score column
    score_keys = line_unc_scores[0][1].keys()
    score_widths = {key: max(len(key), max(len(f"{scores[key]:.3f}") for _, scores in line_unc_scores)) for key in score_keys}

    total_width = line_content_width + sum(score_widths.values()) + 3 * len(score_keys) + 2 * (len(score_keys) - 1)
    rule = "=" * total_width + "\n"

    # A path built from an id that contains '/' has an extra directory level;
    # create it rather than failing on open().
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding so non-ASCII prompt text is written the same way on every platform.
    with open(file_path, 'w', encoding='utf-8') as file:
        # Write user message and task id
        file.write(f"Task ID: {task_id}\n\n")
        file.write(f"User Message: {user_message}\n")

        # Write the header
        file.write(rule)
        header = f"{'Generate Content':<{line_content_width}}"
        header += ''.join(f"| {key:^{score_widths[key]}} " for key in score_keys)
        file.write(header + "\n")
        file.write(rule)

        # Write each line with its uncertainty scores
        for line, scores in line_unc_scores:
            row = f"{repr(line):<{line_content_width}}"
            row += ''.join(f"| {scores[key]:>{score_widths[key]}.3f} " for key in score_keys)
            file.write(row + "\n")

        # Write the footer
        file.write(rule)

In [None]:
def transfer_to_jsonDict(temp_data, final_prompt, response, unc_scores, line_unc_scores):
    """Assemble the JSON-ready record for one task.

    Args:
    temp_data: Dataset row; must provide 'task_id', 'prompt',
        'canonical_solution', 'entry_point' and 'test'
    final_prompt: The chat messages sent to the model
    response: The model's response text
    unc_scores: Whole-response uncertainty scores
    line_unc_scores: Per-line uncertainty scores

    Returns:
    A new dict with the copied dataset fields followed by 'final_prompt',
    'response', 'unc_scores' and 'line_unc_scores'
    """
    copied_fields = ('task_id', 'prompt', 'canonical_solution', 'entry_point', 'test')
    temp_dict = {field: temp_data[field] for field in copied_fields}
    temp_dict['final_prompt'] = final_prompt
    temp_dict['response'] = response
    temp_dict['unc_scores'] = unc_scores
    temp_dict['line_unc_scores'] = line_unc_scores

    return temp_dict

def save_to_json(data_list, save_path):
    """Write ``data_list`` to ``save_path`` as JSON.

    ``indent=0`` puts every item on its own line without leading spaces.
    """
    # Imported here because the visible file never imports json at module level.
    import json

    # Writing the list of dictionaries to a JSON file
    with open(save_path, 'w') as json_file:
        json.dump(data_list, json_file, indent=0)

In [32]:
from datasets import load_dataset
from openai import OpenAI

def main(model_id, client, subsample_size = 1):
    """Score the first ``subsample_size`` HumanEval+ problems and save the results.

    For every problem a completion is requested from ``model_id`` through
    ``client``, the per-line scores (PP, MSP, pTrue, MTE) and the
    whole-response scores are computed, a text table is written to
    ``<log dir>/<task_id>.txt``, and finally all records are written to
    ``<log dir>/<model_id>.json``.

    Args:
    model_id: Model name passed to the chat completion request and used in the log paths
    client: Object exposing ``chat.completions.create``
    subsample_size: Number of problems, taken from the start of the 'test' split
    """
    system_message = '''You are a Python code generator. Generate a complete and functioning Python function based on the provided code snippet.
    Ensure the function includes the original instructions in the comments, in-line comments for each line of code, and import statements for any required dependencies.
    Do not include main function. Enclose your code inside a ```python``` block.'''

    dataset = load_dataset("evalplus/humanevalplus")
    log_dir = f'/home/jxl220096/llm_uq/logs/{model_id}'

    data_list = []
    for i in range(subsample_size):
        # Fetch the row once and reuse it for the prompt, the id and the record.
        sample = dataset['test'][i]
        user_message = sample['prompt']
        task_id = sample['task_id']

        print('task_id', task_id)
        chat = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
          ]

        completion = client.chat.completions.create(
          model= model_id,
          messages= chat,
          temperature=0,
          logprobs=True,
          top_logprobs=5,
          max_tokens=512
        )

        # Per-line scores; cal_pTrue sends one verification request per line.
        uq_methods = {
            'PP': cal_PP(completion, print_all=False),
            'MSP': cal_MSP(completion, print_all=False),
            'pTrue': cal_pTrue(client, user_message, completion, task_type='sentence', print_all=False),
            'MTE': cal_MTE(completion, print_all=False),
        }

        # Whole-response scores.
        uq_global = {
            'PP_global': cal_PP_global(completion, print_all=False),
            'MSP_global': cal_MSP_global(completion, print_all=False),
            'pTrue_global': cal_pTrue_global(client, user_message, completion, task_type='sentence', print_all=False),
            'MTE_global': cal_MTE_global(completion, print_all=False)
        }

        scores = {}
        for uq, line_score in uq_methods.items():
            print(uq, 'line_score', line_score)
            scores[uq] = line_score
        line_uncertainty_list = add_score_to_list(scores)
        # display_line_unc_scores(line_uncertainty_list)

        # The task id is part of the file name.
        save_to_txt(user_message, task_id, line_uncertainty_list, f'{log_dir}/{task_id}.txt')

        temp_dict = transfer_to_jsonDict(temp_data = sample, final_prompt = chat, response = completion.choices[0].message.content, unc_scores = uq_global, line_unc_scores = line_uncertainty_list)
        data_list.append(temp_dict)

    output_json_path = f'{log_dir}/{model_id}.json'
    save_to_json(data_list, output_json_path)
