In [None]:
import numpy as np
from scipy.stats import norm
from scipy.optimize import minimize
import tqdm
import pandas as pd

In [None]:
# Every prompt is a question about painting quality followed by the same
# image-placeholder suffix; the suffix is written once and appended here.
_IMAGE_SUFFIX = '\n<|image|>'
ask_prompts = [
    question + _IMAGE_SUFFIX
    for question in (
        'Rate the quality of this painting.',
        'How would you rate the quality of this painting?',
        'How would you judge the quality of this painting?',
        'How do you assess the quality of this painting?',
        'Could you evaluate the quality of this painting?',
        'What is your quality rating for this painting?',
        'Can you rate the quality of this painting?',
        'What do you think about the quality of this painting?',
        'Can you judge the quality of this painting?',
        "What's your opinion on the quality of this painting?",
    )
]

Load the Score data

In [None]:
# Score table read from train.csv; the record-building loop further down reads
# its 'filename' and 'Score' columns.
train_csv = '../dataset/APDD/train.csv'
score_data = pd.read_csv(train_csv)

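Define $l_i$: the score value associated with each rating level $i$, listed from the highest level (excellent) down to the lowest (bad)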

In [None]:
# Score value attached to each rating level. get_level reads this list from the
# end: pdf_value[-1] is the upper bound for 'bad', then 'poor', 'fair', 'good'.
pdf_value = [
    10.0,
    7.5,
    5,
    3.5,
    1.0,
]

Define $i$: the rating level (bad, poor, fair, good, or excellent) that a score falls into

In [None]:
def get_level(score,pdf_value):
    """Return the rating-level name for a numeric score.

    Args:
        score: Numeric score to classify.
        pdf_value: Sequence of level values; it is indexed from the end, so
            pdf_value[-1] is the upper bound for 'bad', pdf_value[-2] for
            'poor', pdf_value[-3] for 'fair' and pdf_value[-4] for 'good'.

    Returns:
        The first label whose upper bound is >= score, or 'excellent' when the
        score exceeds all four bounds.
    """
    # Labels from lowest to highest; the k-th label is bounded by pdf_value[-k].
    ascending_labels = ('bad', 'poor', 'fair', 'good')
    for position, label in enumerate(ascending_labels, start=1):
        if score <= pdf_value[-position]:
            return label
    return 'excellent'

In [None]:
def _level_probabilities(mu, sigma, levels):
    """Gaussian density at each level value, normalised to sum to 1.

    Works in log space: the largest log-density is subtracted before
    exponentiating, so at least one weight equals 1 and the normalising sum can
    never be zero. Normalising raw ``norm.pdf`` values instead gives 0/0 = NaN
    whenever every density underflows (small sigma, or levels far from mu).
    """
    log_density = norm.logpdf(np.asarray(levels, dtype=float), loc=mu, scale=sigma)
    weights = np.exp(log_density - log_density.max())
    return weights / weights.sum()


def get_mu_sigma(P_target, levels=None):
    """Fit a Gaussian whose discretised expectation over the levels hits a target.

    The Gaussian is evaluated at each level value and normalised into a discrete
    distribution; (mu, sigma) are tuned so that the expected level value under
    that distribution is as close as possible to ``P_target``.

    Args:
        P_target: Target expected value (the score to reproduce).
        levels: Level values to evaluate the Gaussian at. Defaults to the
            module-level ``pdf_value``.

    Returns:
        Tuple ``(mu, sigma)`` as reported by the optimiser, with mu inside
        [-10, 10] and sigma inside [0.001, 1000].

    Note:
        Two parameters are fitted to a single target, so the solution is not
        unique; the result depends on the starting point ``(P_target, 1)``.
    """
    if levels is None:
        levels = pdf_value
    level_values = np.asarray(levels, dtype=float)

    def objective(params):
        # Absolute gap between the discretised expectation and the target.
        mu, sigma = params
        expected = np.inner(_level_probabilities(mu, sigma, level_values), level_values)
        return abs(expected - P_target)

    # Start from a unit-width Gaussian centred on the target.
    initial_guess = [P_target, 1]

    # No 'disp' option: this runs once per dataset row, so per-call solver
    # output would only flood the notebook.
    result = minimize(objective, initial_guess, bounds=[(-10, 10), (0.001, 1000)])

    mu_optimal, sigma_optimal = result.x
    return mu_optimal, sigma_optimal


def get_score(score, levels=None):
    '''
    Get the probability assigned to each rating level for a score:
        1. Fit the mu and sigma of a Gaussian whose discretised expectation
           over the level values matches the score (see get_mu_sigma).
        2. Evaluate that Gaussian at each level value.
        3. Normalise the values into a probability distribution.
        4. Return the normalised distribution as a numpy array.

    The densities are not clipped before normalising: clipping values above 1
    would make the returned distribution differ from the one the optimiser
    fitted.

    Args:
        score: Score to express as a distribution over levels.
        levels: Level values; defaults to the module-level ``pdf_value``.
    '''
    if levels is None:
        levels = pdf_value
    mu_optimal, sigma_optimal = get_mu_sigma(score, levels)
    return _level_probabilities(mu_optimal, sigma_optimal, levels)

In [None]:
# Build one training record per row of score_data: the image, its ground-truth
# score, the per-level probabilities, and a two-turn conversation whose answer
# names the rating level.
dict_list = []
# iterrows() has no length, so pass total= to get a real progress bar.
for _, row in tqdm.tqdm(score_data.iterrows(), total=len(score_data)):
    filename = row['filename']
    score = row['Score']

    level_probs = get_score(score)
    # get_level takes the level values as a required second argument.
    level = get_level(score, pdf_value)

    # Pick one of the question phrasings at random for this sample.
    prompt = ask_prompts[np.random.randint(0, len(ask_prompts))]
    conversations = [
        {"from": "human", "value": prompt},
        {"from": "gpt", "value": f"The quality of the painting is {level}."},
    ]

    # Built inline rather than bound to `dict`/`id`, which would shadow the
    # builtins for every later cell of the notebook.
    dict_list.append({
        "id": filename + '->' + str(score),
        "image": filename,
        "gt_score": score,
        "level_probs": level_probs,
        "conversations": conversations,
    })
    

  level_prob = level_prob/level_prob.sum()
8019it [01:40, 79.70it/s] 


Save

In [None]:
# NOTE(review): the export below is commented out, so running this notebook
# leaves dict_list in memory only. The output path has no '../' prefix, while
# the input was read from '../dataset/APDD/train.csv' -- confirm the intended
# location before re-enabling.
# import json

# for d in dict_list:
#     d['level_probs'] = list(d['level_probs'])

# with open('dataset/APDD/score.json', 'w') as f:
#     f.write(json.dumps(dict_list, indent=4))