In [305]:
import pandas as pd 
import numpy as np 
import openai
import os

# our code and script
from utils.helpers import save_pickle, load_pickle

# Read the key from the environment so that no secret is ever stored in the notebook.
# Falls back to the original empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Processing

In [2]:
# Directory (relative to the notebook) and file name of the volunteer vote catalogue;
# the two strings are concatenated as-is when the CSV is read, so `path` keeps its trailing slash.
path = '../data/'
fname = 'gz_decals_volunteers_5.csv'

In [3]:
# Load the full volunteer catalogue into memory.
csvfile = pd.read_csv(f'{path}{fname}')

In [359]:
# Maps the vote-fraction column names of the catalogue to the label names used in LABELS.
# The processing cell renames the columns with this dictionary before reordering them.
MAPPING = {'smooth-or-featured_smooth_fraction': 'Smooth',
           'smooth-or-featured_featured-or-disk_fraction': 'Featured or Disk',
           'smooth-or-featured_artifact_fraction': 'Artifact',
           'disk-edge-on_yes_fraction': 'Edge On Disk (Yes)',
           'disk-edge-on_no_fraction': 'Edge On Disk (No)',
           'has-spiral-arms_yes_fraction': 'Spiral Arms (Yes)',
           'has-spiral-arms_no_fraction': 'Spiral Arms (No)',
           'bar_strong_fraction': 'Strong Bar',
           'bar_weak_fraction': 'Weak Bar',
           'bar_no_fraction': 'No Bar',
           'bulge-size_dominant_fraction': 'Central Bulge (Dominant)',
           'bulge-size_large_fraction': 'Central Bulge (Large)',
           'bulge-size_moderate_fraction': 'Central Bulge (Moderate)',
           'bulge-size_small_fraction': 'Central Bulge (Small)',
           'bulge-size_none_fraction': 'Central Bulge (None)',
           'how-rounded_round_fraction': 'Round',
           'how-rounded_in-between_fraction': 'In Between',
           'how-rounded_cigar-shaped_fraction': 'Cigar Shaped',
           'edge-on-bulge_boxy_fraction': 'Bulge (Boxy)',
           'edge-on-bulge_none_fraction': 'Bulge (None)',
           'edge-on-bulge_rounded_fraction': 'Bulge (Rounded)',
           'spiral-winding_tight_fraction': 'Spiral Winding (Tight)',
           'spiral-winding_medium_fraction': 'Spiral Winding (Medium)',
           'spiral-winding_loose_fraction': 'Spiral Winding (Loose)',
           'spiral-arm-count_1_fraction': 'Spiral Arms (1)',
           'spiral-arm-count_2_fraction': 'Spiral Arms (2)',
           'spiral-arm-count_3_fraction': 'Spiral Arms (3)',
           'spiral-arm-count_4_fraction': 'Spiral Arms (4)',
           'spiral-arm-count_more-than-4_fraction': 'Spiral Arms (More Than 4)',
           'spiral-arm-count_cant-tell_fraction': 'Spiral Arms (cannot tell)',
           'merging_none_fraction': 'Merging (None)',
           'merging_minor-disturbance_fraction': 'Merging (Minor Disturbance)',
           'merging_major-disturbance_fraction': 'Merging (Major Disturbance)',
           'merging_merger_fraction': 'Merging (Merger)'}

# Label names grouped by task. The order inside each list is significant:
# find_labels tests entries by position (e.g. tasks['task_1'][0] is 'Smooth'),
# and TASKS_ORDERED concatenates these lists in task order to order the label columns.
LABELS = {

    'task_1': ['Smooth', 'Featured or Disk', 'Artifact'],
    'task_2': ['Round', 'In Between', 'Cigar Shaped'],
    'task_3': ['Edge On Disk (Yes)', 'Edge On Disk (No)'],
    'task_4': ['Merging (Merger)', 'Merging (Major Disturbance)', 'Merging (Minor Disturbance)', 'Merging (None)'],
    'task_5': ['Bulge (Rounded)', 'Bulge (Boxy)', 'Bulge (None)'],
    'task_6': ['No Bar', 'Weak Bar', 'Strong Bar'],
    'task_7': ['Spiral Arms (Yes)', 'Spiral Arms (No)'],
    'task_8': ['Spiral Winding (Tight)', 'Spiral Winding (Medium)', 'Spiral Winding (Loose)'],
    'task_9': ['Spiral Arms (1)', 'Spiral Arms (2)', 'Spiral Arms (3)', 'Spiral Arms (4)',
               'Spiral Arms (More Than 4)', 'Spiral Arms (cannot tell)'],
    'task_10': ['Central Bulge (None)', 'Central Bulge (Small)', 'Central Bulge (Moderate)',
                'Central Bulge (Large)', 'Central Bulge (Dominant)']

}

# For each task, maps a label name (as listed in LABELS) to the phrase that
# generate_keywords_sentence inserts into the comma-separated keyword string.
TASK_MAPPING = {

    'task_1': {'Smooth': 'smooth', 'Featured or Disk': 'has features or disk', 'Artifact': 'artifact'},
    'task_2': {'Round': 'round', 'In Between': 'elliptical', 'Cigar Shaped': 'cigar-shaped'},
    'task_3': {'Edge On Disk (Yes)': 'has an edge-on disk', 'Edge On Disk (No)': 'does not have an edge-on disk'},
    'task_4': {'Merging (Merger)': 'merging', 'Merging (Major Disturbance)': 'merging with major disturbance', 'Merging (Minor Disturbance)': 'merging with minor disturbance', 'Merging (None)': 'not merging'},
    'task_5': {'Bulge (Rounded)': 'rounded central bulge', 'Bulge (Boxy)': 'boxy central bulge', 'Bulge (None)': 'no central bulge'},
    'task_6': {'No Bar': 'no bar', 'Weak Bar': 'weak bar', 'Strong Bar': 'strong bar'},
    'task_7': {'Spiral Arms (Yes)': 'has spiral arms', 'Spiral Arms (No)': 'does not have spiral arms'},
    'task_8': {'Spiral Winding (Tight)': 'tight spiral winding', 'Spiral Winding (Medium)': 'medium spiral winding', 'Spiral Winding (Loose)': 'loose spiral winding'},
    # NOTE(review): 'Spiral Arms (cannot tell)' is rendered as 'no spiral arms'. find_labels only
    # records task_9 when 'Spiral Arms (Yes)' is set, so a prompt can contain both
    # 'has spiral arms' and 'no spiral arms' - confirm this wording is intended.
    'task_9': {'Spiral Arms (1)': 'one spiral arm', 'Spiral Arms (2)': 'two spiral arms', 'Spiral Arms (3)': 'three spiral arms', 'Spiral Arms (4)': 'four spiral arms',
               'Spiral Arms (More Than 4)': 'more than four spiral arms', 'Spiral Arms (cannot tell)': 'no spiral arms'},
    'task_10': {'Central Bulge (None)': 'no central bulge', 'Central Bulge (Small)': 'small central bulge', 'Central Bulge (Moderate)': 'moderate central bulge',
                'Central Bulge (Large)': 'large central bulge', 'Central Bulge (Dominant)': 'dominant central bulge'}

}

# Derive the task count and the per-task label counts from LABELS so that the three
# structures cannot drift apart when a task or a label is added or removed.
NUM_TASKS = len(LABELS)

LABELS_PER_TASK = {task: len(names) for task, names in LABELS.items()}

# Flat array of every label name in task order (task_1 first); used to order the label columns.
TASKS_ORDERED = np.concatenate([LABELS[f'task_{i + 1}'] for i in range(NUM_TASKS)])

# Catalogue columns that are joined to the generated text when the training table is built
# (presumably the image name and the PNG location - confirm against the catalogue schema).
IMGCOLS = ['iauname', 'png_loc']

In [5]:
ncols = csvfile.shape[1]

# Keep only the columns whose name contains 'fraction' (the vote fractions).
fraction_columns = [name for name in csvfile.columns if 'fraction' in name]
vote_fraction = csvfile[fraction_columns]

# Work on a renamed copy (decision-tree label names), then order the
# columns by task number.
labels = vote_fraction.rename(columns=MAPPING)[TASKS_ORDERED]

### Convert the Labels

In [39]:
# Binarise the vote fractions: 1.0 where the fraction is at least 0.5, otherwise 0.0.
# Missing fractions compare as False and therefore become 0.0 as well.
labels = (labels >= 0.5).astype(float)

# Functions

In [219]:
def question_answer_openai(question):
    """Send a prompt to the OpenAI completion endpoint and return the generated text.

    Args:
        question (str): The prompt, forwarded unchanged as ``prompt``.

    Returns:
        str: The text of the first returned choice with leading and trailing
        whitespace removed.
    """
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=question,
        temperature=0.9,
        max_tokens=150,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0.6,
        stop=[" Human:", " AI:"],
    )
    # The previous implementation dropped the first two characters unconditionally
    # (presumably a leading blank-line prefix), which truncated any reply that did
    # not start with that prefix. Stripping whitespace handles both cases.
    return response['choices'][0]['text'].strip()

In [256]:
def find_labels(tasks: dict) -> pd.DataFrame:
    """Walk the decision tree and record the selected label names for each visited task.

    The tasks is a dictionary in the following format:
    tasks = {
        'task_1' : [1, 0, 0]
        'task_2' : [0, 1, 0]
        .
        .
        .
        'task_10': [0, 0, 1, 0, 0]
    }
    Each vector is aligned position-by-position with LABELS[task].

    Branches followed:
        - task_1 always recorded.
        - 'Smooth' set (task_1[0] == 1): task_2 and task_4.
        - else 'Featured or Disk' set (task_1[1] == 1): task_3, then
            - 'Edge On Disk (Yes)' set (task_3[0] == 1): task_5 and task_4;
            - otherwise: task_6 and task_7, then task_8, task_9, task_10 and task_4
              when 'Spiral Arms (Yes)' is set (task_7[0] == 1), else task_10 and task_4.
        - otherwise nothing beyond task_1 is recorded.

    Args:
        tasks (dict): A dictionary with one 0/1 vector per task.
    Returns:
        pd.DataFrame: A frame with columns 'task_1' ... 'task_<NUM_TASKS>' whose row 0
        holds, for every visited task, the list of label names whose entry equals 1.
        Cells of tasks that were not visited are never assigned.
    """

    labels = {k: np.asarray(v) for k, v in LABELS.items()}
    tasks = {k: np.asarray(v) for k, v in tasks.items()}

    record_labels = pd.DataFrame(columns=['task_' + str(i + 1) for i in range(NUM_TASKS)])

    # Sometimes, there can be more than 1 label (due to equal probability by volunteers' votes).
    # All selected labels are stored as a list; no single label is picked here.
    record_labels.at[0, 'task_1'] = list(labels['task_1'][tasks['task_1'] == 1])

    if tasks['task_1'][0] == 1:

        record_labels.at[0, 'task_2'] = list(labels['task_2'][tasks['task_2'] == 1])
        record_labels.at[0, 'task_4'] = list(labels['task_4'][tasks['task_4'] == 1])

    elif tasks['task_1'][1] == 1:
        record_labels.at[0, 'task_3'] = list(labels['task_3'][tasks['task_3'] == 1])

        if tasks['task_3'][0] == 1:
            record_labels.at[0, 'task_5'] = list(labels['task_5'][tasks['task_5'] == 1])
            record_labels.at[0, 'task_4'] = list(labels['task_4'][tasks['task_4'] == 1])

        else:
            record_labels.at[0, 'task_6'] = list(labels['task_6'][tasks['task_6'] == 1])
            record_labels.at[0, 'task_7'] = list(labels['task_7'][tasks['task_7'] == 1])

            if tasks['task_7'][0] == 1:
                record_labels.at[0, 'task_8'] = list(labels['task_8'][tasks['task_8'] == 1])
                record_labels.at[0, 'task_9'] = list(labels['task_9'][tasks['task_9'] == 1])
                record_labels.at[0, 'task_10'] = list(labels['task_10'][tasks['task_10'] == 1])
                record_labels.at[0, 'task_4'] = list(labels['task_4'][tasks['task_4'] == 1])

            else:
                record_labels.at[0, 'task_10'] = list(labels['task_10'][tasks['task_10'] == 1])
                record_labels.at[0, 'task_4'] = list(labels['task_4'][tasks['task_4'] == 1])

    return record_labels

In [257]:
def generate_keywords_sentence(dataframe, index):
    """Build the keyword string and the completion prompt for one row of labels.

    Args:
        dataframe (pd.DataFrame): Binarised labels with one column per label name.
        index (int): Positional row index of the galaxy to describe.

    Returns:
        tuple: ``(keywords, question)`` where ``keywords`` is the comma-separated list
        of phrases from TASK_MAPPING for the labels selected by ``find_labels`` and
        ``question`` is the same list prefixed with 'short sentence: galaxy, image, '.
    """
    row = dataframe.iloc[index]
    task_names = [f'task_{number}' for number in range(1, NUM_TASKS + 1)]

    # One integer 0/1 vector per task, in the order given by LABELS.
    votes = {name: row[LABELS[name]].values.astype(int) for name in task_names}

    # Walk the decision tree to find which tasks (and labels) apply.
    tree = find_labels(votes)

    # Cells of tasks that were not visited are not lists and are skipped.
    phrases = []
    for name in task_names:
        for cell in tree[name]:
            if type(cell) is list:
                phrases.extend(TASK_MAPPING[name][label] for label in cell)

    keywords = ', '.join(phrases)
    question = 'short sentence: galaxy, image, ' + keywords
    return keywords, question

# Example

In [242]:
# Build the keyword prompt for a single row of the label table and request a sentence for it.
example_index = 121
keywords, question = generate_keywords_sentence(labels, example_index)
answer = question_answer_openai(question)

In [243]:
keywords

'smooth, round, merging with minor disturbance'

In [244]:
question

'short sentence: galaxy, image, smooth, round, merging with minor disturbance'

In [246]:
answer

'Two merging galaxies, one smooth and round and the other with minor disturbance, create a mesmerizing image.'

# Training Set

In [273]:
def generate_training_set(dataframe, nexamples, start_index = 0, save=True, path='../prompts/openai'):
    """Generate (keywords, question, answer) records for consecutive rows of the label table.

    Args:
        dataframe (pd.DataFrame): Label table passed on to ``generate_keywords_sentence``.
        nexamples (int): Number of consecutive rows to process.
        start_index (int): Positional index of the first row. Defaults to 0.
        save (bool): When True, each record is pickled as ``object_<row>`` in ``path``
            right after it is produced, so earlier results survive a later failure.
        path (str): Output directory for the pickles. Defaults to the previously
            hard-coded '../prompts/openai'.

    Returns:
        list: One dictionary per row with the keys 'keywords', 'question' and 'answer'.
    """
    # Create the output directory once instead of on every iteration.
    if save and nexamples > 0:
        os.makedirs(path, exist_ok=True)

    record_answer = []
    for row in range(start_index, start_index + nexamples):
        keywords, question = generate_keywords_sentence(dataframe, row)
        entry = {
            'keywords': keywords,
            'question': question,
            'answer': question_answer_openai(question),
        }
        if save:
            save_pickle(entry, path, f'object_{row}')
        record_answer.append(entry)
    return record_answer

In [373]:
%%time
# Query the API for 1000 rows starting at positional row 2500; with save=True each
# record is pickled as object_<row> (object_2500 ... object_3499) in ../prompts/openai.
# NOTE(review): the table cell further down loads object_0 ... object_2499, so the
# objects written by this particular run are not part of that table - confirm intended.
answers = generate_training_set(labels, 1000, start_index = 2500, save=True)

CPU times: user 12.2 s, sys: 191 ms, total: 12.4 s
Wall time: 9min 49s


# Table for Finetuning NLP Model

In [374]:
def generate_input_data(nobjects = 2000, path = 'prompts/openai', save = True):
    """Assemble the fine-tuning table from the pickled prompt/answer records.

    Loads ``object_0`` ... ``object_<nobjects - 1>`` from ``path`` and builds one row
    per object.

    Args:
        nobjects (int): Number of pickled objects to load, starting at index 0.
        path (str): Directory containing the pickles.
        save (bool): When True, the table is also written to '../data/training.csv'.

    Returns:
        pd.DataFrame: Columns 'input_text' (keywords with ',' replaced by ' |') and
        'target' (last non-empty line of the answer with double quotes removed, or
        '' when the answer has no non-empty line).
    """
    input_texts = []
    targets = []
    for i in range(nobjects):
        obj = load_pickle(path, f'object_{i}')
        input_texts.append(obj['keywords'].replace(',', ' |'))

        # Ignore blank lines so that an empty answer no longer raises IndexError and
        # an answer ending in blank lines no longer yields an empty target.
        answer_lines = [line for line in obj['answer'].replace('"', '').splitlines() if line.strip()]
        targets.append(answer_lines[-1] if answer_lines else '')

    record = pd.DataFrame({'input_text': input_texts, 'target': targets})
    if save:
        os.makedirs('../data', exist_ok=True)
        record.to_csv('../data/training.csv')
    return record

In [375]:
nobjects = 2500
data = generate_input_data(nobjects, path='../prompts/openai', save=True)

# Put the image columns of the first `nobjects` catalogue rows next to the generated text.
image_columns = csvfile[IMGCOLS].iloc[:nobjects]
data_path = pd.concat([image_columns, data], axis=1)

# Drop the rows whose keyword string is empty, renumber the rest and write them out.
has_keywords = data_path['input_text'].ne('')
data_path_confident = data_path[has_keywords].reset_index(drop=True)
data_path_confident.to_csv('../data/training_paths.csv')

In [379]:
data_path_confident.shape[0]

2393