In [1]:
import pandas as pd
import numpy as np
import random
import os

In [2]:
def softmax(values):
    """Return the softmax of ``values``.

    The maximum of ``values`` is subtracted before exponentiating, which
    keeps ``np.exp`` from overflowing on large inputs and does not change
    the result. The returned weights sum to 1.
    """
    shifted = values - np.max(values)
    weights = np.exp(shifted)
    return weights / weights.sum()

def generate_probabilities(people, places):
    """Give every person and every place a random sampling probability.

    One ``random.gauss(0, 1)`` score is drawn per person (people first,
    then places). Each group of scores is turned into probabilities with
    ``softmax``.

    Returns:
        A ``(people_probs, place_probs)`` tuple of dicts mapping each name
        to its probability.
    """
    person_scores = {}
    for person in people:
        person_scores[person] = random.gauss(0, 1)

    place_scores = {}
    for place in places:
        place_scores[place] = random.gauss(0, 1)

    person_weights = softmax(np.array(list(person_scores.values())))
    place_weights = softmax(np.array(list(place_scores.values())))

    return dict(zip(people, person_weights)), dict(zip(places, place_weights))

In [3]:
# Function to generate the list of (person, place) facts
def generate_objects(people, places, max_sentences, randomized):
    """Build a shuffled list of unique ``(person, place)`` pairs.

    Args:
        people: Sequence of person names.
        places: Sequence of place names.
        max_sentences: Number of distinct pairs to draw when ``randomized``
            is true. Not used by the fixed design.
        randomized: If true, draw pairs with the per-name probabilities
            from ``generate_probabilities`` until ``max_sentences``
            distinct pairs exist. If false, map 16 randomly chosen people
            and 16 randomly chosen places onto the letters below and
            instantiate the 26 predefined letter pairs.

    Returns:
        A list of ``(person, place)`` tuples in random order.

    Raises:
        ValueError: If ``max_sentences`` exceeds the number of distinct
            pairs that can be drawn (the loop could never finish), or if
            the fixed design is requested with fewer than 16 people or
            places.
    """
    if randomized:
        people_probs, place_probs = generate_probabilities(people, places)

        # Without this guard the sampling loop below never terminates.
        reachable = len(people_probs) * len(place_probs)
        if max_sentences > reachable:
            raise ValueError(
                "max_sentences={} exceeds the {} distinct (person, place) "
                "pairs available".format(max_sentences, reachable)
            )

        # Loop-invariant lists, built once instead of on every draw.
        person_names = list(people_probs.keys())
        person_weights = list(people_probs.values())
        place_names = list(place_probs.keys())
        place_weights = list(place_probs.values())

        # A dict de-duplicates like a set but keeps insertion order, so the
        # result depends only on the RNG state and not on string hashing.
        drawn = {}
        while len(drawn) < max_sentences:
            person = random.choices(person_names, weights=person_weights)[0]
            place = random.choices(place_names, weights=place_weights)[0]
            drawn[(person, place)] = None
        objects = list(drawn)
    else:
        lowercase_letters = 'abcdefghijklmnor'
        uppercase_letters = 'ABCDEFGHIJKLMNOR'
        if len(people) < len(lowercase_letters) or len(places) < len(uppercase_letters):
            raise ValueError(
                "the fixed design needs at least {} people and {} places "
                "(got {} and {})".format(
                    len(lowercase_letters), len(uppercase_letters),
                    len(people), len(places),
                )
            )
        ran_people = random.sample(people, len(people))
        ran_places = random.sample(places, len(places))
        # zip() stops after the 16 letters, i.e. the first 16 shuffled names.
        people_dict = dict(zip(lowercase_letters, ran_people))
        places_dict = dict(zip(uppercase_letters, ran_places))
        # Kept as an ordered tuple (not a set) for a reproducible order.
        predefined_pairs = (
            ('a', 'A'), ('b', 'B'), ('c', 'C'), ('d', 'D'), ('e', 'E'), ('f', 'F'), ('g', 'G'), ('h', 'H'),
            ('i', 'I'), ('j', 'J'), ('k', 'K'), ('l', 'L'), ('e', 'K'), ('r', 'R'), ('g', 'J'), ('h', 'R'),
            ('i', 'L'), ('m', 'M'), ('n', 'N'), ('o', 'O'), ('d', 'M'), ('r', 'N'), ('f', 'O'), ('g', 'M'),
            ('h', 'N'), ('i', 'O'),
        )
        objects = list(dict.fromkeys(
            (people_dict[p], places_dict[l]) for p, l in predefined_pairs
        ))
    # Final shuffle so fact order carries no information about the design.
    return random.sample(objects, len(objects))



def generate_prompt_components(objects, inst_preamble, fact_format, fact_order, fact_sep, query_format):
    """Assemble the prompt text: preamble, then fact list, then query.

    Each object is rendered with ``fact_format``, taking its fields in the
    order given by ``fact_order``. The rendered facts are joined with
    ``fact_sep``, and one more ``fact_sep`` is appended after the last fact.
    """
    rendered_facts = []
    for obj in objects:
        ordered_fields = [obj[position] for position in fact_order]
        rendered_facts.append(fact_format.format(*ordered_fields))

    fact_list = fact_sep.join(rendered_facts) + fact_sep
    return f"{inst_preamble}{fact_list}{query_format}"

def print_objects_and_prompts(objects, prompt):
    """Print the generated objects and the assembled prompt, one per line."""
    labelled_values = (("Objects:", objects), ("Prompts:", prompt))
    for label, value in labelled_values:
        print(label, value)

In [4]:
# Function to calculate fan value
def fan(x, objects, index=None):
    """Count how many entries of ``objects`` mention ``x``.

    Args:
        x: The value to look for (e.g. a person or a place).
        objects: Iterable of facts, each a tuple such as ``(person, place)``.
        index: Optional position inside each fact to compare against.
            With the default ``None`` a fact counts when ``x`` appears
            anywhere in it (``x in fact``). With an integer, only
            ``fact[index] == x`` counts, so a name that occurs in both
            positions is not counted twice.

    Returns:
        The raw count as an ``int`` (not normalised by ``len(objects)``).
    """
    if index is None:
        return sum(1 for obj in objects if x in obj)
    return sum(1 for obj in objects if obj[index] == x)


def print_fan_values(objects, fact_format):
    """Print one line per fact with the fan of its person and its place.

    Each fact is rendered with ``fact_format`` (person first, place
    second) and printed next to the two counts returned by ``fan``.
    """
    report = 'The sentence "{}" has fan values: person: {}, place: {}'
    for person, plc in objects:
        person_fan = fan(person, objects)
        place_fan = fan(plc, objects)
        sentence = fact_format.format(person, plc)
        print(report.format(sentence, person_fan, place_fan))


In [5]:
# Function to generate final data and save to CSV
def generate_and_save_data(people, places, objects, prompt, path, randomized, target_stimuli, include_all_cases, query_order):
    """Write one query row per (stimulus, person, place) combination to a CSV.

    ``prompt`` has the layout {instruction}{fact_list}{query}; its query
    placeholders are filled here for each person/place pair.

    Args:
        people: Sequence of person names to query.
        places: Sequence of place names to query.
        objects: Collection of hashable ``(person, place)`` tuples that make
            up the fact list.
        prompt: Prompt template whose ``{}`` placeholders receive the
            person and place in the order given by ``query_order``.
        path: Destination CSV file.
        randomized: Accepted for interface compatibility; not used here.
        target_stimuli: Iterable of stimulus labels; every label yields its
            own pass over all person/place pairs.
        include_all_cases: If true, every pair is written for every
            stimulus. If false, only pairs present in ``objects`` are
            written, and only for the ``"present"`` stimulus.
        query_order: Indices into ``(person, place)`` selecting the order
            in which they fill the prompt placeholders.

    The CSV columns are Preamble (the filled prompt), Person, Fan_Person,
    Place, Fan_Place, Dependent_Variable (the stimulus) and True_Category
    ("present" if the pair is in ``objects``, else "absent").
    """
    # Loop-invariant lookups, computed once instead of once per row.
    listed_pairs = set(objects)
    person_fans = {person: fan(person, objects) for person in people}
    place_fans = {place: fan(place, objects) for place in places}

    data = []
    for stim in target_stimuli:
        for person in people:
            for place in places:
                is_listed = (person, place) in listed_pairs
                if not (include_all_cases or (stim == "present" and is_listed)):
                    continue
                true_category = "present" if is_listed else "absent"
                # Fill the query placeholders in the requested order.
                query_prompt = prompt.format(*[(person, place)[i] for i in query_order])
                data.append([
                    query_prompt,
                    person, person_fans[person],
                    place, place_fans[place],
                    stim, true_category,
                ])

    new_prompts_df = pd.DataFrame(
        data,
        columns=['Preamble', 'Person', 'Fan_Person', 'Place', 'Fan_Place', 'Dependent_Variable', 'True_Category'],
    )
    new_prompts_df.to_csv(path, index=False)

In [6]:
# Generate one prompt set and write it to disk
def run_experiment(people, places,
                   output_dir,
                   max_sentences=50,
                   randomized=True,
                   inst_preamble=None,
                   fact_format=None,
                   fact_order=None,
                   fact_sep=None,
                   query_format=None,
                   query_order=None,
                   target_stimuli=None,
                   include_all_cases = True):
    """Generate one set of facts and queries and save it as ``prompts.csv``.

    The file is written to ``<output_dir>/And_Aug`` when ``randomized`` is
    true and to ``<output_dir>/And_Orig`` otherwise; the directory is
    created if it does not exist.

    Args:
        people, places: Name sequences passed to ``generate_objects``.
        output_dir: Base directory for this run.
        max_sentences: Number of facts to draw in the randomized design.
        randomized: Selects the randomized or the fixed design.
        inst_preamble: Instruction text placed before the fact list.
        fact_format: Format string for a single fact.
        fact_order: Indices giving the order of (person, place) in a fact.
        fact_sep: Separator placed between (and after) facts.
        query_format: Query template appended after the fact list.
        query_order: Indices giving the order of (person, place) in a query.
        target_stimuli: Stimulus labels to generate rows for.
        include_all_cases: Forwarded to ``generate_and_save_data``.

    Raises:
        ValueError: If any prompt component is left at its ``None``
            default. They are keyword arguments for readability, but all
            are required: a ``None`` would either be rendered as the
            literal text "None" inside the prompt or fail later with an
            unrelated error.
    """
    required = {
        'inst_preamble': inst_preamble,
        'fact_format': fact_format,
        'fact_order': fact_order,
        'fact_sep': fact_sep,
        'query_format': query_format,
        'query_order': query_order,
        'target_stimuli': target_stimuli,
    }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError("run_experiment is missing required arguments: " + ", ".join(missing))

    objects = generate_objects(people, places, max_sentences, randomized)
    prompt = generate_prompt_components(objects, inst_preamble, fact_format, fact_order, fact_sep, query_format)

    new_prompts_path = os.path.join(output_dir, 'And_Aug' if randomized else 'And_Orig')
    os.makedirs(new_prompts_path, exist_ok=True)
    generate_and_save_data(people, places, objects, prompt, os.path.join(new_prompts_path, 'prompts.csv'),
                           randomized, target_stimuli, include_all_cases, query_order)

In [9]:
# Parameters
# Output directories are created by run_experiment (os.makedirs), so nothing
# has to exist up front except the two input CSV files.

people = list(pd.read_csv('./People.csv')['People'])
places = list(pd.read_csv('./Places.csv')['Place'])

inst = 'Following is a list that contains a number of people and the places in which they are located. After the list, a person will be judged as either present or absent in a specified place. When asked about person A in place B, if the list says that person A is in place B, answer with present. If the list does not say that person A is in place B, answer with absent. The list of people and places is: '
# NOTE(review): this few-shot text is appended after the generated fact list
# and itself states "The Mechanic is in the Mall." If Mechanic/Mall also occur
# in People.csv/Places.csv, True_Category for that pair is still derived only
# from the generated facts -- confirm this is intended.
few_shot = 'The Mechanic is in the Mall. According to the list, in the Mall, the Mechanic is present. According to the list, in the Airport, the Pilot is absent. '
query = few_shot + 'According to the list, in the {}, the {} is '

out_dir = './Random/run_{}'

target_stimuli = ['absent', 'present']

N_RUNS = 10
BASE_SEED = 0  # run i is seeded with BASE_SEED + i

# Identical for every run, so built once outside the loop.
user_defined_components = {
    'inst_preamble': inst,
    'fact_format': 'The {} is in the {}',
    'fact_order': [0, 1],
    'fact_sep': '. ',
    'query_format': query,
    'query_order': [1, 0],  # queries name the place first, then the person
    'target_stimuli': target_stimuli,
}

for i in range(N_RUNS):
    # Seed per run so a single run can be regenerated on its own.
    random.seed(BASE_SEED + i)

    run_experiment(people, places, out_dir.format(i), max_sentences=50, randomized=True,  **user_defined_components, include_all_cases = True)
    run_experiment(people, places, out_dir.format(i),                   randomized=False, **user_defined_components, include_all_cases = True)

Oseremhen came up with the following ideas/additions to consider:

Additional object type combos:
    Positional
        Bird vs Place
        Furniture vs Place
        Animal vs Place (Habitat?)
        Person vs Place
            Name
            Profession
    
    Usage
        Flower 
        Toy vs Person
        Food vs Person
            Fruit vs Person
        Sport vs Equipment

    Grade vs Course

    Minimal group paradigm (nonsense paired with nonsense)