In [20]:
import json

import pandas as pd

# Frame description table; later cells index it by its 'name' column and read 'description'.
frame_descriptions = pd.read_csv('frame_descriptions_json.csv')

# System message attached unchanged to every generated chat sample.
system_prompt = """### Task:
You are given a sentence and a frame with its associated frame elements and sometimes examples. Your task is to label the frame elements in the sentence using JSON. Keys should only be one of the defined frame elements. Do not make up your own frame elements, and do not remove or change the input in any way. Identify the frame elements based on the highlighted target word. 

### Notes:
- Return the tagged sentence in a ```json ``` code block.
- Texts must not overlap.
"""

# User message template; filled with str.format(frame_info=..., input_sentence=...).
user_prompt = """### Frame Information:
{frame_info}

### Input:
{input_sentence}
"""

# Assistant message template; `output_sentence` is substituted inside the json code fence.
assistant_prompt = """### Output: 
```json
{output_sentence}
```
"""


In [23]:
# Preview the second column of the first row of the frame description table.
print(frame_descriptions.iloc[0, 1])

Frame Name: Abandonment
Frame Definition: An Agent leaves behind a Theme effectively rendering it no longer within their control or of the normal security as one's property.

Frame Elements:
Agent (Core): The Agent is the person who acts to leave behind the Theme.
Theme (Core): The Theme is the entity that is relinguished to no one from the Agent's possession.
Place (Peripheral): The location where the Agent gives up the Theme.
Time (Peripheral): When the Agent gives up the Theme.
Manner (Peripheral): The style in which the Agent gives up the Theme.
Duration (Peripheral): For what expanse of time the Agent has given up the Theme.
Explanation (Extra-Thematic): Explanation denotes a proposition from which the act of abandonment logically follows.
Depictive (Extra-Thematic): The FE Depictive describes the Agent during the abandoning event.
Degree (Peripheral): The extent to which the Agent leaves the Theme behind.
Means (Peripheral): An action performed by the Agent that accomplishes the 

## Train Data

In [None]:
import pickle

# Load the pickled training split; later cells group it by 'frame' and read its 'fe' column.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load trusted data.
with open('../../data/raw/os_train.pkl', 'rb') as f:
    train_data = pickle.load(f)

# Per-frame example count that the sampling cell pads up to when the diverse pick is smaller.
num_examples = 5

In [85]:

def greedy_minimal_diverse_set(df, max_examples=5):
    """Greedily pick rows that together cover as many frame-element names as possible.

    On every round the row contributing the most not-yet-covered frame-element
    names is selected (the earliest row wins ties). Selection stops when
    ``max_examples`` rows were picked or no remaining row adds a new name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose an ``fe`` column whose cells are sequences of frame-element
        records; the first item of each record (``fe[0]``) is used as its name.
    max_examples : int, default 5
        Upper bound on the number of rows returned.

    Returns
    -------
    list[int]
        Positional offsets into ``df`` (suitable for ``df.iloc``), in selection order.
    """
    # Remember each candidate's position in ``df``. Rows without frame elements are
    # skipped, and enumerating the filtered list instead would shift every later
    # offset so that ``df.iloc`` picks the wrong rows.
    candidates = [
        (position, {fe[0] for fe in fes})
        for position, fes in enumerate(df.fe.values)
        if len(fes) > 0
    ]

    covered_fes = set()
    selected_positions = []

    while len(selected_positions) < max_examples and candidates:
        best_position, best_names, best_gain = None, None, 0
        still_useful = []

        for position, names in candidates:
            gain = len(names - covered_fes)
            if gain == 0:
                # ``covered_fes`` only grows, so this row can never contribute again.
                continue
            still_useful.append((position, names))
            # Strict comparison keeps the earliest row among equally diverse ones.
            if gain > best_gain:
                best_position, best_names, best_gain = position, names, gain

        if best_position is None:
            break

        selected_positions.append(best_position)
        covered_fes |= best_names
        candidates = [entry for entry in still_useful if entry[0] != best_position]

    return selected_positions
            
# Build the reduced training set: for every frame keep the frame-element-diverse rows and,
# when fewer than ``num_examples`` were picked, pad with further rows of the same frame.
new_samples = []
for frame, df in train_data.groupby(['frame']):
    diverse_index = greedy_minimal_diverse_set(df, max_examples=10)
    diverse_samples = df.iloc[diverse_index]

    # ``diverse_index`` is consumed positionally by ``iloc`` above, so the leftover rows
    # must be excluded by position as well; matching these offsets against ``df.index``
    # labels would exclude unrelated rows and could re-add already selected ones.
    picked_positions = set(diverse_index)
    remaining = df.iloc[[pos for pos in range(len(df)) if pos not in picked_positions]]

    # Pad with the first leftover rows. The former ``sample`` branch only ran when the
    # diverse pick already exceeded ``num_examples`` and therefore always requested
    # zero rows, which ``head(0)`` reproduces.
    num_extra = max(num_examples - len(diverse_samples), 0)
    extra_samples = remaining.head(n=num_extra)

    new_samples.append(diverse_samples)
    new_samples.append(extra_samples)

new_samples = pd.concat(new_samples)
# Shuffle with a fixed seed so the resulting order is reproducible.
new_train_data = new_samples.sample(frac=1, random_state=1)

In [86]:
# Create dicts for each sample:
# - input_sentence w/ target span surrounded with ** for highlighting
# - frame_name
# - frame_elements (as text, not spans)

def get_json_output(text, frame_elements):
    """Return ``frame_elements`` as a new dict ordered by first occurrence in ``text``.

    The sort key is ``text.find(value)``: a value that does not occur in ``text``
    gets -1 and therefore comes first, and entries with equal keys keep their
    original relative order.
    """
    def first_occurrence(item):
        return text.find(item[1])

    ordered_items = list(frame_elements.items())
    ordered_items.sort(key=first_occurrence)
    return dict(ordered_items)

# Turn the sampled training rows into chat-format records and write them as JSONL.
test_samples = []
frame_descriptions_dict = frame_descriptions.set_index('name').to_dict()['description']

for idx, data in new_train_data.iterrows():
    # Columns: 'target', 'text', 'tokens', 'lu', 'frame', 'fe'
    text = data['text']
    target_start, target_end = data['target'][0], data['target'][1]

    # Map each frame-element name to the text of its span.
    # A name that occurs more than once keeps only its last span.
    frame_elements = {}
    for fe in data['fe']:
        frame_elements[fe[0]] = text[fe[1]:fe[2]]

    # Rows without any frame element are not exported.
    if len(frame_elements) == 0:
        continue

    # Highlight the target span by surrounding it with '**'.
    input_sentence = text[:target_start] + '**' + text[target_start:target_end] + '**' + text[target_end:]

    frame_name = data['frame']
    expected_output = get_json_output(text, frame_elements)

    sample = {
        'messages': [
            {
                'role': 'system',
                'content': system_prompt
            },
            {
                'role': 'user',
                'content': user_prompt.format(frame_info=frame_descriptions_dict[frame_name].strip(), input_sentence=input_sentence)
            },
            {
                'role': 'assistant',
                # Serialize as real JSON: formatting the dict directly would embed its
                # single-quoted Python repr inside the ```json fence. ensure_ascii=False
                # keeps non-ASCII sentence text verbatim.
                'content': assistant_prompt.format(output_sentence=json.dumps(expected_output, ensure_ascii=False))
            }
        ]
    }
    test_samples.append(sample)

train_df = pd.DataFrame(test_samples)
# Seeded shuffle so the written file is reproducible, matching the other sample() calls.
train_df.sample(frac=1, random_state=1).to_json('fn1.7-small-train-prompts.jsonl', orient='records', lines=True)

## Val Data

In [89]:
import pickle

# Load the pickled validation split; the next cell iterates over its rows.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load trusted data.
with open('../../data/raw/os_dev.pkl', 'rb') as f:
    val_data = pickle.load(f)


In [93]:
# Create dicts for each sample:
# - input_sentence w/ target span surrounded with ** for highlighting
# - frame_name
# - frame_elements (as text, not spans)

def get_json_output(text, frame_elements):
    """Return ``frame_elements`` as a new dict ordered by first occurrence in ``text``.

    The sort key is ``text.find(value)``: a value that does not occur in ``text``
    gets -1 and therefore comes first, and entries with equal keys keep their
    original relative order.
    """
    def first_occurrence(item):
        return text.find(item[1])

    ordered_items = list(frame_elements.items())
    ordered_items.sort(key=first_occurrence)
    return dict(ordered_items)

# Turn the validation rows into chat-format records and write them as JSONL.
test_samples = []
frame_descriptions_dict = frame_descriptions.set_index('name').to_dict()['description']

for idx, data in val_data.iterrows():
    # Columns: 'target', 'text', 'tokens', 'lu', 'frame', 'fe'
    text = data['text']
    target_start, target_end = data['target'][0], data['target'][1]

    # Map each frame-element name to the text of its span.
    # A name that occurs more than once keeps only its last span.
    frame_elements = {}
    for fe in data['fe']:
        frame_elements[fe[0]] = text[fe[1]:fe[2]]

    # Rows without any frame element are not exported.
    if len(frame_elements) == 0:
        continue

    # Highlight the target span by surrounding it with '**'.
    input_sentence = text[:target_start] + '**' + text[target_start:target_end] + '**' + text[target_end:]

    frame_name = data['frame']
    expected_output = get_json_output(text, frame_elements)

    sample = {
        'messages': [
            {
                'role': 'system',
                'content': system_prompt
            },
            {
                'role': 'user',
                'content': user_prompt.format(frame_info=frame_descriptions_dict[frame_name].strip(), input_sentence=input_sentence)
            },
            {
                'role': 'assistant',
                # Serialize as real JSON: formatting the dict directly would embed its
                # single-quoted Python repr inside the ```json fence. ensure_ascii=False
                # keeps non-ASCII sentence text verbatim.
                'content': assistant_prompt.format(output_sentence=json.dumps(expected_output, ensure_ascii=False))
            }
        ]
    }
    test_samples.append(sample)

val_df = pd.DataFrame(test_samples)
val_df.to_json('fn1.7-val-prompts.jsonl', orient='records', lines=True)
# pd.DataFrame(test_samples).to_json('fn1.7-val-prompts.jsonl', orient='records', lines=True)

In [None]:
# import tiktoken
# import pandas as pd

# # Count number of tokens in training data
# encoding = tiktoken.encoding_for_model('gpt-4o-mini')

# a = train_df.messages.apply(lambda x: sum([len(encoding.encode(y['content'])) for y in x]))
# b = val_df.messages.apply(lambda x: sum([len(encoding.encode(y['content'])) for y in x]))
# a.sum() * (3/1000000) + b.sum() * (0.2 / 1000000)

np.float64(4.854758)

## Test Data

In [96]:
import pickle

# Load the pickled test split; the next cell iterates over its rows.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load trusted data.
with open('../../data/raw/os_test.pkl', 'rb') as f:
    test_data = pickle.load(f)


In [97]:
# Create dicts for each sample:
# - input_sentence w/ target span surrounded with ** for highlighting
# - frame_name
# - frame_elements (as text, not spans)

def get_json_output(text, frame_elements):
    """Return ``frame_elements`` as a new dict ordered by first occurrence in ``text``.

    The sort key is ``text.find(value)``: a value that does not occur in ``text``
    gets -1 and therefore comes first, and entries with equal keys keep their
    original relative order.
    """
    def first_occurrence(item):
        return text.find(item[1])

    ordered_items = list(frame_elements.items())
    ordered_items.sort(key=first_occurrence)
    return dict(ordered_items)

# Turn the test rows into chat-format records and write them as JSONL.
test_samples = []
frame_descriptions_dict = frame_descriptions.set_index('name').to_dict()['description']

for idx, data in test_data.iterrows():
    # Columns: 'target', 'text', 'tokens', 'lu', 'frame', 'fe'
    text = data['text']
    target_start, target_end = data['target'][0], data['target'][1]

    # Map each frame-element name to the text of its span.
    # A name that occurs more than once keeps only its last span.
    frame_elements = {}
    for fe in data['fe']:
        frame_elements[fe[0]] = text[fe[1]:fe[2]]

    # Rows without any frame element are not exported.
    if len(frame_elements) == 0:
        continue

    # Highlight the target span by surrounding it with '**'.
    input_sentence = text[:target_start] + '**' + text[target_start:target_end] + '**' + text[target_end:]

    frame_name = data['frame']
    expected_output = get_json_output(text, frame_elements)

    sample = {
        'messages': [
            {
                'role': 'system',
                'content': system_prompt
            },
            {
                'role': 'user',
                'content': user_prompt.format(frame_info=frame_descriptions_dict[frame_name].strip(), input_sentence=input_sentence)
            },
            {
                'role': 'assistant',
                # Serialize as real JSON: formatting the dict directly would embed its
                # single-quoted Python repr inside the ```json fence. ensure_ascii=False
                # keeps non-ASCII sentence text verbatim.
                'content': assistant_prompt.format(output_sentence=json.dumps(expected_output, ensure_ascii=False))
            }
        ]
    }
    test_samples.append(sample)


pd.DataFrame(test_samples).to_json('fn1.7-test-prompts.jsonl', orient='records', lines=True)