In [1]:
import json

import pandas as pd

# Table of frame descriptions; later cells index it by its 'name' column and
# read the 'description' column to fill the user prompt.
frame_descriptions = pd.read_csv('frame_descriptions_json.csv')

# System message: fixed task instructions, identical for every sample.
system_prompt = """### Task:
You are given a sentence and a frame with its associated frame elements and sometimes examples. Your task is to label the frame elements in the sentence using JSON. Keys should only be one of the defined frame elements. Do not make up your own frame elements, and do not remove or change the input in any way. Identify the frame elements based on the highlighted target word. 

### Notes:
- Return the tagged sentence in a ```json ``` code block.
- Texts must not overlap.
"""

# User message template; {frame_info} and {input_sentence} are filled with str.format.
user_prompt = """### Frame Information:
{frame_info}

### Input:
{input_sentence}
"""

# Assistant message template (the expected answer); {output_sentence} is filled
# with str.format and ends up inside the ```json fence.
assistant_prompt = """### Output: 
```json
{output_sentence}
```
"""


In [2]:
# Sanity check: show the second column of the first row of the frame table.
print(frame_descriptions.to_numpy()[0][1])

Frame Name: Process_continue
Frame Definition: An Event continues at a certain Place through Time. (Note that often when 'continue.v' occurs with definite time points, it denotes Resumption, which is out of frame here.)

Frame Elements:
Depictive (Extra-Thematic): Depictive phrase about the state of a focal participant.
Manner (Peripheral): Manner in which the Event is continuing.
Duration (Peripheral): This FE identifies the Duration of time over which the Event is ongoing.
Event (Core-Unexpressed): Name of the Event which is continuing.
Place (Peripheral): Where the Event takes place.
Explanation (Extra-Thematic): The Explanation for which the Event occurs.
Time (Peripheral): When the Event occurs.
Next_subevent (Extra-Thematic): The Next_subevent of the overall Event.
Circumstances (Extra-Thematic): Circumstances describe the state of the world (at a particular time and place) which is specifically independent of the event itself and any of its participants.
Concessive (Extra-Themat

## Train Data

In [3]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../../data/raw/os_train.pkl', mode='rb') as train_pickle:
    train_data = pickle.load(train_pickle)


In [4]:
# Create dicts for each sample:
# - input_sentence w/ target span surrounded with ** for highlighting
# - frame_name
# - frame_elements (as text, not spans)

def get_json_output(text, frame_elements):
    """Return the frame elements as a dict ordered by position in ``text``.

    Args:
        text: The sentence the frame-element strings are searched in.
        frame_elements: Mapping of frame-element name -> frame-element text.

    Returns:
        A new dict with the same items, ordered by the index of the first
        occurrence of each value in ``text`` (``str.find``). Values that do
        not occur get index -1 and therefore come first; ties keep their
        original relative order because the sort is stable.

    Note:
        The ordering uses the first textual occurrence, not the annotated
        span, so a value that also appears earlier in the sentence is
        ordered by that earlier position.
    """
    def first_occurrence(item):
        _, fe_text = item
        return text.find(fe_text)

    ordered_pairs = list(frame_elements.items())
    ordered_pairs.sort(key=first_occurrence)
    return {fe_name: fe_text for fe_name, fe_text in ordered_pairs}

# Collects one chat-format sample per exported sentence of the train split.
# (The list keeps its original name `test_samples` although it holds train samples here.)
test_samples = []
# Frame name -> frame description text, used to fill the user prompt.
frame_descriptions_dict = frame_descriptions.set_index('name').to_dict()['description']

for idx, data in train_data.iterrows():
    # Index(['target', 'text', 'tokens', 'lu', 'frame', 'fe'], dtype='object')
    text = data['text']

    # Get frame elements as text; each `fe` is indexed as (name, start, end).
    # A repeated frame-element name keeps only its last span (dict keys are unique).
    frame_elements = {fe[0]: text[fe[1]:fe[2]] for fe in data['fe']}
    if not frame_elements:
        # Sentences without any frame element are not exported.
        continue

    # Get input sentence: wrap the target span in ** to highlight it.
    target_start, target_end = data['target'][0], data['target'][1]
    input_sentence = text[:target_start] + '**' + text[target_start:target_end] + '**' + text[target_end:]

    # Get frame name
    frame_name = data['frame']

    # Get expected output. Serialise with json.dumps so the ```json block holds
    # valid JSON (double-quoted, escaped) rather than the Python dict repr;
    # ensure_ascii=False keeps non-ASCII text identical to the input sentence.
    expected_output = json.dumps(get_json_output(text, frame_elements), ensure_ascii=False)

    sample = {
        'messages': [
            {
                'role': 'system',
                'content': system_prompt
            },
            {
                'role': 'user',
                'content': user_prompt.format(frame_info=frame_descriptions_dict[frame_name].strip(), input_sentence=input_sentence)
            },
            {
                'role': 'assistant',
                'content': assistant_prompt.format(output_sentence=expected_output)
            }
        ]
    }

    test_samples.append(sample)


# One JSON object per line (JSONL), one line per sample.
pd.DataFrame(test_samples).to_json('fn1.7-train-prompts.jsonl', orient='records', lines=True)

## Val Data

In [5]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../../data/raw/os_dev.pkl', mode='rb') as val_pickle:
    val_data = pickle.load(val_pickle)


In [6]:
# Create dicts for each sample:
# - input_sentence w/ target span surrounded with ** for highlighting
# - frame_name
# - frame_elements (as text, not spans)

def get_json_output(text, frame_elements):
    """Return the frame elements as a dict ordered by position in ``text``.

    Args:
        text: The sentence the frame-element strings are searched in.
        frame_elements: Mapping of frame-element name -> frame-element text.

    Returns:
        A new dict with the same items, ordered by the index of the first
        occurrence of each value in ``text`` (``str.find``). Values that do
        not occur get index -1 and therefore come first; ties keep their
        original relative order because the sort is stable.

    Note:
        The ordering uses the first textual occurrence, not the annotated
        span, so a value that also appears earlier in the sentence is
        ordered by that earlier position.
    """
    def first_occurrence(item):
        _, fe_text = item
        return text.find(fe_text)

    ordered_pairs = list(frame_elements.items())
    ordered_pairs.sort(key=first_occurrence)
    return {fe_name: fe_text for fe_name, fe_text in ordered_pairs}

# Collects one chat-format sample per exported sentence of the validation split.
# (The list keeps its original name `test_samples` although it holds val samples here.)
test_samples = []
# Frame name -> frame description text, used to fill the user prompt.
frame_descriptions_dict = frame_descriptions.set_index('name').to_dict()['description']

for idx, data in val_data.iterrows():
    # Index(['target', 'text', 'tokens', 'lu', 'frame', 'fe'], dtype='object')
    text = data['text']

    # Get frame elements as text; each `fe` is indexed as (name, start, end).
    # A repeated frame-element name keeps only its last span (dict keys are unique).
    frame_elements = {fe[0]: text[fe[1]:fe[2]] for fe in data['fe']}
    if not frame_elements:
        # Sentences without any frame element are not exported.
        continue

    # Get input sentence: wrap the target span in ** to highlight it.
    target_start, target_end = data['target'][0], data['target'][1]
    input_sentence = text[:target_start] + '**' + text[target_start:target_end] + '**' + text[target_end:]

    # Get frame name
    frame_name = data['frame']

    # Get expected output. Serialise with json.dumps so the ```json block holds
    # valid JSON (double-quoted, escaped) rather than the Python dict repr;
    # ensure_ascii=False keeps non-ASCII text identical to the input sentence.
    expected_output = json.dumps(get_json_output(text, frame_elements), ensure_ascii=False)

    sample = {
        'messages': [
            {
                'role': 'system',
                'content': system_prompt
            },
            {
                'role': 'user',
                'content': user_prompt.format(frame_info=frame_descriptions_dict[frame_name].strip(), input_sentence=input_sentence)
            },
            {
                'role': 'assistant',
                'content': assistant_prompt.format(output_sentence=expected_output)
            }
        ]
    }

    test_samples.append(sample)

# Only the first 200 samples are exported for validation; drop `.head(200)`
# to write the full split instead.
pd.DataFrame(test_samples).head(200).to_json('fn1.7-val-prompts.jsonl', orient='records', lines=True)

## Test Data

In [7]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../../data/raw/os_test.pkl', mode='rb') as test_pickle:
    test_data = pickle.load(test_pickle)


In [8]:
# Create dicts for each sample:
# - input_sentence w/ target span surrounded with ** for highlighting
# - frame_name
# - frame_elements (as text, not spans)

def get_json_output(text, frame_elements):
    """Return the frame elements as a dict ordered by position in ``text``.

    Args:
        text: The sentence the frame-element strings are searched in.
        frame_elements: Mapping of frame-element name -> frame-element text.

    Returns:
        A new dict with the same items, ordered by the index of the first
        occurrence of each value in ``text`` (``str.find``). Values that do
        not occur get index -1 and therefore come first; ties keep their
        original relative order because the sort is stable.

    Note:
        The ordering uses the first textual occurrence, not the annotated
        span, so a value that also appears earlier in the sentence is
        ordered by that earlier position.
    """
    def first_occurrence(item):
        _, fe_text = item
        return text.find(fe_text)

    ordered_pairs = list(frame_elements.items())
    ordered_pairs.sort(key=first_occurrence)
    return {fe_name: fe_text for fe_name, fe_text in ordered_pairs}

# Collects one chat-format sample per exported sentence of the test split.
test_samples = []
# Frame name -> frame description text, used to fill the user prompt.
frame_descriptions_dict = frame_descriptions.set_index('name').to_dict()['description']

for idx, data in test_data.iterrows():
    # Index(['target', 'text', 'tokens', 'lu', 'frame', 'fe'], dtype='object')
    text = data['text']

    # Get frame elements as text; each `fe` is indexed as (name, start, end).
    # A repeated frame-element name keeps only its last span (dict keys are unique).
    frame_elements = {fe[0]: text[fe[1]:fe[2]] for fe in data['fe']}
    if not frame_elements:
        # Sentences without any frame element are not exported.
        continue

    # Get input sentence: wrap the target span in ** to highlight it.
    target_start, target_end = data['target'][0], data['target'][1]
    input_sentence = text[:target_start] + '**' + text[target_start:target_end] + '**' + text[target_end:]

    # Get frame name
    frame_name = data['frame']

    # Get expected output. Serialise with json.dumps so the ```json block holds
    # valid JSON (double-quoted, escaped) rather than the Python dict repr;
    # ensure_ascii=False keeps non-ASCII text identical to the input sentence.
    expected_output = json.dumps(get_json_output(text, frame_elements), ensure_ascii=False)

    sample = {
        'messages': [
            {
                'role': 'system',
                'content': system_prompt
            },
            {
                'role': 'user',
                'content': user_prompt.format(frame_info=frame_descriptions_dict[frame_name].strip(), input_sentence=input_sentence)
            },
            {
                'role': 'assistant',
                'content': assistant_prompt.format(output_sentence=expected_output)
            }
        ]
    }

    test_samples.append(sample)


# One JSON object per line (JSONL), one line per sample.
pd.DataFrame(test_samples).to_json('fn1.7-test-prompts.jsonl', orient='records', lines=True)