# Make Qual Fine-Tuning Data

We have fine-tuning data in JSON, but it isn't in the right format and contains unnecessary fields. This notebook converts it into the chat-message JSONL format required for submission to OpenAI (OAI), and writes it out as five train/validation folds.

In [23]:
import json
import textwrap
import random

In [24]:
pwd

'/Users/tunder/Library/CloudStorage/Dropbox/python/GPT-1914/anachronism/qualitative'

In [32]:
# Load the question records. Later cells iterate over this list and read each
# record's 'subj', 'user' and 'assistant' fields.
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 everywhere.
with open('clean_questions.json', encoding='utf-8') as f:
    questions = json.load(f)

In [27]:
def print_wrapped_text(text, width=70):
    """Print ``text`` wrapped to ``width`` columns, keeping its own line breaks.

    The text is split on newlines and every resulting line is wrapped and
    printed independently, so an empty line in the input is printed as an
    empty line in the output.

    Args:
        text: String to print; may contain newline characters.
        width: Maximum line width handed to ``textwrap.TextWrapper``.
    """
    # One wrapper serves every line, so the caller's width is honoured for
    # multi-line text as well as for single-line text.
    wrapper = textwrap.TextWrapper(width=width)
    for line in text.split('\n'):
        print(wrapper.fill(line))

# System prompt placed at the start of every fine-tuning example. It is typed
# with hard line breaks for readability and reflowed by the statement that
# follows the literal.
system_prompt = '''You are producing a simulation of early twentieth-century print culture.
Respond to all questions and instructions as if you were living in early 1914, using
only knowledge written down or believed at the time. If you're asked about a person, event, or idea that wouldn't have been discussed in print in 1914,
explain that you're unfamiliar with the term. 

You may be asked to complete stories. The proper names of some characters
may be changed, but that's not important; you'll recognize the plots.
If you are asked to complete a story that was published before 1914, complete it as originally written.
If the story was published after 1914, you would be unfamiliar with it, so
you must invent a different set of events that could complete the story--
without echoing any story published after 1914.

Some ideas and terms that are now unacceptable would have been normal in the early twentieth century. 
So let yourself be guided by the language and attitudes of the time. Also, since this
is meant to be a realistic simulation, avoid self-conscious remarks that give away
your later perspective--like, "Here in 1914, we still believe that ..." Simply
respond as if you were living in the early twentieth century. 
'''

# Reflow the prompt so each paragraph sits on a single line: blank-line
# paragraph breaks are parked as tabs, the remaining single newlines become
# spaces, and the tabs are then turned back into blank lines.
# This assumes the prompt text itself contains no tab characters.
# Source lines that end in a space produce a double space after reflow.
system_prompt = system_prompt.replace('\n\n', '\t').replace('\n', ' ').replace('\t', '\n\n')
print_wrapped_text(system_prompt)

You are producing a simulation of early twentieth-century print
culture. Respond to all questions and instructions as if you were
living in early 1914, using only knowledge written down or believed at
the time. If you're asked about a person, event, or idea that wouldn't
have been discussed in print in 1914, explain that you're unfamiliar
with the term.

You may be asked to complete stories. The proper names of some
characters may be changed, but that's not important; you'll recognize
the plots. If you are asked to complete a story that was published
before 1914, complete it as originally written. If the story was
published after 1914, you would be unfamiliar with it, so you must
invent a different set of events that could complete the story--
without echoing any story published after 1914.

Some ideas and terms that are now unacceptable would have been normal
in the early twentieth century.  So let yourself be guided by the
language and attitudes of the time. Also, since this is meant

In [33]:
# The train/validation split is made per subject, so every question must carry
# a 'subj' field. Questions that lack one get a generated label ('subj1',
# 'subj2', ...), numbered in the order they are encountered.

ctr = 0
allsubjs = set()
for q in questions:
    if 'subj' not in q:
        ctr += 1
        q['subj'] = f'subj{ctr}'
    allsubjs.add(q['subj'])

In [58]:
def _format_conversation(q):
    """Turn one question record into a chat fine-tuning example.

    The result has the shape
    ``{"messages": [{"role": "system", ...}, {"role": "user", ...},
    {"role": "assistant", ...}, ...]}``: the module-level ``system_prompt``
    first, then user and assistant turns alternating, paired by position from
    ``q['user']`` and ``q['assistant']``.
    """
    # The turns must pair up one-to-one or the roles would not alternate.
    assert len(q['user']) == len(q['assistant'])
    messages = [{"role": "system", "content": system_prompt}]
    for user_segment, assistant_segment in zip(q['user'], q['assistant']):
        messages.append({"role": "user", "content": user_segment})
        messages.append({"role": "assistant", "content": assistant_segment})
    return {"messages": messages}


def get_train_and_val_fold(train_subjs, validate_subjs, foldnum):
    """Split the module-level ``questions`` into one train/validation fold.

    All questions that share a ``'subj'`` value land on the same side of the
    split. Subjects already present in ``train_subjs`` or ``validate_subjs``
    keep that assignment (train wins if a subject is in both). Every other
    subject is assigned at random the first time it is seen, with a train
    probability that is re-tuned after each question to steer the split
    towards roughly 80/20.

    Args:
        train_subjs: Set of subjects forced into train. Mutated in place:
            the fiction subjects not held out for this fold, and every
            randomly assigned train subject, are added to it.
        validate_subjs: Set of subjects forced into validation. Mutated in
            place in the same way, and also returned.
        foldnum: Index of the fold. The fiction subject at this index is held
            out for validation. In the last fold every subject not already in
            ``train_subjs`` goes to validation.

    Returns:
        A tuple ``(formatted_train, formatted_validate, validate_subjs)``.
        The first two are lists of chat-format examples; the third is the
        updated ``validate_subjs`` set.
    """
    fictions = ['frozen', 'hobbit', 'hungergames', 'starwars', 'wharton']
    last_fold = len(fictions) - 1

    # Exactly one fiction subject is held out for validation in each fold.
    for i, fiction in enumerate(fictions):
        if i != foldnum:
            train_subjs.add(fiction)
        else:
            validate_subjs.add(fiction)

    train = []
    validate = []

    # Probability of sending a not-yet-assigned subject to train. The last fold
    # must collect *all* remaining subjects, so it starts at 0 instead of 0.8.
    # Otherwise the first unassigned question could go to train and never be
    # validated in any fold.
    target_proportion = 0 if foldnum == last_fold else 0.8

    # Smoothed train share, (train + 1) / (total + 2). It is initialised here
    # so the summary print below also works when ``questions`` is empty.
    balance = (len(train) + 1) / (len(train) + len(validate) + 2)

    for q in questions:
        subj = q['subj']
        if subj in train_subjs:
            train.append(q)
        elif subj in validate_subjs:
            validate.append(q)
        elif random.random() < target_proportion:
            train.append(q)
            train_subjs.add(subj)
        else:
            validate.append(q)
            validate_subjs.add(subj)

        # Re-tune the probability for the next question, because forced
        # assignments push the split away from the target.
        # NOTE(review): 0.784 / 0.785, the factor 6 and the cap of 55 look
        # hand-tuned; the 0.784 vs 0.785 mismatch may be unintentional. Confirm.
        balance = (len(train) + 1) / (len(train) + len(validate) + 2)
        if foldnum == last_fold:
            target_proportion = 0  # the last fold takes all remaining questions
        elif len(validate) > 55:
            target_proportion = 1  # the validate set is full
        elif balance < 0.784:
            target_proportion = 0.784 + (0.785 - balance)
        else:
            target_proportion = 0.784 - (balance - 0.785) * 6

    print(f'Train set has {len(train)} questions.')
    print(f'Validate set has {len(validate)} questions.')
    print(f'Balance is {balance}.')

    formatted_train = [_format_conversation(q) for q in train]
    formatted_validate = [_format_conversation(q) for q in validate]

    return formatted_train, formatted_validate, validate_subjs

In [34]:
allsubjs

{'atoms',
 'cholera',
 'churchill',
 'ethnology',
 'eugenics',
 'fitzgerald',
 'fossils',
 'frozen',
 'goblinmarket',
 'hobbit',
 'hungergames',
 'indochina',
 'radio',
 'sinojapanese',
 'starwars',
 'subj1',
 'subj10',
 'subj100',
 'subj101',
 'subj102',
 'subj103',
 'subj104',
 'subj105',
 'subj106',
 'subj107',
 'subj108',
 'subj109',
 'subj11',
 'subj110',
 'subj111',
 'subj112',
 'subj113',
 'subj114',
 'subj115',
 'subj116',
 'subj117',
 'subj118',
 'subj119',
 'subj12',
 'subj120',
 'subj121',
 'subj122',
 'subj123',
 'subj124',
 'subj125',
 'subj126',
 'subj127',
 'subj128',
 'subj129',
 'subj13',
 'subj130',
 'subj131',
 'subj132',
 'subj133',
 'subj134',
 'subj135',
 'subj136',
 'subj137',
 'subj138',
 'subj139',
 'subj14',
 'subj140',
 'subj141',
 'subj142',
 'subj143',
 'subj144',
 'subj145',
 'subj146',
 'subj147',
 'subj148',
 'subj149',
 'subj15',
 'subj150',
 'subj151',
 'subj152',
 'subj153',
 'subj154',
 'subj155',
 'subj156',
 'subj157',
 'subj158',
 'subj159',
 'sub

In [59]:
def write_jsonl(filename, jsonlist):
    """Write ``jsonlist`` to ``filename`` in JSON Lines form.

    Each item is serialised with ``json.dumps`` and written on its own line.
    An existing file is overwritten.
    """
    serialised_lines = (json.dumps(record) + '\n' for record in jsonlist)
    with open(filename, 'w') as out:
        out.writelines(serialised_lines)

# Build five train/validation folds and write each one to disk.
# The split draws from the global `random` generator, so seed it to make a
# Restart & Run All reproduce the same folds.
random.seed(1914)

train_subjs = set()
validate_subjs = set()
used_validates = set()   # every subject validated in any fold so far

for i in range(5):

    train, val, validate_subjs = get_train_and_val_fold(train_subjs, validate_subjs, i)

    train_outfile = f'train_{i}.jsonl'
    val_outfile = f'val_{i}.jsonl'
    write_jsonl(train_outfile, train)
    write_jsonl(val_outfile, val)

    print(f'Wrote {train_outfile} and {val_outfile}')
    print(f'Length of validate_subjs: {len(validate_subjs)}')
    used_validates.update(validate_subjs)
    print(f'Validate subjs: {validate_subjs}')

    # A subject that has already been validated is forced into train in all
    # later folds, so no subject is validated twice.
    validate_subjs = set()
    train_subjs = set(used_validates)
    print(len(train_subjs))


Train set has 209 questions.
Validate set has 56 questions.
Balance is 0.7865168539325843.
Wrote train_0.jsonl and val_0.jsonl
Length of validate_subjs: 53
Validate subjs: {'subj94', 'cholera', 'subj220', 'subj222', 'subj92', 'subj61', 'subj190', 'subj128', 'subj50', 'subj75', 'subj18', 'subj179', 'subj224', 'subj37', 'subj49', 'subj169', 'subj207', 'subj232', 'subj122', 'subj136', 'subj22', 'subj38', 'subj135', 'subj102', 'subj140', 'subj31', 'subj172', 'subj173', 'subj211', 'subj213', 'subj93', 'subj82', 'subj97', 'subj120', 'subj229', 'subj64', 'subj155', 'subj14', 'subj15', 'subj6', 'subj185', 'frozen', 'subj73', 'subj88', 'subj161', 'subj27', 'subj137', 'subj32', 'subj163', 'subj76', 'subj175', 'subj192', 'subj104'}
53
Train set has 213 questions.
Validate set has 52 questions.
Balance is 0.8014981273408239.
Wrote train_1.jsonl and val_1.jsonl
Length of validate_subjs: 50
Validate subjs: {'subj42', 'subj5', 'subj149', 'subj159', 'subj126', 'subj60', 'subj101', 'subj43', 'subj3', '

In [22]:
# Dump `validate` as a single JSON document to orig_format_val1.json.
# NOTE(review): `validate` is not assigned at notebook level in any visible
# cell; it only appears as a local inside get_train_and_val_fold. This cell's
# execution count is also lower than that of the current function definition.
# Presumably it relied on state from an earlier kernel session. Confirm where
# `validate` should come from, or remove the cell, before a Restart & Run All.
with open('orig_format_val1.json', mode = 'w') as f:
    json.dump(validate, f)