# Make Fine-Tuning Dataset

In [48]:
import pandas as pd
import os, random
import nltk # For sentence tokenization
from nltk.tokenize import sent_tokenize
import SonicScrewdriver as utils
import tiktoken as tk

input_dir = 'forfinetuning'
# print working directory
print(os.getcwd())

tokenizer = tk.encoding_for_model('gpt-3.5-turbo')


/Users/tunder/Library/CloudStorage/Dropbox/python/GPT-1914/anachronism


In [52]:
input_files = [x for x in os.listdir(input_dir) if x.endswith('.txt')]

metadata = pd.read_csv('../metadata/20cmeta_plus_enum.tsv', sep='\t')

def dirty_pairtree(htid):
    period = htid.find('.')
    prefix = htid[0:period]
    postfix = htid[(period+1): ]
    if '=' in postfix:
        postfix = postfix.replace('+',':')
        postfix = postfix.replace('=','/')
    if ',' in postfix:
        postfix = postfix.replace(',','.')
    dirtyname = prefix + "." + postfix
    return dirtyname

def make_segments(afile):
    with open(afile, 'r') as f:
        text = f.read()
    
    text = text.replace('\n', ' ').replace('\r', ' ') # Remove newlines and carriage returns
    text = text.replace('\t', ' ').replace('<pb>', ' ') # Remove tabs and page breaks
    text = text.replace('  ', ' ') # Remove double spaces

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # we use tiktoken to tokenize the sentence
    # and record the length of each sentence in tokens
    sentence_lengths = []
    
    for s in sentences:
        tokens = tokenizer.encode(s)
        sentence_lengths.append(len(tokens))

    # We skip the first 10 sentences of the text, which may be front matter.
    # Then we select segments of at least 130 tokens that start and end with complete sentences,

    skipnumber = 10  # Skip the first three sentences of the text
    segments = []
    start = skipnumber
    use_segment = True

    while start < len(sentences) - 10:
        segment = ''
        segment_length = 0
        end = start
        while end < len(sentences) - 1:
            segment += sentences[end] + ' '
            segment_length += sentence_lengths[end]
            end += 1
            if segment_length > 130:
                break

        if segment_length > 130:
            segments.append(segment.strip())  
        start = end

    return segments


In [25]:
metadata.head()

Unnamed: 0,HTid,recordid,OCLC,LOCnum,author,imprint,datetype,startdate,enddate,textdate,place,enumcron,materialtype,subjects,genres,title,enum_start_date,enum_end_date,inferred_date
0,mdp.39015058525026,2530,166684,<blank>,"Elias, Frank,",London;A. and C. Black;1913.,|,1913,||||,1913,|||,<blank>,monograph,East Asia;Description and travel,UnknownGenre,"The gorgeous East: India, Burma, Ceylon, and Siam",,,1913
1,uc2.ark:/13960/t07w69098,2530,166684,<blank>,"Elias, Frank,",London;A. and C. Black;1913.,|,1913,||||,1913,|||,<blank>,monograph,East Asia;Description and travel,UnknownGenre,"The gorgeous East: India, Burma, Ceylon, and Siam",,,1913
2,mdp.39015002973645,3162,216282,CT275.B5957A3,"Braly, John Hyde.",Los Angeles;1912,s,1912,,1912,cau,<blank>,monograph,"Braly, John Hyde;IsBiographical",NotFiction;Autobiography,"Memory pictures, an autobiography",,,1912
3,mdp.39015003834085,4094,285988,<blank>,"Bradford, William,",Boston;published for the Massachusetts Histori...,|,1912,||||,1912,|||,v.1,monograph,"New Plymouth, 1620-1691;Massachusetts;History",UnknownGenre,"History of Plymouth plantation, 1620-1647",,,1912
4,mdp.39015003833764,4094,285988,<blank>,"Bradford, William,",Boston;published for the Massachusetts Histori...,|,1912,||||,1912,|||,v.2,monograph,"New Plymouth, 1620-1691;Massachusetts;History",UnknownGenre,"History of Plymouth plantation, 1620-1647",,,1912


In [None]:
def create_messages(list_of_files, metadata):
    messages = []

    for file in list_of_files:
        htid = dirty_pairtree(file.replace('.trim.txt', ''))
        if metadata.HTid.isin([htid]).any():
            date = metadata.loc[metadata.HTid == htid, 'inferred_date'].values[0]
            author = metadata.loc[metadata.HTid == htid, 'author'].values[0]
            title = metadata.loc[metadata.HTid == htid, 'title'].values[0]
            print(htid, date, author, title)
        else:
            print(htid, " not found in metadata")
            date = "1900"
            author = "Unknown"
            title = "Unknown"
        segments = make_segments(os.path.join(input_dir, file))
        system_prompt = f"Your task is to complete passages from early twentieth-century books while closely matching the style. \
You will be given a passage from a book published in {date}. Continue \
this passage in the same style for roughly 120 words. Only provide \
this continuation; do not make any framing remarks, like 'here is the \
continuation:'."
        system_prompt = system_prompt.replace('\n', ' ').replace('\t', ' ').replace('  ', ' ')
        # We randomly sample 20 segments from the list of segments,
        # and in each case take both the segment (as user) 
        # and the next segment (as assistant).
        if len(segments) < 10:
            print("Skipping ", htid, " because it has fewer than 10 segments.")
            continue
        elif len(segments) < 20:
            k = len(segments) - 3
        else:
            k = 20
        for i in range(k):
            start = random.randint(0, len(segments) - 2)
            end = start + 1
            user_segment = segments[start]
            assistant_segment = segments[end]

            # we need to put the system prompt, user, and assistant in this format:
            # {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
            message = {"messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_segment}, {"role": "assistant", "content": assistant_segment}]}
            messages.append(message)

    return messages

In [56]:
# randomly sample twenty input_files as the test set,
# and the rest as the training set

random.shuffle(input_files)
test_files = input_files[:20]
train_files = input_files[20:]

test_messages = create_messages(test_files, metadata)
train_messages = create_messages(train_files, metadata)

print(len(test_messages), len(train_messages))

loc.ark:/13960/t1hh7dt27 1914 Orsi, Pietro, Cavour and the making of modern Italy, 1810-1861
mdp.39015033593651 1902 Robertson, John George, A history of German literature
mdp.39015074763429 1909 <blank> Bulletin of Armour Institute of Technology
pst.000004400813 1904 Phillips, H. Joshua. Gold assaying
nyp.33433081802724 1911 Miller, Francis Trevelyan, The photographic history of the civil war
coo.31924097859510 0 <blank> Technical report
nc01.ark:/13960/t6b27zx12 1907 Lumpkin, Wilson, The removal of the Cherokee Indians from Georgia
uc1.$b576810 1913 Ashmead-Bartlett, Ellis, With the Turks in Thrace
uc2.ark:/13960/t05x27028 1900 Ouida, The waters of Edera
mdp.39015031974119 1906 Molloy, J. Fitzgerald Sir Joshua and his circle
uc1.$b361040 1907 Hearn, George A., Collection of watches loaned to the Metropolitan museum of art of the city of New York
loc.ark:/13960/t8cf9vh68 1904 Mitchell, H. E. The Winthrop register, 1903-4
uva.x030515061 1902 Stiles, Charles Wardell, Index-catalogue of 

In [30]:
htidlist = metadata.HTid.tolist()
print('nyp.33433067368013' in htidlist, 'we can find it in the list')
print('nyp.33433067368013' in metadata.HTid, 'but not in the dataframe column. Why?')

True we can find it in the list
False but not in the dataframe column. Why?


In [57]:
import json

# Write test_messages to a JSONL file
with open('finetuningdata/test_messages.jsonl', 'w') as f:
    for message in test_messages:
        f.write(json.dumps(message) + '\n')

# Write train_messages to a JSONL file
with open('finetuningdata/train_messages.jsonl', 'w') as f:
    for message in train_messages:
        f.write(json.dumps(message) + '\n')

In [21]:
len(input_files)

120

In [39]:
test_files

['uc2.ark+=13960=t0cv4j21b.trim.txt',
 'pst.000057655673.trim.txt',
 'wu.89092546217.trim.txt',
 'nnc1.cr00454028.trim.txt',
 'uc1.$b766087.trim.txt',
 'mdp.39015039600534.trim.txt',
 'uc2.ark+=13960=t2n58fq8n.trim.txt',
 'uc2.ark+=13960=t9r20vv3s.trim.txt',
 'mdp.39015078698563.trim.txt',
 'wu.89062856943.trim.txt',
 'uc2.ark+=13960=t7br8pn9k.trim.txt',
 'inu.30000108609532.trim.txt',
 'coo.31924064977493.trim.txt',
 'chi.73615062.trim.txt',
 'ncs1.ark+=13960=t4bp03v1t.trim.txt',
 'mdp.39015003295584.trim.txt',
 'yale.39002002885888.trim.txt',
 'nyp.33433066333976.trim.txt',
 'mdp.39015086690230.trim.txt',
 'loc.ark+=13960=t3cz3wn8z.trim.txt']