# Create test set

In the file `explore_data.ipynb`, I found that a token cutoff of 3000 covers over 95% of all tokens and articles. I also found that a cutoff of 30% symbol tokens captures about 90% of tokens and articles. To obtain a test set that is neither too onerous to produce nor of too poor quality to recover, I am going to combine both of these criteria.

In [2]:
import pandas as pd
import tiktoken
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time  # Make sure to import the time module
import shutil
import PyPDF2

from helper_functions import identify_file, find_pdf_path, extract_pages_from_pdf, process_pdfs


from dotenv import load_dotenv, find_dotenv
load_dotenv()

# Folder of parquet chunk files that are filtered and sampled further down.
directory = './data/ncse_text_chunks'

# Locations for the dev-set transcripts and the GPT outputs
# (not referenced by the cells visible in this notebook section).
dev_transcripts = 'data/dev_data_transcript'
dev_gpt4_results = 'data/dev_data_gpt-4-turbo-preview'
dev_gpt3_results = 'data/dev_data_gpt-3.5-turbo'

# Base location handed to find_pdf_path when locating issue PDFs; taken from
# the environment so that no machine-specific path is hard-coded here.
image_path = os.environ.get("image_path")



ModuleNotFoundError: No module named 'helper_functions'

Load the files individually filtering to ensure only the values valid for testing are included.

In [None]:
# Read the three metadata tables (pages, issues, publications) in one pass.
page_list, periodicals_issue, periodicals_publication = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/periodicals_page.parquet',
        'data/periodicals_issue.parquet',
        'data/periodicals_publication.parquet',
    )
)

In [None]:
# Attach issue-level fields (publication id, issue date) and then
# publication-level fields (slug, title) to every page row.
# Both joins use pandas' default inner merge.
page_info = (
    page_list
    .merge(
        periodicals_issue[['id', 'publication_id', 'issue_date']]
        .rename(columns={'id': 'issue_id'}),
        on='issue_id',
    )
    .merge(
        periodicals_publication[['id', 'slug', 'title']]
        .rename(columns={'id': 'publication_id'}),
        on='publication_id',
    )
)

page_info

Unnamed: 0,id,height,number,image,width,issue_id,article_count,label,publication_id,issue_date,slug,title
0,170269,2270,6,periodicals/101-NS5-1842-10-29-PG001-SINGLE/Pg...,1579,5747,9,6,27,1842-10-29,ns,Northern Star (1837-1852)
1,170270,2267,7,periodicals/101-NS5-1842-10-29-PG001-SINGLE/Pg...,1583,5747,13,7,27,1842-10-29,ns,Northern Star (1837-1852)
2,170271,2268,8,periodicals/101-NS5-1842-10-29-PG001-SINGLE/Pg...,1583,5747,14,8,27,1842-10-29,ns,Northern Star (1837-1852)
3,170264,2355,1,periodicals/101-NS5-1842-10-29-PG001-SINGLE/Pg...,1619,5747,18,unpag,27,1842-10-29,ns,Northern Star (1837-1852)
4,170265,2275,2,periodicals/101-NS5-1842-10-29-PG001-SINGLE/Pg...,1583,5747,7,2,27,1842-10-29,ns,Northern Star (1837-1852)
...,...,...,...,...,...,...,...,...,...,...,...,...
97694,153495,1433,72,periodicals/099-TEC-1890-04-01-001-SINGLE/Pg07...,922,4349,1,434,26,1890-04-01,pc,Publishers’ Circular (1880-1890)
97695,153496,1433,73,periodicals/099-TEC-1890-04-01-001-SINGLE/Pg07...,922,4349,1,435,26,1890-04-01,pc,Publishers’ Circular (1880-1890)
97696,153497,1433,74,periodicals/099-TEC-1890-04-01-001-SINGLE/Pg07...,922,4349,2,436,26,1890-04-01,pc,Publishers’ Circular (1880-1890)
97697,153498,1433,75,periodicals/099-TEC-1890-04-01-001-SINGLE/Pg07...,922,4349,2,iii,26,1890-04-01,pc,Publishers’ Circular (1880-1890)


In [None]:

# Draw 10 pages per periodical title.
# The seed is passed straight to the sampler instead of reseeding NumPy's
# global generator, so the draw does not depend on (or disturb) whatever
# other cells have done to the global random state.
test_set = page_info.groupby('title').sample(10, random_state=1842)



Unnamed: 0,id,height,number,image,width,issue_id,article_count,label,publication_id,issue_date,slug,title
73538,92283,1240,67,periodicals/041-EWJ-1859-02-01-001-SINGLE/Pg06...,817,2814,3,427,24,1859-02-01,ewj,English Woman’s Journal (1858-1864)
73723,92889,1222,29,periodicals/041-EWJ-1860-03-01-001-SINGLE/Pg02...,828,2823,1,29,24,1860-03-01,ewj,English Woman’s Journal (1858-1864)
75301,94900,1231,24,periodicals/041-EWJ-1862-02-01-001-SINGLE/Pg02...,802,2851,3,384,24,1862-02-01,ewj,English Woman’s Journal (1858-1864)
71257,90939,1244,25,periodicals/041-EWJ-1858-09-01-001-SINGLE/Pg02...,791,2796,1,25,24,1858-09-01,ewj,English Woman’s Journal (1858-1864)
75831,95438,1245,55,periodicals/041-EWJ-1863-02-02-001-SINGLE/Pg05...,805,2858,1,415,24,1863-02-02,ewj,English Woman’s Journal (1858-1864)
73888,93132,1243,53,periodicals/041-EWJ-1860-04-01-001-SINGLE/Pg05...,805,2826,1,125,24,1860-04-01,ewj,English Woman’s Journal (1858-1864)
72679,93872,1241,71,periodicals/041-EWJ-1861-10-01-001-SINGLE/Pg07...,820,2836,1,143,24,1861-10-01,ewj,English Woman’s Journal (1858-1864)
71970,92552,1228,4,periodicals/041-EWJ-1860-05-01-001-SINGLE/Pg00...,793,2818,1,148,24,1860-05-01,ewj,English Woman’s Journal (1858-1864)
76257,95935,1242,49,periodicals/041-EWJ-1863-04-01-001-SINGLE/Pg04...,837,2865,3,121,24,1863-04-01,ewj,English Woman’s Journal (1858-1864)
74645,94098,1246,15,periodicals/041-EWJ-1861-04-01-001-SINGLE/Pg01...,806,2840,1,87,24,1861-04-01,ewj,English Woman’s Journal (1858-1864)


In [None]:

# Load every chunk file, keeping only rows suitable for the test set:
# 100-3000 tokens, under 30% symbol tokens, and article_type_id other than 3.
#
# Files are read in sorted name order. os.listdir returns entries in arbitrary
# order, and the seeded sampling below selects rows by position, so an unstable
# row order would change the sampled sets between machines or runs.
df_list = []

for filename in sorted(os.listdir(directory)):

    file_path = os.path.join(directory, filename)

    df = pd.read_parquet(file_path)

    keep = (
        (df['total_tokens'] >= 100)
        & (df['total_tokens'] <= 3000)
        & (df['symbol_fract'] < 0.3)
        & (df['article_type_id'] != 3)
    )

    # Non-inplace drop: dropping in place on a filtered slice can trigger
    # SettingWithCopyWarning and makes the cell harder to re-run safely.
    df = df.loc[keep, :].drop(columns=['continuation_from_id', 'continuation_to_id'])

    df_list.append(df)

df = pd.concat(df_list, ignore_index=True)

Create the sampling function

In [None]:
def sample_data(df, num_articles = 20, num_adverts = 25, random_seed = 1842):
    """Draw a fixed number of rows per publication for article types 1 and 2.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'article_type_id' and 'publication_id'.
    num_articles : int
        Rows to draw from each publication where article_type_id == 1.
    num_adverts : int
        Rows to draw from each publication where article_type_id == 2.
    random_seed : int
        Seed for the sampler; the same seed and input give the same sample.

    Returns
    -------
    pandas.DataFrame
        The type-1 rows followed by the type-2 rows, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by pandas when a publication has fewer rows of a type than
        requested (sampling is without replacement).
    """
    # A local generator replaces np.random.seed(): it yields the same stream
    # as the seeded global generator but leaves global random state untouched.
    # One generator is shared by both draws so the second continues the stream
    # rather than restarting it.
    rng = np.random.RandomState(random_seed)

    group_keys = ['article_type_id', 'publication_id']
    rows_per_type = {1: num_articles, 2: num_adverts}

    samples = [
        df[df['article_type_id'] == type_id]
        .groupby(group_keys)
        .sample(n=n_rows, replace=False, random_state=rng)
        for type_id, n_rows in rows_per_type.items()
    ]

    # ignore_index=True already yields a clean RangeIndex, so no reset is needed.
    return pd.concat(samples, ignore_index=True)

Generate the test, dev and train datasets, excluding previously sampled articles each time so that the sets do not overlap. A fixed random seed ensures reproducibility.

In [None]:

# Test set: drawn first, from the full filtered pool.
test_data = sample_data(df, num_articles=20, num_adverts=25, random_seed=1842)

# Dev set: tiny, as I have to transcribe this myself and it is only used for
# tuning the prompt. Anything already in the test set is excluded.
dev_data = sample_data(
    df.loc[~df['id'].isin(test_data['id']), :],
    num_articles=2,
    num_adverts=3,
    random_seed=1842,
)

# Train set: used for creating the silver-label train set with the final prompt
# on gpt-4/opus. Anything already in the test or dev set is excluded.
train_data = sample_data(
    df.loc[~df['id'].isin(pd.concat([test_data['id'], dev_data['id']])), :],
    num_articles=100,
    num_adverts=125,
    random_seed=1842,
)

test_data.to_csv('./data/test_data_raw.csv')
dev_data.to_csv('./data/dev_data_raw.csv')
train_data.to_csv('./data/train_data_raw.csv')

# The full pool is no longer needed once the three sets are written.
del df

# Prepare test set for transcribing

The test set needs to be cleaned up to make it easier to use for transcribing. This includes subsetting the columns and getting the name of each file so that it can be loaded. The file names come from the database and are awkward to work with, as they do not appear to follow an obvious pattern.

In [None]:
# Lookup table pairing each PDF folder with its publication_id.
folder2id_df = pd.DataFrame(
    [
        ('English_Womans_Journal_issue_PDF_files', 24),
        ('Leader_issue_PDF_files/Leader_issue_PDF_files', 20),
        ('Monthly_Repository_issue_PDF_files', 22),
        ('Northern_Star_issue_PDF_files', 27),
        ('Publishers_Circular_issue_PDF_files', 26),
        ('Tomahawk_issue_PDF_files/Tomahawk_issue_PDF_files', 19),
    ],
    columns=['folder', 'publication_id'],
)

In [None]:
#create folders of the pdfs for the test and dev set

def _prepare_for_transcription(data, output_folder):
    """Resolve the source PDF for each sampled row and hand the rows to process_pdfs.

    Parameters
    ----------
    data : pandas.DataFrame
        Sampled rows; must contain 'publication_id' and a datetime-like
        'issue_date' column (it is formatted with strftime).
    output_folder : str
        Folder passed to process_pdfs.

    Returns
    -------
    pandas.DataFrame
        `data` joined to folder2id_df with an added 'pdf_path' column.
    """
    merged = data.merge(folder2id_df, on = 'publication_id')

    merged['pdf_path'] = merged.apply(
        lambda row: find_pdf_path(image_path, row['folder'], row['issue_date'].strftime('%Y-%m-%d')),
        axis=1,
    )

    # The original call was process_pdfs(temp, output_folder=..., 1), which is a
    # SyntaxError (positional argument after a keyword argument). All arguments
    # are now positional.
    # NOTE(review): assumes output_folder is the second positional parameter of
    # helper_functions.process_pdfs and that the trailing 1 is its third - confirm.
    process_pdfs(merged, output_folder, 1)

    return merged


temp = _prepare_for_transcription(test_data, "data/test_pdfs_for_transcription")

temp[['content_html', 'file_name', 'total_tokens', 'issue_date', 'page_number', 'pdf_path','site_address']].to_csv('./data/test_data_transcription.csv')


# `temp` is deliberately left holding the dev frame: the following cell writes
# it out as the dev transcription CSV.
temp = _prepare_for_transcription(dev_data, "data/dev_pdfs_for_transcription")

In [None]:
# Write the columns needed for transcribing the dev set.
# At this point `temp` is the dev frame built in the previous cell.
temp.loc[:, [
    'content_html',
    'file_name',
    'total_tokens',
    'issue_date',
    'page_number',
    'pdf_path',
    'site_address',
]].to_csv('./data/dev_data_transcription.csv')


In [None]:
# 80% of the total token count across the test set.
# NOTE(review): the meaning of the 0.8 factor is not stated in the notebook - confirm.
# Needs `test_data` from the sampling cell above, so run the notebook top to bottom.
0.8 * test_data['total_tokens'].sum()

NameError: name 'test_data' is not defined