# Create test set

In the file `explore_data.ipynb`, I found that a token cutoff of 3000 captures over 95% of all tokens and articles. I also found that a cutoff of 30% symbol tokens captures about 90% of tokens and articles. In order to have a test set that is neither too onerous to produce nor of too poor quality to recover, I am going to combine both of these metrics.

Steps
- Load all articles, keeping only the token count, the page and the symbol count
- Sum to page level
- Calculate the fraction of symbol tokens per page
- Review the distribution
- Choose a minimum token count and a maximum symbol fraction
- Subset the pages
- Sample

In [1]:
import pandas as pd
import tiktoken
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time  # Make sure to import the time module
import shutil
import PyPDF2

from helper_functions import identify_file, find_pdf_path, extract_pages_from_pdf, process_pdfs, stratified_target_sampling


from dotenv import load_dotenv, find_dotenv
load_dotenv()

# Root of the locally stored NCSE image/PDF archive, taken from the
# `image_path` environment variable (None when the variable is not set).
image_path = os.environ.get("image_path")

# Folder holding the parquet text chunks that are aggregated to page level.
directory = './data/ncse_text_chunks'

# Development-set folders: transcripts and the per-model results.
dev_transcripts = 'data/dev_data_transcript'

dev_gpt3_results = 'data/dev_data_gpt-3.5-turbo'
dev_gpt4_results = 'data/dev_data_gpt-4-turbo-preview'



Map each publication ID to the folder that holds that publication's archived issue PDFs in the NCSE figshare download.

In [2]:
# (archive folder, publication_id) pairs; some archives unpack into a nested
# folder of the same name, hence the repeated path component.
folder_publication_pairs = [
    ('English_Womans_Journal_issue_PDF_files', 24),
    ('Leader_issue_PDF_files/Leader_issue_PDF_files', 20),
    ('Monthly_Repository_issue_PDF_files', 22),
    ('Northern_Star_issue_PDF_files', 27),
    ('Publishers_Circular_issue_PDF_files', 26),
    ('Tomahawk_issue_PDF_files/Tomahawk_issue_PDF_files', 19),
]

folder2id_df = pd.DataFrame(folder_publication_pairs, columns=['folder', 'publication_id'])

In [3]:
# Raw metadata tables: pages, issues and publications.
page_list = pd.read_parquet('data/periodicals_page.parquet')
periodicals_issue = pd.read_parquet('data/periodicals_issue.parquet')
periodicals_publication = pd.read_parquet('data/periodicals_publication.parquet')

# Rename the generic `number` / `id` columns so the join keys are unambiguous.
pages_renamed = page_list.rename(columns={'number': 'page_number'})
issue_lookup = (
    periodicals_issue[['id', 'publication_id', 'issue_date']]
    .rename(columns={'id': 'issue_id'})
)
publication_lookup = (
    periodicals_publication[['id', 'slug', 'title']]
    .rename(columns={'id': 'publication_id'})
)

# page -> issue -> publication, both as default (inner) joins.
pages_with_issue = pd.merge(pages_renamed, issue_lookup, on='issue_id')
page_info = pd.merge(pages_with_issue, publication_lookup, on='publication_id')


In [4]:

# Build the page-level table `df`:
#   1. read every text-chunk file in `directory`, keeping only the count columns,
#   2. sum the counts per (publication, issue, page),
#   3. compute the fraction of symbol tokens on each page,
#   4. attach the archive folder, the page/issue/publication metadata and the
#      path of the issue PDF.

# Only these columns are needed; asking parquet for them directly avoids
# loading the remaining columns of every chunk into memory.
chunk_columns = ['publication_id', 'page_number', 'total_tokens', 'symbol_count', 'issue_id']

df_list = []

# sorted() gives a stable load order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(directory)):

    file_path = os.path.join(directory, filename)

    # Skip hidden entries and sub-directories (e.g. `.ipynb_checkpoints`,
    # `.DS_Store`) that are not chunk files and would make read_parquet fail.
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue

    # .loc pins the column order regardless of the parquet engine in use.
    chunk = pd.read_parquet(file_path, columns=chunk_columns).loc[:, chunk_columns]

    df_list.append(chunk)

if not df_list:
    # pd.concat would raise a ValueError here anyway; say what is actually wrong.
    raise ValueError(f"No chunk files found in {directory!r}")

df = pd.concat(df_list, ignore_index=True)

# Chunk rows -> one row per page, summing token and symbol counts.
df = df.groupby(['publication_id', 'issue_id', 'page_number']).sum().reset_index()

# Pages with zero tokens get NaN (never inf), so they cannot distort the
# per-publication quantiles computed later.
df['symbol_fract'] = df['symbol_count']/df['total_tokens'].where(df['total_tokens'] > 0)


# Default (inner) merges: pages without a folder mapping or without metadata are dropped.
df = df.merge(folder2id_df, on = 'publication_id').merge(page_info, on = ['issue_id', 'page_number', 'publication_id'])

# Locate the issue PDF from the archive root, the publication folder and the issue date.
# NOTE(review): presumably find_pdf_path returns a null value when no PDF is found,
# since later cells filter on pdf_path being non-null; confirm in helper_functions.
df['pdf_path'] = df.apply(lambda row: find_pdf_path(image_path, row['folder'],  row['issue_date'].strftime('%Y-%m-%d')), axis=1)

In [7]:
# 10th and 90th percentiles per publication. They differ widely between
# publications, which reflects the differences in style.
percentile_levels = [0.1, 0.9]
page_metrics_by_publication = df.groupby('publication_id')[['total_tokens', 'symbol_fract']]
page_metrics_by_publication.quantile(percentile_levels)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_tokens,symbol_fract
publication_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19,0.1,0.0,0.135103
19,0.9,2905.5,0.451432
20,0.1,2590.0,0.128776
20,0.9,4814.0,0.247116
22,0.1,628.0,0.116314
22,0.9,1302.0,0.250726
24,0.1,615.0,0.103989
24,0.9,906.0,0.175439
26,0.1,563.0,0.236244
26,0.9,3804.9,0.547052


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the page-level
# token count and symbol fraction, one row per publication.
summary_columns = ['total_tokens', 'symbol_fract']
df.groupby('publication_id')[summary_columns].describe()

Unnamed: 0_level_0,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,symbol_fract,symbol_fract,symbol_fract,symbol_fract,symbol_fract,symbol_fract,symbol_fract,symbol_fract
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
publication_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
19,1896.0,1621.028481,1134.921656,0.0,457.75,1838.5,2320.25,7086.0,1652.0,0.263881,0.17928,0.0,0.162619,0.212217,0.298664,2.657534
20,24448.0,3618.727544,883.722034,33.0,3065.75,3513.0,3990.0,13202.0,24448.0,0.175944,0.053174,0.094188,0.140544,0.158967,0.198812,0.630442
22,26743.0,952.304865,312.110114,0.0,726.0,935.0,1094.0,3259.0,26727.0,0.17281,0.06773,0.0,0.13204,0.154784,0.190946,1.802555
24,5663.0,730.57655,134.483417,0.0,670.0,715.0,773.0,1866.0,5661.0,0.139435,0.045759,0.069291,0.115987,0.131054,0.149864,0.598712
26,20822.0,2001.118913,1368.793948,0.0,948.0,1593.0,2925.75,9443.0,20812.0,0.371602,0.148802,0.0,0.275452,0.331159,0.430598,2.313433
27,17843.0,14113.945861,3315.907047,0.0,12989.0,14656.0,15783.0,44971.0,17838.0,0.186973,0.040478,0.107606,0.161502,0.178249,0.201454,0.672438


In [15]:
# Selection thresholds and sampling settings used in this cell.
min_page_tokens = 500          # pages must have strictly more tokens than this
symbol_fract_quantile = 0.9    # per-publication upper limit on symbol_fract
sampling_target_value = 20000  # passed to stratified_target_sampling as target_value

df2 = df.copy()

# For every row, the symbol_fract quantile of that row's own publication.
percentile_90 = df2.groupby('publication_id')['symbol_fract'].transform(
    lambda fracts: fracts.quantile(symbol_fract_quantile)
)

# Keep pages that are long enough, not too symbol-heavy for their publication,
# and for which a PDF path is available.
has_enough_tokens = df2['total_tokens'].gt(min_page_tokens)
within_symbol_limit = df2['symbol_fract'].le(percentile_90)
has_pdf = df2['pdf_path'].notna()
df2 = df2[has_enough_tokens & within_symbol_limit & has_pdf].reset_index(drop=True)

# Fixed seed so the same pages are drawn on every run.
np.random.seed(1842)
pages_for_transcription_1 = stratified_target_sampling(
    df2,
    group_col='publication_id',
    value_col='total_tokens',
    target_value=sampling_target_value,
)
pages_for_transcription_1

Unnamed: 0,publication_id,issue_id,page_number,total_tokens,symbol_count,symbol_fract,folder,id,height,image,width,article_count,label,issue_date,slug,title,pdf_path
357,19,4454,1,1310,182,0.138931,Tomahawk_issue_PDF_files/Tomahawk_issue_PDF_files,159322,1783,periodicals/041-TTW-1869-01-30-PG001-SINGLE/Pg...,1527,4,unpag,1869-01-30,t,Tomahawk (1867-1870),/media/jonno/ncse/Tomahawk_issue_PDF_files/Tom...
341,19,4451,9,2594,517,0.199306,Tomahawk_issue_PDF_files/Tomahawk_issue_PDF_files,159298,2069,periodicals/041-TTW-1869-12-18-PG001-SINGLE_Re...,1854,4,281,1869-12-18,t,Tomahawk (1867-1870),/media/jonno/ncse/Tomahawk_issue_PDF_files/Tom...
907,19,4527,11,2325,327,0.140645,Tomahawk_issue_PDF_files/Tomahawk_issue_PDF_files,160258,1352,periodicals/041-TTW-1867-12-21-PG001-SINGLE/Pg...,1159,4,347,1867-12-21,t,Tomahawk (1867-1870),/media/jonno/ncse/Tomahawk_issue_PDF_files/Tom...
186,19,4432,12,1591,195,0.122564,Tomahawk_issue_PDF_files/Tomahawk_issue_PDF_files,159049,1379,periodicals/041-TTW-1869-07-24-PG001-SINGLE/Pg...,1104,16,42,1869-07-24,t,Tomahawk (1867-1870),/media/jonno/ncse/Tomahawk_issue_PDF_files/Tom...
40,19,4415,12,2369,452,0.190798,Tomahawk_issue_PDF_files/Tomahawk_issue_PDF_files,158831,2044,periodicals/041-TTW-1869-02-13-PG001-SINGLE_Re...,1724,8,76,1869-02-13,t,Tomahawk (1867-1870),/media/jonno/ncse/Tomahawk_issue_PDF_files/Tom...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59996,26,4274,90,4477,2041,0.455886,Publishers_Circular_issue_PDF_files,147555,1428,periodicals/099-TEC-1889-11-01-001-SINGLE/Pg09...,1003,1,1466,1889-11-01,pc,Publishers’ Circular (1880-1890),/media/jonno/ncse/Publishers_Circular_issue_PD...
68708,26,4408,24,4610,2270,0.492408,Publishers_Circular_issue_PDF_files,158079,1542,periodicals/041-TTEC-1890-12-31-001-SINGLE/Pg0...,971,0,22,1890-12-31,pc,Publishers’ Circular (1880-1890),/media/jonno/ncse/Publishers_Circular_issue_PD...
55857,26,4207,52,1858,509,0.273950,Publishers_Circular_issue_PDF_files,142603,1435,periodicals/099-TEC-1888-08-15-001-SINGLE/Pg05...,983,1,954,1888-08-15,pc,Publishers’ Circular (1880-1890),/media/jonno/ncse/Publishers_Circular_issue_PD...
59007,26,4260,8,2525,807,0.319604,Publishers_Circular_issue_PDF_files,146451,1482,periodicals/099-TEC-1889-05-01-001-SINGLE/Pg00...,1011,3,462,1889-05-01,pc,Publishers’ Circular (1880-1890),/media/jonno/ncse/Publishers_Circular_issue_PD...


In [17]:
# Store the bare PDF file name (pdf_path without its directories) as `file_name`,
# then pass the sampled pages to process_pdfs with the output folder.
pdf_file_names = pages_for_transcription_1['pdf_path'].map(os.path.basename)
pages_for_transcription_1['file_name'] = pdf_file_names
process_pdfs(pages_for_transcription_1, output_folder="data/pdfs_for_transcription")