## Get the trove dataset

In [1]:
import os
import requests
#from bs4 import BeautifulSoup
from helper_functions import get_tokens_symbols, files_to_df_core_func
import tiktoken
import pandas as pd
import re

from overproof_helpers import process_txt_file, save_processed_data, save_processed_data_line

overproof_base = "data/overproof"

smh_folder = os.path.join(overproof_base, 'SMH')
ca_folder = os.path.join(overproof_base, 'CA')

file_download_folder = os.path.join(overproof_base, 'downloaded_files')


# Process dataset1

Dataset 1 is not very good accoring to the overproof team, however it is useful as a comparison

In [5]:

#This downloads all the dataset1 files from the overproof website

# URL containing the .txt files
url = "https://dlp2.pdst.ie/datasets/dataset1/rawTextAndHumanCorrectionPairs/"

# Create the "overproof" folder if it doesn't exist
if not os.path.exists(overproof_base):
    os.makedirs(overproof_base)

# Send a GET request to the URL
response = requests.get(url,
                     verify = False)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find all the links to .txt files
txt_links = [link["href"] for link in soup.find_all("a") if link["href"].endswith(".txt")]

# Download each .txt file and save it in the "overproof" folder
for link in txt_links:
    file_url = url + link
    file_name = os.path.join(file_download_folder, link)
    
    # Download the file
    file_response = requests.get(file_url, verify=False)
    
    # Save the file in the "overproof" folder
    with open(file_name, "wb") as file:
        file.write(file_response.content)




In [3]:
year_files = os.listdir(os.path.join(smh_folder, 'dataset_1'))

In [12]:

dataset_1_list = []

for file in year_files:

    temp = process_txt_file(os.path.join(smh_folder,'dataset_1', file), ['raw', 'corrected'])

    dataset_1_list.append(temp)

save_processed_data(temp, os.path.join(smh_folder, 'dataset1_article_level'), ['raw', 'corrected'] )

In [16]:
overproof_raw = files_to_df_core_func(os.path.join(smh_folder, 'dataset1_article_level', 'corrected'))
enc = tiktoken.encoding_for_model("gpt-3.5-turbo") 
overproof_raw['word_count'] = overproof_raw['content'].apply(lambda x: len(str(x).split()))
overproof_raw['tokens'] = overproof_raw['content'].apply(lambda row: len(enc.encode(row)))
overproof_raw['year'] = overproof_raw['file_name'].str.extract(r'_year_(\d{4})', expand=False).astype(int)

In [20]:
overproof_raw

Unnamed: 0,file_name,content,word_count,tokens,year
0,14038989_year_1896_type_Article_title_The_Sydn...,THE MOVEMENT BY THE STRATHFIELD\nCOUNCIL.\nThe...,130,185,1896
1,14037364_year_1896_type_Article_title_The_Sydn...,NAMES OF THE MISSING.\nThe greatest difficulty...,340,545,1896
2,14070806_year_1896_type_Article_title_The_Sydn...,BANKSTOWN.\nThe fortnightly meeting of this co...,233,345,1896
3,14058158_year_1896_type_Article_title_The_Sydn...,"THE HANNAH NICHOLSON IN DISTRESS.\nThe barque,...",495,663,1896
4,14047181_year_1896_type_Article_title_The_Sydn...,ARRANGEMENTS FOR THE FUNERAL.\nSERVICE TO BE H...,229,366,1896
...,...,...,...,...,...
542,14067427_year_1896_type_Article_title_The_Sydn...,PASTORAL INTELLIGENCE.\n(FROM OUR CORRESPONDEN...,158,341,1896
543,14060983_year_1896_type_Article_title_The_Sydn...,AQUATICS.\nTHE LATE MR. E. M. DIETRICH.\nThe d...,149,225,1896
544,14064770_year_1896_type_Article_title_The_Sydn...,"BREACH OF PROMISE CASE.\nMELBOURNE, Tuesday.\n...",234,341,1896
545,14055213_year_1896_type_Article_title_The_Sydn...,WHAT MR. RUSSELL SAYS.\nSOME PARTICULARS OF TH...,320,492,1896


# Process Dataset 2

This assumes you have manually downloaded the dataet2, it is accidently hidden and can be found using the index of the website

http://overproof.projectcomputing.com/datasets/

The dataset can be downloaded from this url

http://overproof.projectcomputing.com/datasets/dataset2/rawTextAndHumanCorrectionAndOverproofCorrectionTriples/allArticles.txt

In [5]:


temp = process_txt_file(os.path.join(smh_folder, 'allArticles.txt'), ['raw', 'corrected', 'overproof'])

save_processed_data(temp, os.path.join(smh_folder, 'article_level2'), ['raw', 'corrected', 'overproof'] )

save_processed_data_line(temp, os.path.join(smh_folder, 'line_level2'))

In [12]:
temp = process_txt_file(os.path.join(ca_folder, 'allArticles.txt'), ['raw', 'corrected', 'overproof'])


# The meta data has a url in it which is removed here
url_pattern = re.compile(r'\s+https?://\S+')

# Update the 'metadata' value in each dictionary
temp = [
    {**d, 'metadata': url_pattern.sub('', d['metadata']).strip()}
    for d in temp 
]


save_processed_data(temp, os.path.join(ca_folder, 'article_level'), ['raw', 'corrected', 'overproof'] )

In [10]:
temp[0]['metadata']

'1 year 1871 type Article title THE CARIRO DAILY (ILLINOIS) http://chroniclingamerica.loc.gov/data/batches/batch_iune_kilo_ver01/data/sn88074142/00280761485/1871062101/0216.pdf'

In [4]:
overproof_raw = files_to_df_core_func(os.path.join(overproof_base, 'corrected'))
enc = tiktoken.encoding_for_model("gpt-3.5-turbo") 
overproof_raw['word_count'] = overproof_raw['content'].apply(lambda x: len(str(x).split()))
overproof_raw['tokens'] = overproof_raw['content'].apply(lambda row: len(enc.encode(row)))
overproof_raw['year'] = overproof_raw['file_name'].str.extract(r'_year_(\d{4})', expand=False).astype(int)


In [5]:
year_counts = overproof_raw.drop(columns=['content', 'file_name']).groupby('year').sum().reset_index()
year_counts

Unnamed: 0,year,word_count,tokens
0,1843,341,643
1,1845,445,894
2,1850,471,636
3,1851,429,877
4,1852,211,298
...,...,...,...
72,1949,282,416
73,1950,1546,2225
74,1951,1153,1826
75,1952,213,338


In [76]:
year_counts.sum()

year          146774
word_count     52640
tokens         80992
dtype: int64

In [77]:
overproof_raw

Unnamed: 0,file_name,content,word_count,tokens,year
0,16856000_year_1932_type_Article_title_The_Sydn...,FOOTBALL.\nRUGBY UNION.\nPremiership Opening.\...,502,727,1932
1,14219000_year_1899_type_Article_title_The_Sydn...,"ENTHUSIASTIC MEETING AT IPSWICH.\nBRISBANE, Th...",86,138,1899
2,15344000_year_1912_type_Article_title_The_Sydn...,PUBLIC LIBRARIAN.\n-»-\nOFFER TO MR. IFOULD.\n...,346,487,1912
3,28366000_year_1885_type_Article_title_The_Sydn...,"MR. D. RYRIE, M.L.A., AT BOMBALA.\n[BY TELEGRA...",278,387,1885
4,14656000_year_1904_type_Article_title_The_Sydn...,A GOLD-BEARING REEF.\nDISCOVERED AT EUCHAREENA...,281,406,1904
...,...,...,...,...,...
154,27945000_year_1941_type_Article_title_The_Sydn...,EARTHQUAKE NEAR\nALICE SPRINGS.\nRECORDED IN S...,160,231,1941
155,16443000_year_1928_type_Article_title_The_Sydn...,MR. ALEXANDER VEITCH.\nMany of the bowlers and...,127,184,1928
156,18235000_year_1951_type_Article_title_The_Sydn...,"Magistrate\nBars\nMr. Evatt\n--\nMr Beavers, S...",430,697,1951
157,12931000_year_1851_type_Article_title_The_Sydn...,SHIPPING INTELLIGENCE.\nARRIVALS.\nOCTOBER 4.-...,156,314,1851


In [38]:

def split_content_into_lines(df):
    # Split the 'content' column into individual lines
    lines = df.apply(lambda row: [(row['file_name'], line) for line in row['content'].split('\n')], axis=1)

    # Create a new DataFrame with the file name, line number, and content
    line_df = pd.DataFrame({
        'file_name': [file_name for sublist in lines for file_name, _ in sublist],
        'line_number': [i for sublist in lines for i, _ in enumerate(sublist, start=1)],
        'content': [line for sublist in lines for _, line in sublist]
    })

    return line_df

In [39]:
split_content_into_lines(overproof_raw.loc[0:10,:])

Unnamed: 0,file_name,line_number,content
0,15196076_year_1910_type_Article_title_The_Sydn...,1,DIVORCE COURT.
1,15196076_year_1910_type_Article_title_The_Sydn...,2,(Before Mr. Justice Gordon.)
2,15196076_year_1910_type_Article_title_The_Sydn...,3,DECREE NISI.
3,15196076_year_1910_type_Article_title_The_Sydn...,4,Matthews v Matthews.
4,15196076_year_1910_type_Article_title_The_Sydn...,5,"Mr. Perry, instructed by Mr. F. W McCarthy,"
...,...,...,...
537,17608047_year_1939_type_Article_title_The_Sydn...,30,"The German cargo steamer Erlangen, which"
538,17608047_year_1939_type_Article_title_The_Sydn...,31,"was in port at Dunedin, New Zealand, for"
539,17608047_year_1939_type_Article_title_The_Sydn...,32,"part of the week-end, put to sea yesterday,"
540,17608047_year_1939_type_Article_title_The_Sydn...,33,sailing for Port Kembla.


# One offs and clean up type stuff

In [33]:
import os
import shutil
#This code is to move all the raw ocr text files to a new folder when I have matching transcriptions. 
#This means I can then run the API calls on data I actually have transcribed. This will mean I can compare faster
# and don't correct text for which I never get a transcription

# Specify the paths of the folders
folder_a = "data/transcription_raw_ocr"
folder_b = "data/transcription_returned_ocr/transcription_files"
folder_c = "data/raw_test_ocr_text"

# Get the list of files in folder A
files_in_a = os.listdir(folder_a)

# Get the list of folders in folder B
files_in_b = os.listdir(folder_b) 

# Iterate over the files in folder A
for file_name in files_in_a:
    # Check if the file name matches a folder name in folder B
    if file_name in files_in_b:
        # Construct the source and destination file paths
        source_file = os.path.join(folder_a, file_name)
        destination_file = os.path.join(folder_c, file_name)
        
        # Copy the file from folder A to folder C
        shutil.copy2(source_file, destination_file)
        print(f"Copied {file_name} from folder A to folder C.")

print("File copying completed.")

Copied slug_ar04900_periodical_ewj_issue_ewj_01051860_page_number_49.txt from folder A to folder C.
Copied slug_ar00200_periodical_l_issue_cld_30071853_page_number_2.txt from folder A to folder C.
Copied slug_ar01400_periodical_ns_issue_ns2_02101852_page_number_14.txt from folder A to folder C.
Copied slug_ar00703_periodical_l_issue_vm2-ncseproduct2102_page_number_7.txt from folder A to folder C.
Copied slug_ar00702_periodical_l_issue_vm2-ncseproduct2102_page_number_7.txt from folder A to folder C.
Copied slug_ar05401_periodical_mruc_issue_vm2-ncseproduct2380_page_number_54.txt from folder A to folder C.
Copied slug_ar00201_periodical_l_issue_cld_30071853_page_number_2.txt from folder A to folder C.
Copied slug_ar00601_periodical_l_issue_vm2-ncseproduct1932_page_number_6.txt from folder A to folder C.
Copied slug_ar00108_periodical_t_issue_ttw_14091867_page_number_1.txt from folder A to folder C.
Copied slug_ar01700_periodical_ewj_issue_ewj_01031859_page_number_17.txt from folder A to 

In [3]:
files_in_a 

['slug_ar01400_periodical_ns_issue_ns2_02101852_page_number_14.txt',
 'slug_ar05401_periodical_mruc_issue_vm2-ncseproduct2380_page_number_54.txt',
 'slug_ar00601_periodical_l_issue_vm2-ncseproduct1932_page_number_6.txt',
 'slug_ar00108_periodical_t_issue_ttw_14091867_page_number_1.txt',
 'slug_ar00104_periodical_t_issue_ttw_14091867_page_number_1.txt',
 'slug_ar00109_periodical_t_issue_ttw_14091867_page_number_1.txt',
 'slug_ad04401_periodical_pc_issue_tec_17011880_page_number_44.txt',
 'slug_ar05500_periodical_ewj_issue_ewj_01061858_page_number_55.txt',
 'slug_ar00106_periodical_t_issue_ttw_14091867_page_number_1.txt',
 'slug_ar00600_periodical_l_issue_vm2-ncseproduct1932_page_number_6.txt',
 'slug_ar05403_periodical_mruc_issue_vm2-ncseproduct2380_page_number_54.txt',
 'slug_ar05402_periodical_mruc_issue_vm2-ncseproduct2380_page_number_54.txt',
 'slug_ar01403_periodical_ns_issue_ns2_02101852_page_number_14.txt',
 'slug_ar01402_periodical_ns_issue_ns2_02101852_page_number_14.txt',
 'sl