In [None]:
pip install ipython-sql pandas duckdb jupysql pandas matplotlib duckdb-engine --quiet

In [None]:
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
import duckdb
import pandas as pd

%load_ext sql
conn = duckdb.connect()
%sql conn --alias duckdb

In [None]:
# Paths to the two metadata text inputs (block correctness flags and the
# Transkribus document/page listing).
# NOTE(review): the %%sql cells visible in this notebook hardcode their own
# relative paths instead of reading these variables -- confirm which paths
# are the authoritative ones.
metafile: str = "src/datasets/bronze/metadata/block_correct_ind.txt"
transkribus_page_nbr: str = "src/datasets/bronze/metadata/transkribus_input.txt"

In [None]:
%%sql
-- Rebuild the Metadata table from the header-less metadata text file whose
-- fields are separated by a semicolon character.
-- The table is dropped first so that this cell can be re-run without failing
-- on CREATE TABLE, in the same way as the other load cells in this notebook.
-- Correct_Ind is read as CHAR(3) to agree with the table definition, since
-- later queries compare it against the three-character value N/A.
DROP TABLE IF EXISTS Metadata;
CREATE TABLE Metadata (Newspaper VARCHAR, Block VARCHAR, Correct_Ind CHAR(3), Line_Amt INTEGER);
INSERT INTO Metadata
SELECT *
FROM read_csv('../../datasets/bronze/metadata/block_correct_ind.txt',
    delim = ';',
    header = false,
    columns = {
        'Newspaper': 'VARCHAR',
        'Block': 'VARCHAR',
        'Correct_Ind': 'CHAR(3)',
        'Line_Amt': 'INTEGER'
    });


In [None]:
%%sql
-- Recreate Transkribus_Page_Nbr from the space-delimited, header-less
-- Transkribus input file. Each input row supplies, in this order, a
-- Doc_Nbr, a Page_Nbr and a Newspaper value.
DROP TABLE IF EXISTS Transkribus_Page_Nbr;

CREATE TABLE Transkribus_Page_Nbr (
    Doc_Nbr   INTEGER,
    Page_Nbr  INTEGER,
    Newspaper VARCHAR
);

INSERT INTO Transkribus_Page_Nbr
SELECT Doc_Nbr, Page_Nbr, Newspaper
FROM read_csv(
    '../../transkribus/transkribus_input.txt',
    delim = ' ',
    header = false,
    columns = {'Doc_Nbr': 'INTEGER', 'Page_Nbr': 'INTEGER', 'Newspaper': 'VARCHAR'}
);

In [None]:
%%sql
-- Build State_of_Newspaper with one row per (Newspaper, Page_Nbr, Doc_Nbr) group.
-- Metadata rows are matched to Transkribus pages on Newspaper alone and then
-- aggregated per group. Only groups with at least one block flagged Y and
-- more than 10 lines in total are kept.
-- Note that the stored Correct_Percent column receives the line-weighted
-- Correct_Line_Percent (lines in blocks flagged Y over all lines) cast to
-- INTEGER, not the block-count Correct_Percent computed inside the subquery.
-- A commented-out filter on Correct_Line_Percent >= 50 was previously sketched
-- after the subquery and is not applied.
DROP TABLE IF EXISTS State_of_Newspaper;

CREATE TABLE State_of_Newspaper (
    Newspaper       VARCHAR,
    Page_Nbr        INTEGER,
    Doc_Nbr         INTEGER,
    Full_Line_Amt   INTEGER,
    Correct_Percent INTEGER
);

INSERT INTO State_of_Newspaper (Newspaper, Page_Nbr, Doc_Nbr, Full_Line_Amt, Correct_Percent)
SELECT
    page_stats.Newspaper,
    page_stats.Page_Nbr,
    page_stats.Doc_Nbr,
    page_stats.Full_Line_Amt,
    CAST(page_stats.Correct_Line_Percent AS INTEGER) AS Correct_Line_Percent
FROM (
    SELECT
        meta.Newspaper,
        pages.Page_Nbr,
        pages.Doc_Nbr,
        -- block counts per flag value
        SUM(CASE WHEN meta.Correct_Ind = 'Y' THEN 1 ELSE 0 END) AS Correct_Count,
        SUM(CASE WHEN meta.Correct_Ind = 'N/A' THEN 1 ELSE 0 END) AS NA_Value,
        COUNT(*) AS Total_Count,
        -- share of blocks whose flag differs from N, used only for the ordering below
        CAST(SUM(CASE WHEN meta.Correct_Ind <> 'N' THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS INTEGER) AS Correct_Percent,
        -- line totals, with the percentage built from the two aliases just defined
        SUM(CASE WHEN meta.Correct_Ind = 'Y' THEN meta.Line_Amt ELSE 0 END) AS Correct_Line_Amt,
        SUM(meta.Line_Amt) AS Full_Line_Amt,
        Correct_Line_Amt * 100.0 / Full_Line_Amt AS Correct_Line_Percent
    FROM Metadata AS meta
    INNER JOIN Transkribus_Page_Nbr AS pages
        ON meta.Newspaper = pages.Newspaper
    GROUP BY meta.Newspaper, pages.Page_Nbr, pages.Doc_Nbr
    HAVING Correct_Count > 0
       AND Full_Line_Amt > 10
    ORDER BY Correct_Percent DESC
) AS page_stats;

In [None]:
%%sql
-- Show every State_of_Newspaper row, highest Correct_Percent first, with ties
-- broken by the larger Full_Line_Amt.
SELECT
    Newspaper,
    Page_Nbr,
    Doc_Nbr,
    Full_Line_Amt,
    Correct_Percent
FROM State_of_Newspaper
ORDER BY
    Correct_Percent DESC,
    Full_Line_Amt DESC

In [None]:
%%sql
-- Export the three tables built in this notebook as Parquet files under ../../../duckdb/
COPY State_of_Newspaper TO '../../../duckdb/state.parquet' (FORMAT parquet);
COPY Metadata TO '../../../duckdb/block_quality.parquet' (FORMAT parquet);
COPY Transkribus_Page_Nbr TO '../../../duckdb/transkribus_metadata.parquet' (FORMAT parquet);