In [1]:
import pandas as pd

from helpers import sql

# pandas formatting
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', 200)

# import tables
documents = sql("SELECT * FROM csas2_document")
documenttype = sql("SELECT * FROM csas2_documenttype")
tracking = sql("SELECT * FROM csas2_documenttracking")
meetings = sql("SELECT * FROM csas2_meeting")


In [2]:
# unpublished docs with meetings <= Dec 2020

# Jul #s (approx, backup of db from aug 7)
unpub_2020_aug8 = sql(
    query="""
        SELECT
            csas2_document.id AS document_id,
            csas2_document.status,
            csas2_document.lead_office_id AS region,
            csas2_document.created_at AS document_created,
            csas2_meeting.id AS meeting_id,
            csas2_process.id AS process_id,
            csas2_meeting.start_date AS meeting_date,
            csas2_documenttype.name AS doc_type
        FROM csas2_document
            LEFT JOIN csas2_documenttype ON csas2_document.document_type_id = csas2_documenttype.id
            LEFT JOIN csas2_process ON csas2_document.process_id = csas2_process.id
            LEFT JOIN csas2_meeting ON csas2_process.id = csas2_meeting.process_id
        WHERE DATE(csas2_meeting.start_date) < '2021-01-01'
            AND csas2_document.status NOT IN (12, 17, 99);
    """, 
    database='dmapps_240807'
)  
unpub_2020_aug8 = unpub_2020_aug8[unpub_2020_aug8['doc_type'] != "Document for translation only (e.g., meeting minutes, terms of reference)"]

# most recent db #s
unpub_2020 = sql("""
    SELECT
        csas2_document.id AS document_id,
        csas2_document.status,
        csas2_document.lead_office_id AS region,
        csas2_document.created_at AS document_created,
        csas2_meeting.id AS meeting_id,
        csas2_process.id AS process_id,
        csas2_meeting.start_date AS meeting_date,
        csas2_documenttype.name AS doc_type
    FROM csas2_document
        LEFT JOIN csas2_documenttype ON csas2_document.document_type_id = csas2_documenttype.id
        LEFT JOIN csas2_process ON csas2_document.process_id = csas2_process.id
        LEFT JOIN csas2_meeting ON csas2_process.id = csas2_meeting.process_id
    WHERE DATE(csas2_meeting.start_date) < '2021-01-01'
        AND csas2_document.status NOT IN (12, 17, 99);
""")  
unpub_2020 = unpub_2020[unpub_2020['doc_type'] != "Document for translation only (e.g., meeting minutes, terms of reference)"]


In [3]:
col_order = ['Proceedings', 'Research Document', 'Science Advisory Report']

print("Outstanding Publication by Type (Meetings December 2020 and Before)")

df = pd.concat(
    [
        pd.DataFrame(unpub_2020_aug8.drop_duplicates(subset='document_id', keep='first')['doc_type'].value_counts()[col_order]),
        pd.DataFrame(unpub_2020.drop_duplicates(subset='document_id', keep='first')['doc_type'].value_counts()[col_order])
    ],
    axis=1
)
df.columns = ['Aug 2024', 'Dec 2024']
display(df)

print("excluding status = 0")

df = pd.concat(
    [
        pd.DataFrame(unpub_2020_aug8[unpub_2020_aug8.status != 0].drop_duplicates(subset='document_id', keep='first')['doc_type'].value_counts()[col_order]),
        pd.DataFrame(unpub_2020[unpub_2020.status != 0].drop_duplicates(subset='document_id', keep='first')['doc_type'].value_counts()[col_order])
    ],
    axis=1
)
df.columns = ['Aug 2024', 'Dec 2024']
display(df)


Outstanding Publication by Type (Meetings December 2020 and Before)


Unnamed: 0_level_0,Aug 2024,Dec 2024
doc_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Proceedings,28,17
Research Document,63,54
Science Advisory Report,17,15


excluding status = 0


Unnamed: 0_level_0,Aug 2024,Dec 2024
doc_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Proceedings,28,17
Research Document,63,54
Science Advisory Report,17,15


# what is different aug to present?

In [4]:
different = set(unpub_2020_aug8.document_id) - set(unpub_2020.document_id)
different
documents[documents.id.isin(different)]

# all look good, either 99 or 12 (or in Prod 12)

Unnamed: 0,id,created_at,updated_at,title_en,title_fr,title_iku,pub_number,pages_en,status,old_id,created_by_id,process_id,updated_by_id,url_en,url_fr,dev_link_en,dev_link_fr,ekme_gcdocs_en,ekme_gcdocs_fr,cat_number_en,cat_number_fr,document_type_id,translation_status,pub_number_request_date,due_date,is_confirmed,lead_office_id,pages_fr,pdf_size_kb_en,pdf_size_kb_fr,cat_number_iku,library_link_en,library_link_fr,library_link_iku,pdf_size_kb_iku,isbn_en,isbn_fr,isbn_iku,pages_iku,ekme_gcdocs_iku,pub_number_assigned_date,posting_notification_sent_date,has_data_links,has_third_language,third_language,urgency_notes,urgent,media_attention,sharepoint_archive_en,sharepoint_archive_fr
276,305,2023-05-08 12:46:07.166167,2024-10-25 21:35:50.866006,Assessment of Northern Shrimp (Pandalus borealis) and Striped Shrimp (Pandalus montagui) in the NSRF-DFO Assessment Zone (SFA 2 and 3),,,,,99,6111.0,1142,459,,,,,,,,,,3,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
287,329,2023-05-08 12:46:32.867977,2024-09-13 16:21:28.075329,Optimal Strategy for Invasive Species Control to Ensure Survival and Recovery of Atlantic Whitefish in the Petite Rivière Lakes,Stratégie optimale de contrôle des espèces envahissantes en vue d’assurer la survie et le rétablissement du corégone de l’Atlantique dans les lacs de la Petite Rivière,,2024/045,21.0,12,9394.0,1142,464,2359.0,https://wwwstg.ncr.dfo-mpo.ca/csas-sccs/Publications/SAR-AS/2024/2024_045-eng.html,https://wwwstg.ncr.dfo-mpo.ca/csas-sccs/Publications/SAR-AS/2024/2024_045-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/SAR-AS/2024/2024_045-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/SAR-AS/2024/2024_045-fra.html,,,Fs70-6/2024-045E-PDF,Fs70-6/2024-045F-PDF,2,2,NaT,NaT,1,2.0,23.0,1166.0,966.0,,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41257352.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41257406.pdf,,,978-0-660-72991-6,978-0-660-72992-3,,,,2024-07-29 03:00:00,2024-08-30 18:22:06.545017,,,Inuktitut,,,,,
318,376,2023-05-08 12:47:19.043137,2024-09-13 16:13:12.896925,"Proceedings of the Regional Peer Review of Existing Data, Protocols, and Procedures for the Gully Marine Protected Area Ecosystem Monitoring Plan; September 25-26, 2012","Compte rendu du l’examen régional par les pairs des données existantes, des procédures et des protocoles du plan de surveillance de l’écosystème de la zone de protection marine du Gully ; du 25 au...",,2024/033,43.0,12,7791.0,1142,473,2359.0,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_033-eng.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_033-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/Pro-Cr/2024/2024_033-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/Pro-Cr/2024/2024_033-fra.html,,,Fs70-4/2024-033E-PDF,Fs70-4/2024-033F-PDF,4,2,NaT,NaT,1,2.0,48.0,489.0,554.0,,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41256244.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41256256.pdf,,,978-0-660-72430-0,978-0-660-72431-7,,,,2024-06-17 03:00:00,2024-08-14 17:15:54.917289,,,Inuktitut,,,,,
388,446,2023-05-08 12:48:23.029711,2024-10-25 21:38:23.644785,"Evaluation of Capture Efficiency and Mesh-Sized Gillnet Selectivity for Important Fishes in Great Slave Lake, Northwest Territories, Canada",,,,,99,8114.0,1142,481,,,,,,,,,,3,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
474,553,2023-05-08 12:50:09.363999,2024-10-25 21:40:10.741503,"Biological characteristics of the up-(1992) and down (1993) –stream run of anadromous Arctic charr at the Kuujjua River, Northwest Territories, Canada",,,,,99,9000.0,1142,503,,,,,,,,,,3,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,856,2023-05-08 12:54:46.482712,2024-09-13 16:18:51.051979,Updated Information on Atlantic Salmon (Salmo salar) Eastern Cape Breton Populations (ECB; Salmon Fishing Area 19) of Relevance to the Development of a 2nd COSEWIC Status Report,Information actualisée sur les populations de saumon atlantique (Salmo salar) de l’est du Cap-Breton (ECB; zone de pêche du saumon 19) pertinente pour l’élaboration d’un 2e rapport de situation du...,,2024/049,74.0,12,10293.0,1142,65,2359.0,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_049-eng.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_049-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_049-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_049-fra.html,,,Fs70-5/2024-049E-PDF,Fs70-5/2024-049F-PDF,3,1,NaT,NaT,1,2.0,82.0,1580.0,1704.0,,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41256785.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41256827.pdf,,,978-0-660-72330-3,978-0-660-72331-0,,,,2024-06-11 03:00:00,2024-08-26 21:45:01.842774,,,Inuktitut,,,,,
751,857,2023-05-08 12:54:47.206922,2024-10-09 16:25:07.998623,Updated Information on Atlantic Salmon (Salmo salar) Populations in Southwest New Brunswick (Outer Portion of Salmon Fishing Area 23) of Relevance to the Development of a 2nd COSEWIC Status Report,Mise à jour de l’information sur les populations de saumon atlantique (Salmo salar) du sud-ouest du Nouveau-Brunswick (partie extérieure de la zone de pêche du saumon 23) pertinente pour l’élabora...,,2024/051,104.0,12,10294.0,1142,65,2359.0,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_051-eng.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_051-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_051-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_051-fra.html,,,Fs70-5/2024-051E-PDF,Fs70-5/2024-051F-PDF,3,0,NaT,NaT,1,2.0,111.0,3757.0,5371.0,,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41260570.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41260594.pdf,,,978-0-660-72334-1,978-0-660-72335-8,,,,2024-06-11 03:00:00,2024-09-11 21:21:04.284225,0.0,0.0,Inuktitut,,0.0,1.0,,
752,858,2023-05-08 12:54:47.968776,2024-10-09 16:24:19.814974,Updated Information on Atlantic Salmon (Salmo salar) Inner Bay of Fundy Populations (IBoF; part of Salmon Fishing Areas 22 and 23) of Relevance to the Development of a 2nd COSEWIC Status Report,Information actualisée sur la population de saumon atlantique (Salmo salar) de l’intérieur de la baie de Fundy (zones de pêche du saumon 22 et 23) pertinente pour l’élaboration d’un deuxième rappo...,,2024/057,105.0,12,10295.0,1142,65,2359.0,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_057-eng.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_057-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_057-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_057-fra.html,,,Fs70-5/2024-057E-PDF,Fs70-5/2024-057F-PDF,3,0,NaT,NaT,1,2.0,117.0,3524.0,3728.0,,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126051x.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41260545.pdf,,,978-0-660-72539-0,978-0-660-72540-6,,,,2024-06-25 03:00:00,2024-09-11 21:21:23.510301,0.0,0.0,Inuktitut,,0.0,1.0,,
1142,1279,2024-04-04 12:49:34.932905,2024-10-09 16:27:25.444094,Science Advice on a Performance Threshold for the Management Strategy Evaluation for Southwest Nova Scotia/Bay Of Fundy Atlantic Herring (Clupea harengus),Avis scientifique sur un seuil de rendement pour l’évaluation de la stratégie de gestion du hareng de l’Atlantique (Clupea harengus) du sud-ouest de la Nouvelle-Écosse et la baie de Fundy,,2024/048,24.0,12,,456,737,2359.0,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_048-eng.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_048-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/SAR-AS/2024/2024_048-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/SAR-AS/2024/2024_048-fra.html,,,Fs70-6/2024-048E-PDF,Fs70-6/2024-048F-PDF,2,3,NaT,NaT,1,2.0,26.0,840.0,1912.0,,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41260922.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41260934.pdf,,,978-0-660-73133-9,978-0-660-73134-6,,,,2024-08-12 03:00:00,2024-09-12 16:03:08.767035,0.0,0.0,Inuktitut,,0.0,1.0,,


# why doesn't this match?

In [5]:
# these should all be the same

id_report_local = [
    469, 395, 374, 478, 473, 498, 286, 287, 288, 417, 420, 418, 419, 416, 536, 549, 575, 576, 574, 291, 290, 610, 614, 609, 626, 616, 
    625, 624, 623, 619, 620, 663, 636, 635, 669, 667, 665, 668, 666, 664, 655, 643, 422, 423, 697, 698, 981, 926, 924, 925, 709, 710, 
    715, 716, 708, 712, 267, 741, 685, 686, 841, 696, 744, 263, 764, 782, 781, 747, 791, 790, 683, 726, 3, 829, 980, 978, 979, 821, 
    754, 753, 836, 695, 833, 1058, 1059, 691, 4, 1308
]
id_report_prod = [
    469, 395, 374, 478, 473, 498, 286, 287, 288, 417, 420, 418, 419, 416, 536, 549, 575, 576, 574, 291, 290, 610, 614, 609, 626, 
    616, 625, 624, 623, 619, 620, 663, 636, 635, 669, 667, 665, 668, 666, 664, 655, 643, 422, 423, 697, 698, 981, 926, 924, 925, 
    709, 710, 715, 716, 708, 712, 267, 685, 686, 841, 696, 744, 764, 782, 781, 747, 790, 683, 726, 3, 829, 980, 978, 979, 821, 
    754, 753, 836, 833, 1058, 1059, 1308
]
id_sql_local = list(set(unpub_2020.document_id))

In [6]:
[x for x in id_report_prod if x not in id_sql_local]

[3]

In [7]:
# THESE SHOULD BE THE SAME!!!

missing_from_sql = [x for x in id_report_local if x not in id_sql_local]
missing_from_sql

[3, 4]

In [8]:
# SQL has missing, but the the report does not have any missing
[x for x in id_sql_local if x not in id_report_local]

[]

In [9]:
documents[documents.id.isin(missing_from_sql)][['id', 'document_type_id']]
# just translations only docs

Unnamed: 0,id,document_type_id
2,3,8
3,4,8


In [10]:
# nothing missing from the report that is in sql (locally)
missing_from_report = [x for x in id_sql_local if x not in id_report_local]
missing_from_report

[]

In [11]:
missing_from_report_prod = [x for x in id_sql_local if x not in id_report_prod]
missing_from_report_prod

# CHECK OK: all of these have since been POSTED or WITHDRAWN

[691, 695, 741, 263, 791]

# how many incomplete but with a pub_number?

In [12]:
status = {
   0: "awaiting_changes",
   1: "confirmed",
   2: "submission_date",
   3: "date_chair_sent",
   4: "date_chair_appr",
   5: "date_coordinator_sent",
   6: "date_coordinator_appr",
   13: "date_section_head_sent",
   14: "date_section_head_appr",
   15: "date_division_manager_sent",
   16: "date_division_manager_appr",
   7: "date_director_sent",
   8: "date_director_appr",
   9: "date_doc_submitted",
   10: "date_proof_author_sent",
   11: "date_proof_author_approved",
   12: "actual_posting_date",
   17: "updated_posting_date",
   99: "withdrawn",
}

documents['status_display'] = documents['status'].replace(status)

document_type_id = {
    2: 'Science Advisory Report',
    3: 'Research Document',
    4: 'Proceedings',
    5: 'Science Response',
    6: 'Working Paper',
    8: "Document for translation only (e.g., meeting minutes, terms of reference)",
}

documents['document_type_display'] = documents['document_type_id'].replace(document_type_id)

documents[~documents.status.isin([12, 17, 99]) & documents.pub_number][['id', 'pub_number', 'status', 'status_display', 'document_type_id', 'document_type_display']]

Unnamed: 0,id,pub_number,status,status_display,document_type_id,document_type_display
51,62,2024/056,16,date_division_manager_appr,3,Research Document
124,140,2024/072,16,date_division_manager_appr,3,Research Document
134,150,2024/064,9,date_doc_submitted,2,Science Advisory Report
144,160,2024/071,6,date_coordinator_appr,3,Research Document
208,226,2024/075,1,confirmed,3,Research Document
...,...,...,...,...,...,...
1163,1300,2024/063,6,date_coordinator_appr,3,Research Document
1183,1320,2024/069,15,date_division_manager_sent,3,Research Document
1196,1334,2024/060,11,date_proof_author_approved,2,Science Advisory Report
1197,1335,2024/058,15,date_division_manager_sent,2,Science Advisory Report


# what about regions?

In [14]:
df = unpub_2020.copy()

regions = {
    1: 'Gulf',
    2: 'Maritimes',
    3: 'Quebec',
    6: 'Newfoundland & Labrador',
    4: 'National',
    7: 'Ontario and Prairie',
    5: 'Pacific',
    8: 'Arctic',
}
df['region'] = df['region'].replace(regions)
df['region'] = pd.Categorical(df['region'], categories=list(regions.values()))

pd.DataFrame(df['region'].value_counts().reindex([
    'Arctic',
    'Pacific',
    'Ontario and Prairie',
    'National',
    'Quebec',
    'Gulf',
    'Maritimes',
    'Newfoundland & Labrador',
]))

Unnamed: 0_level_0,count
region,Unnamed: 1_level_1
Arctic,0
Pacific,4
Ontario and Prairie,29
National,38
Quebec,7
Gulf,1
Maritimes,0
Newfoundland & Labrador,8


In [15]:
# zero Arctic documents? what about after 2020?
documents[documents.lead_office_id == 8]

# none. only fake documents I made to test things...

Unnamed: 0,id,created_at,updated_at,title_en,title_fr,title_iku,pub_number,pages_en,status,old_id,created_by_id,process_id,updated_by_id,url_en,url_fr,dev_link_en,dev_link_fr,ekme_gcdocs_en,ekme_gcdocs_fr,cat_number_en,cat_number_fr,document_type_id,translation_status,pub_number_request_date,due_date,is_confirmed,lead_office_id,pages_fr,pdf_size_kb_en,pdf_size_kb_fr,cat_number_iku,library_link_en,library_link_fr,library_link_iku,pdf_size_kb_iku,isbn_en,isbn_fr,isbn_iku,pages_iku,ekme_gcdocs_iku,pub_number_assigned_date,posting_notification_sent_date,has_data_links,has_third_language,third_language,urgency_notes,urgent,media_attention,sharepoint_archive_en,sharepoint_archive_fr,status_display,document_type_display
1210,1350,2024-11-06 18:42:08.159352,2024-12-11 18:11:17.259737,asdff,fdasss,,,,0,,2874,804,2874.0,,,,,,,,,3,0,NaT,NaT,0,8.0,,,,,,,,,,,,,,NaT,NaT,0.0,0.0,Inuktitut,,0.0,1.0,,,awaiting_changes,Research Document
1211,1351,2024-11-06 18:59:11.453923,2024-11-06 18:59:41.462142,asdff,fdasss,,,,12,,2874,805,,,,,,,,,,4,0,NaT,NaT,1,8.0,,,,,,,,,,,,,,NaT,NaT,1.0,0.0,Inuktitut,asdf,1.0,1.0,,,actual_posting_date,Proceedings
