In [59]:
import pandas as pd

from helpers import sql

# pandas formatting
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', 200)

# import tables
documents = sql("SELECT * FROM csas2_document")
documenttype = sql("SELECT * FROM csas2_documenttype")
tracking = sql("SELECT * FROM csas2_documenttracking")
meetings = sql("SELECT * FROM csas2_meeting")


In [2]:
# unpublished docs with meetings <= Dec 2020

# Jul #s (approx, backup of db from aug 7)
unpub_2020_aug8 = sql(
    query="""
        SELECT
            csas2_document.id AS document_id,
            csas2_document.status AS status,
            csas2_document.created_at AS document_created,
            csas2_meeting.id AS meeting_id,
            csas2_process.id AS process_id,
            csas2_meeting.start_date AS meeting_date,
            csas2_documenttype.name AS doc_type
        FROM csas2_document
            LEFT JOIN csas2_documenttype ON csas2_document.document_type_id = csas2_documenttype.id
            LEFT JOIN csas2_process ON csas2_document.process_id = csas2_process.id
            LEFT JOIN csas2_meeting ON csas2_process.id = csas2_meeting.process_id
        WHERE DATE(csas2_meeting.start_date) < '2021-01-01'
            AND csas2_document.pub_number IS NULL
            AND csas2_document.status NOT IN (12, 17, 99);
    """, 
    database='dmapps_240807'
)  
unpub_2020_aug8 = unpub_2020_aug8[unpub_2020_aug8['doc_type'] != "Document for translation only (e.g., meeting minutes, terms of reference)"]

# most recent db #s
unpub_2020 = sql("""
    SELECT
        csas2_document.id AS document_id,
        csas2_document.status AS status,
        csas2_document.created_at AS document_created,
        csas2_meeting.id AS meeting_id,
        csas2_process.id AS process_id,
        csas2_meeting.start_date AS meeting_date,
        csas2_documenttype.name AS doc_type
    FROM csas2_document
        LEFT JOIN csas2_documenttype ON csas2_document.document_type_id = csas2_documenttype.id
        LEFT JOIN csas2_process ON csas2_document.process_id = csas2_process.id
        LEFT JOIN csas2_meeting ON csas2_process.id = csas2_meeting.process_id
    WHERE DATE(csas2_meeting.start_date) < '2021-01-01'
        AND csas2_document.pub_number IS NULL
        AND csas2_document.status NOT IN (12, 17, 99);
""")  
unpub_2020 = unpub_2020[unpub_2020['doc_type'] != "Document for translation only (e.g., meeting minutes, terms of reference)"]


In [3]:
# added to dmapps after jul 2024

unpub_2020[unpub_2020.document_created > '2024-07-01'].shape[0], unpub_2020[unpub_2020.document_created <= '2024-07-01'].shape[0]


(0, 82)

In [4]:
col_order = ['Proceedings', 'Research Document', 'Science Advisory Report']

print("Outstanding Publication by Type (Meetings December 2020 and Before)")

df = pd.concat(
    [
        pd.DataFrame(unpub_2020_aug8.drop_duplicates(subset='document_id', keep='first')['doc_type'].value_counts()[col_order]),
        pd.DataFrame(unpub_2020.drop_duplicates(subset='document_id', keep='first')['doc_type'].value_counts()[col_order])
    ],
    axis=1
)
df.columns = ['Aug 2024', 'Dec 2024']
display(df)

print("excluding status = 0")

df = pd.concat(
    [
        pd.DataFrame(unpub_2020_aug8[unpub_2020_aug8.status != 0].drop_duplicates(subset='document_id', keep='first')['doc_type'].value_counts()[col_order]),
        pd.DataFrame(unpub_2020[unpub_2020.status != 0].drop_duplicates(subset='document_id', keep='first')['doc_type'].value_counts()[col_order])
    ],
    axis=1
)
df.columns = ['Aug 2024', 'Dec 2024']
display(df)


Outstanding Publication by Type (Meetings December 2020 and Before)


Unnamed: 0_level_0,Aug 2024,Dec 2024
doc_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Proceedings,23,15
Research Document,56,51
Science Advisory Report,16,15


excluding status = 0


Unnamed: 0_level_0,Aug 2024,Dec 2024
doc_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Proceedings,23,15
Research Document,56,51
Science Advisory Report,16,15


In [5]:
different = set(unpub_2020_aug8.document_id) - set(unpub_2020.document_id)
different
documents[documents.id.isin(different)]

# all look good, either 99 or 12 (or in Prod 12)

Unnamed: 0,id,created_at,updated_at,title_en,title_fr,title_iku,pub_number,pages_en,status,old_id,created_by_id,process_id,updated_by_id,url_en,url_fr,dev_link_en,dev_link_fr,ekme_gcdocs_en,ekme_gcdocs_fr,cat_number_en,cat_number_fr,document_type_id,translation_status,pub_number_request_date,due_date,is_confirmed,lead_office_id,pages_fr,pdf_size_kb_en,pdf_size_kb_fr,cat_number_iku,library_link_en,library_link_fr,library_link_iku,pdf_size_kb_iku,isbn_en,isbn_fr,isbn_iku,pages_iku,ekme_gcdocs_iku,pub_number_assigned_date,posting_notification_sent_date,has_data_links,has_third_language,third_language,urgency_notes,urgent,media_attention,sharepoint_archive_en,sharepoint_archive_fr
276,305,2023-05-08 12:46:07.166167,2024-10-25 21:35:50.866006,Assessment of Northern Shrimp (Pandalus borealis) and Striped Shrimp (Pandalus montagui) in the NSRF-DFO Assessment Zone (SFA 2 and 3),,,,,99,6111.0,1142,459,,,,,,,,,,3,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
388,446,2023-05-08 12:48:23.029711,2024-10-25 21:38:23.644785,"Evaluation of Capture Efficiency and Mesh-Sized Gillnet Selectivity for Important Fishes in Great Slave Lake, Northwest Territories, Canada",,,,,99,8114.0,1142,481,,,,,,,,,,3,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
474,553,2023-05-08 12:50:09.363999,2024-10-25 21:40:10.741503,"Biological characteristics of the up-(1992) and down (1993) –stream run of anadromous Arctic charr at the Kuujjua River, Northwest Territories, Canada",,,,,99,9000.0,1142,503,,,,,,,,,,3,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
522,618,2023-05-08 12:51:06.234012,2024-10-25 21:43:20.302499,"Proceedings of the regional peer review of stock status and sustainable harvest levels for Arctic Char in Naulinniarvik Lake Arctic Char fishery, Nunavut",,,,,99,9248.0,1142,512,,,,,,,,,,4,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
525,621,2023-05-08 12:51:09.345843,2024-10-25 21:41:58.994432,"Proceedings of the regional peer review of stock status and sustainable harvest levels for Arctic Char in Ijaruvung Lake, Iqalujjuaq Fiord and Irvine Inlet, Cumberland Sound, Nunavut",,,,,99,9264.0,1142,513,,,,,,,,,,4,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
569,671,2023-05-08 12:51:57.624443,2024-10-09 16:31:55.905176,"Proceedings of the Regional Peer Review of the Guidelines on Priorities, Monitoring, and Provision of Science Advice for Small-Scale Fisheries in the Maritimes Region; August 21-22, 2018 and Janua...","Compte rendu de l’examen par les pairs régional sur les lignes directrices relatives aux priorités, aux activités de surveillance et à la prestation d’avis scientifiques en lien avec les pêches pr...",,2024/039,37.0,12,9639.0,1142,529,2359.0,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_039-eng.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_039-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/Pro-Cr/2024/2024_039-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/Pro-Cr/2024/2024_039-fra.html,,,Fs70-4/2024-039E-PDF,Fs70-4/2024-039F-PDF,4,2,NaT,NaT,1,2.0,40.0,777.0,775.0,,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41260417.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41260430.pdf,,,978-0-660-73346-3,978-0-660-73347-0,,,,2024-08-26 03:00:00.000000,2024-09-18 23:34:19.169830,0.0,0.0,Inuktitut,,0.0,1.0,,
580,682,2023-05-08 12:52:07.894562,2024-10-25 21:45:28.340617,"Information to support for the assessment of Northern Shrimp, Pandalus borealis, and Striped Shrimp, Pandalus montagui, in the Eastern and Western Assessment Zones, February 2019",,,,,99,9713.0,1142,533,,,,,,,,,,3,99,NaT,NaT,1,7.0,,,,,,,,,,,,,,NaT,NaT,,,Inuktitut,,,,,
592,695,2023-05-08 12:52:21.010864,2024-10-07 15:28:24.030836,CONSIDERATIONS FOR THE AUTHORIZATION OF BOTTOM-CONTACTING SCIENTIFIC SURVEYS WITHIN PROTECTED AREAS IN THE NEWFOUNDLAND AND LABRADOR REGION,,,2024/073,,9,10243.0,1142,539,,,,,,,,,,3,0,NaT,NaT,1,6.0,,,,,,,,,,,,,,2024-10-07 15:28:24.030704,NaT,,,Inuktitut,,,,,
691,795,2023-05-08 12:53:49.566729,2024-10-09 16:29:01.815877,"Proceedings of the Regional Peer Review of a Framework for the Assessment of Snow Crab (Chionoecetes opilio) in Maritimes Region (NAFO DIV 4VWX); February 25-26, 2020",Compte rendu de l’examen régional par les pairs d’un cadre d’évaluation du crabe des neiges (Chionoecetes opilio) dans la région des Maritimes (division 4VWX de l’OPANO); du 25 au 26 février 2020,,2024/038,13.0,12,10128.0,1142,564,2359.0,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_038-eng.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_038-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/Pro-Cr/2024/2024_038-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/Pro-Cr/2024/2024_038-fra.html,,,Fs70-4/2024-038E-PDF,Fs70-4/2024-038F-PDF,4,2,NaT,NaT,1,2.0,14.0,351.0,362.0,,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126034x.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41260351.pdf,,,978-0-660-73344-9,978-0-660-73345-6,,,,2024-08-26 03:00:00.000000,2024-09-16 22:17:39.908168,0.0,0.0,Inuktitut,,0.0,1.0,,
692,796,2023-05-08 12:53:50.890545,2024-10-22 15:18:34.104062,"Proceedings of the Regional Peer Review of the Southwest Nova Scotia/Bay of Fundy Herring Framework: Part 2 – Management Strategy Evaluation Conditioning Operating Model Review; January 20-21, 2020",Compte rendu de l’examen régional par les pairs du cadre de travail sur le hareng du sud-ouest de la Nouvelle-Écosse et de la baie de Fundy : Partie 2 – Examen du modèle opérationnel de conditionn...,,2024/040,23.0,12,10129.0,1142,565,2359.0,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-eng.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-fra.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-eng.html,https://wwwdev.ncr.dfo-mpo.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-fra.html,,,Fs70-4/2024-040E-PDF,Fs70-4/2024-040F-PDF,4,3,NaT,NaT,1,2.0,23.0,431.0,392.0,,,,,,978-0-660-73656-3,978-0-660-73657-0,,,,2024-08-26 03:00:00.000000,2024-10-22 15:18:34.093877,0.0,0.0,Inuktitut,,0.0,1.0,,


In [6]:
# why doesn't this match?

# TODO: 
#  check the report, compare - DOES NOT MATCH EITHER
#  get a new database and check again - CHECKED REPORT FROM PROD, DOES NOT MATCH ANY OF THE ABOVE THINGS
#  confirm 12, 17, 99 for unwanted status (what about 0? unconfirmed) - DOES NOT MATCH WITH OR WITHOUT 0

# literally nothing matches anything else.


In [34]:
# these should all be the same

id_report_local = [
    469, 395, 374, 478, 473, 498, 286, 287, 288, 417, 420, 418, 419, 416, 536, 549, 575, 576, 574, 291, 290, 610, 614, 609, 626, 616, 
    625, 624, 623, 619, 620, 663, 636, 635, 669, 667, 665, 668, 666, 664, 655, 643, 422, 423, 697, 698, 981, 926, 924, 925, 709, 710, 
    715, 716, 708, 712, 267, 741, 685, 686, 841, 696, 744, 263, 764, 782, 781, 747, 791, 790, 683, 726, 3, 829, 980, 978, 979, 821, 
    754, 753, 836, 695, 833, 1058, 1059, 691, 4, 1308
]
id_report_prod = [
    469, 395, 374, 478, 473, 498, 286, 287, 288, 417, 420, 418, 419, 416, 536, 549, 575, 576, 574, 291, 290, 610, 614, 609, 626, 
    616, 625, 624, 623, 619, 620, 663, 636, 635, 669, 667, 665, 668, 666, 664, 655, 643, 422, 423, 697, 698, 981, 926, 924, 925, 
    709, 710, 715, 716, 708, 712, 267, 685, 686, 841, 696, 744, 764, 782, 781, 747, 790, 683, 726, 3, 829, 980, 978, 979, 821, 
    754, 753, 836, 833, 1058, 1059, 1308
]
id_sql_local = list(set(unpub_2020.document_id))

In [35]:
[x for x in id_report_prod if x not in id_sql_local]

[668, 926, 924, 3]

In [36]:
# THESE SHOULD BE THE SAME!!!

missing_from_sql = [x for x in id_report_local if x not in id_sql_local]
missing_from_sql

[668, 926, 924, 741, 3, 695, 4]

In [37]:
# SQL is missing, but the report is not missing
[x for x in id_sql_local if x not in id_report_local]

[]

In [38]:
documents[documents.id.isin(missing_from_sql)][['id', 'document_type_id']]

Unnamed: 0,id,document_type_id
2,3,8
3,4,8
566,668,3
592,695,3
637,741,4
812,924,4
814,926,3


In [25]:
documenttype[documenttype.document_id.is_in(missing_from_sql)]

AttributeError: 'DataFrame' object has no attribute 'document_id'

In [26]:
documenttype

Unnamed: 0,id,name,nom,hide_from_list,days_due,acronym_en,acronym_fr
0,2,Science Advisory Report,Avis scientifique,0,56.0,,
1,3,Research Document,Document de recherche,0,122.0,,
2,4,Proceedings,Compte rendu,0,122.0,,
3,5,Science Response,Réponse des Sciences,0,56.0,,
4,6,Working Paper,Document de travail,0,,,
5,8,"Document for translation only (e.g., meeting minutes, terms of reference)",,0,,,


In [40]:
# 668 - Research Doc Missing from SQL query???

unpub_2020[unpub_2020.document_id == 668]


Unnamed: 0,document_id,status,document_created,meeting_id,process_id,meeting_date,doc_type


In [43]:
# original query
"""
    SELECT
        csas2_document.id AS document_id,
        csas2_document.status AS status,
        csas2_document.created_at AS document_created,
        csas2_meeting.id AS meeting_id,
        csas2_process.id AS process_id,
        csas2_meeting.start_date AS meeting_date,
        csas2_documenttype.name AS doc_type
    FROM csas2_document
        LEFT JOIN csas2_documenttype ON csas2_document.document_type_id = csas2_documenttype.id
        LEFT JOIN csas2_process ON csas2_document.process_id = csas2_process.id
        LEFT JOIN csas2_meeting ON csas2_process.id = csas2_meeting.process_id
    WHERE DATE(csas2_meeting.start_date) < '2021-01-01'
        AND csas2_document.pub_number IS NULL
        AND csas2_document.status NOT IN (12, 17, 99);
"""


df_668 = sql("""
    SELECT
        csas2_document.id AS document_id,
        pub_number,
        csas2_document.status AS status,
        csas2_document.created_at AS document_created,
        csas2_meeting.id AS meeting_id,
        csas2_process.id AS process_id,
        csas2_meeting.start_date AS meeting_date,
        csas2_documenttype.name AS doc_type
    FROM csas2_document
        LEFT JOIN csas2_documenttype ON csas2_document.document_type_id = csas2_documenttype.id
        LEFT JOIN csas2_process ON csas2_document.process_id = csas2_process.id
        LEFT JOIN csas2_meeting ON csas2_process.id = csas2_meeting.process_id
    WHERE csas2_document.id = 668;
""") 

df_668

Unnamed: 0,document_id,pub_number,status,document_created,meeting_id,process_id,meeting_date,doc_type
0,668,2023/084,11,2023-05-08 12:51:53.922718,526,526,2018-02-26 16:00:00,Research Document


In [45]:
# not in SQL because it has a pub_number... why not published?

documents[documents.id == 668].T

# ANSWER: it does not have a status of posted, even though it has a pub_number. maybe don't sql filter by has pub_number...
# ALSO, this seems like an error, it probably is posted...

Unnamed: 0,566
id,668
created_at,2023-05-08 12:51:53.922718
updated_at,2023-11-14 22:11:31.049670
title_en,Review of international protocols and recommended mitigation for the use of autonomous unmanned vehicles (AUVs) in the study of marine mammals
title_fr,
...,...
urgency_notes,
urgent,
media_attention,
sharepoint_archive_en,


In [60]:
# how many incomplete but with a pub_number?

status = {
   0: "awaiting_changes",
   1: "confirmed",
   2: "submission_date",
   3: "date_chair_sent",
   4: "date_chair_appr",
   5: "date_coordinator_sent",
   6: "date_coordinator_appr",
   13: "date_section_head_sent",
   14: "date_section_head_appr",
   15: "date_division_manager_sent",
   16: "date_division_manager_appr",
   7: "date_director_sent",
   8: "date_director_appr",
   9: "date_doc_submitted",
   10: "date_proof_author_sent",
   11: "date_proof_author_approved",
   12: "actual_posting_date",
   17: "updated_posting_date",
   99: "withdrawn",
}

documents['status_display'] = documents['status'].replace(status)

document_type_id = {
    2: 'Science Advisory Report',
    3: 'Research Document',
    4: 'Proceedings',
    5: 'Science Response',
    6: 'Working Paper',
    8: "Document for translation only (e.g., meeting minutes, terms of reference)",
}

documents['document_type_display'] = documents['document_type_id'].replace(document_type_id)

documents[~documents.status.isin([12, 17, 99]) & documents.pub_number][['id', 'pub_number', 'status', 'status_display', 'document_type_id', 'document_type_display']]

Unnamed: 0,id,pub_number,status,status_display,document_type_id,document_type_display
51,62,2024/056,16,date_division_manager_appr,3,Research Document
124,140,2024/072,16,date_division_manager_appr,3,Research Document
134,150,2024/064,9,date_doc_submitted,2,Science Advisory Report
144,160,2024/071,6,date_coordinator_appr,3,Research Document
208,226,2024/075,1,confirmed,3,Research Document
...,...,...,...,...,...,...
1163,1300,2024/063,6,date_coordinator_appr,3,Research Document
1183,1320,2024/069,15,date_division_manager_sent,3,Research Document
1196,1334,2024/060,11,date_proof_author_approved,2,Science Advisory Report
1197,1335,2024/058,15,date_division_manager_sent,2,Science Advisory Report
