In [1]:
import pandas as pd

from helpers import sql

# pandas display options: cap the number of rendered rows, allow wide frames, and
# show long text cells without truncating them early.
# Full option keys are used: abbreviated keys such as 'max_colwidth' only work while
# they match exactly one option, so they can break when pandas adds new options.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)

# import tables: every row and column of four csas2 tables, read from dmapps_BACKUP
# (the generator is consumed in order, so the tables are queried in the order listed)
documents, documenttype, tracking, meetings = (
    sql(f"SELECT * FROM {table_name}", database='dmapps_BACKUP')
    for table_name in (
        'csas2_document',
        'csas2_documenttype',
        'csas2_documenttracking',
        'csas2_meeting',
    )
)


In [2]:
# unpublished docs with meetings <= Dec 2020

# Rows whose doc_type equals this label are dropped after the query runs.
TRANSLATION_ONLY_DOC_TYPE = "Document for translation only (e.g., meeting minutes, terms of reference)"

# Documents whose status is not 12, 17 or 99 and whose process has a meeting that
# started before 2021-01-01.
# - The joins are LEFT JOINs, but the WHERE clause on csas2_meeting.start_date removes
#   every row without a matching meeting, so documents lacking a process or a pre-2021
#   meeting never appear.
# - A document is repeated once per qualifying meeting of its process; de-duplicate on
#   document_id before counting documents.
UNPUBLISHED_PRE_2021_QUERY = """
        SELECT
            csas2_document.id AS document_id,
            csas2_document.status,
            csas2_document.lead_office_id AS region,
            csas2_document.created_at AS document_created,
            csas2_meeting.id AS meeting_id,
            csas2_process.id AS process_id,
            csas2_meeting.start_date AS meeting_date,
            csas2_documenttype.name AS doc_type
        FROM csas2_document
            LEFT JOIN csas2_documenttype ON csas2_document.document_type_id = csas2_documenttype.id
            LEFT JOIN csas2_process ON csas2_document.process_id = csas2_process.id
            LEFT JOIN csas2_meeting ON csas2_process.id = csas2_meeting.process_id
        WHERE DATE(csas2_meeting.start_date) < '2021-01-01'
            AND csas2_document.status NOT IN (12, 17, 99);
    """


def load_unpublished_pre_2021(database):
    """Run UNPUBLISHED_PRE_2021_QUERY against one database snapshot.

    Using a single query and a single filter for every snapshot keeps the
    snapshots directly comparable.

    Parameters
    ----------
    database : str
        Name of the database to query, passed through to ``sql``.

    Returns
    -------
    The rows returned by ``sql`` minus those whose ``doc_type`` equals
    TRANSLATION_ONLY_DOC_TYPE. Rows with a missing ``doc_type`` are kept, and
    the original row index is not reset.
    """
    rows = sql(query=UNPUBLISHED_PRE_2021_QUERY, database=database)
    return rows[rows['doc_type'] != TRANSLATION_ONLY_DOC_TYPE]


# Jul #s (approx, backup of db from aug 7)
unpub_2020_aug8 = load_unpublished_pre_2021('dmapps_240807')

# most recent db #s
unpub_2020 = load_unpublished_pre_2021('dmapps_BACKUP')


In [3]:
col_order = ['Proceedings', 'Research Document', 'Science Advisory Report']


def count_docs_by_type(frame):
    """Count unique documents per doc_type, reported in ``col_order`` order.

    ``frame`` needs ``document_id`` and ``doc_type`` columns. Repeated
    ``document_id`` rows are collapsed to their first occurrence so each
    document is counted once. Types outside ``col_order`` are not reported.
    """
    unique_docs = frame.drop_duplicates(subset='document_id', keep='first')
    # reindex instead of [...] label selection: a type with no documents left
    # is reported as 0 rather than raising KeyError.
    return unique_docs['doc_type'].value_counts().reindex(col_order, fill_value=0)


def compare_snapshots(aug_frame, current_frame):
    """Return a two-column table of per-type counts for the two snapshots."""
    return pd.concat(
        [
            count_docs_by_type(aug_frame).rename('Aug 2024'),
            count_docs_by_type(current_frame).rename('Dec 2024'),
        ],
        axis=1,
    )


print("Outstanding Publication by Type (Meetings December 2020 and Before)")

df = compare_snapshots(unpub_2020_aug8, unpub_2020)
display(df)

print("excluding status = 0")

df = compare_snapshots(
    unpub_2020_aug8[unpub_2020_aug8.status != 0],
    unpub_2020[unpub_2020.status != 0],
)
display(df)


Outstanding Publication by Type (Meetings December 2020 and Before)


Unnamed: 0_level_0,Aug 2024,Dec 2024
doc_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Proceedings,28,13
Research Document,63,53
Science Advisory Report,17,15


excluding status = 0


Unnamed: 0_level_0,Aug 2024,Dec 2024
doc_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Proceedings,28,13
Research Document,63,53
Science Advisory Report,17,15


# What changed between the August snapshot and the present?

In [21]:
# document ids listed in the August snapshot but absent from the current one
different = set(unpub_2020_aug8.document_id) - set(unpub_2020.document_id)

# current status of those documents (a bare `different` line used to sit here; it
# displayed nothing because only a cell's last expression is rendered)
documents.loc[documents['id'].isin(different), 'status'].value_counts()

# all look good, either 99 or 12

status
12    19
99     8
Name: count, dtype: int64

# How many documents are still incomplete but already have a pub_number?

In [12]:
# Display label for each csas2_document.status code.
status = {
   0: "awaiting_changes",
   1: "confirmed",
   2: "submission_date",
   3: "date_chair_sent",
   4: "date_chair_appr",
   5: "date_coordinator_sent",
   6: "date_coordinator_appr",
   13: "date_section_head_sent",
   14: "date_section_head_appr",
   15: "date_division_manager_sent",
   16: "date_division_manager_appr",
   7: "date_director_sent",
   8: "date_director_appr",
   9: "date_doc_submitted",
   10: "date_proof_author_sent",
   11: "date_proof_author_approved",
   12: "actual_posting_date",
   17: "updated_posting_date",
   99: "withdrawn",
}

# replace() leaves any code missing from the mapping untouched, so unknown codes stay visible
documents['status_display'] = documents['status'].replace(status)

# Display label for each csas2_document.document_type_id.
document_type_id = {
    2: 'Science Advisory Report',
    3: 'Research Document',
    4: 'Proceedings',
    5: 'Science Response',
    6: 'Working Paper',
    8: "Document for translation only (e.g., meeting minutes, terms of reference)",
}

documents['document_type_display'] = documents['document_type_id'].replace(document_type_id)

# Documents whose status is none of 12, 17 or 99 (labelled above as the two posting
# statuses and "withdrawn").
not_finalized = ~documents['status'].isin([12, 17, 99])

# Explicit "has a pub_number" test. Combining a boolean mask directly with the text
# column (`mask & documents.pub_number`) depends on pandas silently coercing the
# strings to booleans; spelling it out keeps the same meaning (neither missing nor
# empty) without relying on that coercion.
has_pub_number = documents['pub_number'].notna() & documents['pub_number'].ne('')

documents.loc[
    not_finalized & has_pub_number,
    ['id', 'pub_number', 'status', 'status_display', 'document_type_id', 'document_type_display'],
]

Unnamed: 0,id,pub_number,status,status_display,document_type_id,document_type_display
50,62,2024/056,9,date_doc_submitted,3,Research Document
123,140,2024/072,9,date_doc_submitted,3,Research Document
143,160,2024/071,6,date_coordinator_appr,3,Research Document
204,223,2025/001,9,date_doc_submitted,3,Research Document
565,668,2023/084,11,date_proof_author_approved,3,Research Document
811,924,2022/006,11,date_proof_author_approved,4,Proceedings
813,926,2023/090,9,date_doc_submitted,3,Research Document
919,1040,2024/028,10,date_proof_author_sent,3,Research Document
928,1050,2024/066,5,date_coordinator_sent,3,Research Document
1005,1131,2025/002,9,date_doc_submitted,3,Research Document


# How are the outstanding documents distributed across regions?

In [13]:
df = unpub_2020.copy()

# Region name for each lead_office_id (the query exposes lead_office_id as `region`).
regions = {
    1: 'Gulf',
    2: 'Maritimes',
    3: 'Quebec',
    6: 'Newfoundland & Labrador',
    4: 'National',
    7: 'Ontario and Prairie',
    5: 'Pacific',
    8: 'Arctic',
}
df['region'] = df['region'].replace(regions)
# Categorical with every region listed, so regions with no documents still show up as 0.
df['region'] = pd.Categorical(df['region'], categories=list(regions.values()))

# The query behind unpub_2020 joins documents to meetings, so a document appears once
# per qualifying meeting. Count each document once (as the by-type table does);
# otherwise documents with several meetings inflate their region's total.
region_counts = (
    df.drop_duplicates(subset='document_id', keep='first')['region']
    .value_counts()
    .reindex([
        'Arctic',
        'Pacific',
        'Ontario and Prairie',
        'National',
        'Quebec',
        'Gulf',
        'Maritimes',
        'Newfoundland & Labrador',
    ])
)

pd.DataFrame(region_counts)

Unnamed: 0_level_0,count
region,Unnamed: 1_level_1
Arctic,0
Pacific,4
Ontario and Prairie,27
National,38
Quebec,7
Gulf,1
Maritimes,0
Newfoundland & Labrador,5


In [14]:
# zero Arctic documents? what about after 2020?
# lead_office_id 8 is the Arctic office; this looks at the whole documents table,
# with no meeting-date or status filter applied.
documents.loc[documents['lead_office_id'].eq(8)]

# none.

Unnamed: 0,id,created_at,updated_at,title_en,title_fr,title_iku,pub_number,pages_en,status,old_id,created_by_id,process_id,updated_by_id,url_en,url_fr,dev_link_en,dev_link_fr,ekme_gcdocs_en,ekme_gcdocs_fr,cat_number_en,cat_number_fr,document_type_id,translation_status,pub_number_request_date,due_date,is_confirmed,lead_office_id,pages_fr,pdf_size_kb_en,pdf_size_kb_fr,cat_number_iku,library_link_en,library_link_fr,library_link_iku,pdf_size_kb_iku,isbn_en,isbn_fr,isbn_iku,pages_iku,ekme_gcdocs_iku,pub_number_assigned_date,posting_notification_sent_date,has_data_links,has_third_language,third_language,urgency_notes,urgent,media_attention,sharepoint_archive_en,sharepoint_archive_fr,status_display,document_type_display
