In [1]:
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine, text

# Notebook-wide pandas display settings: show up to 100 rows and up to
# 200 characters per cell before truncating.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

In [2]:
def sql(query):
    """Run `query` against the MySQL database and return the result as a DataFrame.

    Parameters
    ----------
    query : str or SQLAlchemy selectable
        Passed unchanged to `pd.read_sql`.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.

    Notes
    -----
    The connection URL defaults to the local `dmapps_test` database. Set the
    `DMAPPS_DB_URL` environment variable to point somewhere else without
    editing credentials into the notebook.
    """
    import os  # local import so this cell does not depend on the import cell

    db_url = os.environ.get(
        'DMAPPS_DB_URL',
        'mysql+mysqlconnector://root:root@localhost:3306/dmapps_test',
    )
    engine = create_engine(db_url)
    try:
        with engine.connect() as conn:
            return pd.read_sql(query, conn)
    finally:
        # A fresh Engine (and connection pool) is created on every call;
        # dispose of it so pooled connections are closed instead of lingering.
        engine.dispose()

# search_strings = ['request', 'terms', 'process', 'meeting', 'region', 'section', 'sector', 'division']
# pattern = '|'.join(search_strings)
# tables[tables['table'].str.contains(pattern)]

# List every table in the database, then keep those whose name contains
# 'shared_models' or 'csas2'.
all_tables = sql('SHOW TABLES').rename(columns={'Tables_in_dmapps_test': 'table'})
tables = all_tables[
    all_tables['table'].str.contains('shared_models')
    | all_tables['table'].str.contains('csas2')
]

In [3]:
# Load the pickled comparison results into `f`.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by a trusted script.
with open("compare_filter_results.pickle", 'rb') as pickle_file:
    f = pickle.load(pickle_file)

In [4]:
# Take the 'n_errors' entry out of the loaded results and report it; `f` no
# longer contains that key afterwards.
# NOTE(review): because pop() removes the key, running this cell a second time
# without reloading `f` will presumably raise KeyError (assuming `f` is a dict).
n_errors = f.pop('n_errors')
print(f'Number of Errors: {n_errors}')

Number of Errors: 41


In [5]:
# Build one row per entry of `f` (transpose so the keys of `f` become the index).
# NOTE(review): `.replace([], np.nan)` passes an empty list of values to
# replace; the output below still shows `[]` cells, so it does not appear to
# turn empty lists into NaN -- confirm whether that was the intent.
df = (
    pd.DataFrame(f)
    .T
    .fillna(np.nan)
    .replace([], np.nan)
)
df.head()

Unnamed: 0,region_filter,sector_filter,everything_matches,new_sector_choices,old_sector_choices,new_section_choices,old_section_choices,sector_new_only,sector_old_only,section_new_only,section_old_only
1,,,True,[],[],[],[],,,,
2,"[2, 4, 7, 6]",,True,"[(13, Aquatic Ecosystems (Maritimes)), (6, Ecosystems & Oceans Science (Maritimes)), (17, Fisheries & Harbour Management (Maritimes)), (21, Aquatic Ecosystems (Quebec)), (7, Ecosystems & Oceans Sc...","[(13, Aquatic Ecosystems (Maritimes)), (6, Ecosystems & Oceans Science (Maritimes)), (17, Fisheries & Harbour Management (Maritimes)), (21, Aquatic Ecosystems (Quebec)), (7, Ecosystems & Oceans Sc...","[(126, Maritimes - Aquatic Ecosystems - Ecosystems Management - Aquatic Invasive Species National Core Program), (124, Maritimes - Aquatic Ecosystems - Ecosystems Management - Fish and Fish Habita...","[(126, Maritimes - Aquatic Ecosystems - Ecosystems Management - Aquatic Invasive Species National Core Program), (124, Maritimes - Aquatic Ecosystems - Ecosystems Management - Fish and Fish Habita...",,,,
3,,,True,[],[],[],[],,,,
4,"[1, 6, 1, 4]","[6, 12, 7, 3]",False,"[(12, Aquatic Ecosystems (Gulf)), (5, Ecosystems & Oceans Science (Gulf)), (16, Fisheries & Harbour Management (Gulf)), (14, Strategic Policy (Gulf)), (1, Aquatic Ecosystems (National)), (4, Ecosy...","[(12, Aquatic Ecosystems (Gulf)), (5, Ecosystems & Oceans Science (Gulf)), (16, Fisheries & Harbour Management (Gulf)), (14, Strategic Policy (Gulf)), (1, Aquatic Ecosystems (National)), (4, Ecosy...","[(202, Arctic - Aquatic Ecosystems - Aquatic Ecosystems - Marine Planning and Conservation), (203, Arctic - Fisheries & Harbour Management - Fisheries & Harbour Management - Fisheries Resource Man...","[(106, Maritimes - BRANCH FOR SORTING (Science - Maritimes) - DIVISION FOR SORTING - TEMPORARY FOR SORTING), (61, Maritimes - BRANCH FOR SORTING (Science - Maritimes) - DIVISION FOR SORTING - TEMP...",[],[],"[(202, Arctic - Aquatic Ecosystems - Aquatic Ecosystems - Marine Planning and Conservation), (203, Arctic - Fisheries & Harbour Management - Fisheries & Harbour Management - Fisheries Resource Man...",[]
5,"[5, 9]",[4],False,"[(22, Aquatic Ecosystems (Newfoundland & Labrador)), (8, Ecosystems & Oceans Science (Newfoundland & Labrador)), (23, Fisheries & Harbour Management (Newfoundland & Labrador)), (24, Aquatic Ecosys...","[(22, Aquatic Ecosystems (Newfoundland & Labrador)), (8, Ecosystems & Oceans Science (Newfoundland & Labrador)), (23, Fisheries & Harbour Management (Newfoundland & Labrador)), (24, Aquatic Ecosys...","[(202, Arctic - Aquatic Ecosystems - Aquatic Ecosystems - Marine Planning and Conservation), (203, Arctic - Fisheries & Harbour Management - Fisheries & Harbour Management - Fisheries Resource Man...","[(107, National - BRANCH FOR SORTING - DIVISION FOR SORTING - TEMPORARY FOR SORTING), (168, National - Canadian Hydrographic Service - Marine Spatial Data Services and Licensing - Geodetic Enginee...",[],[],"[(202, Arctic - Aquatic Ecosystems - Aquatic Ecosystems - Marine Planning and Conservation), (203, Arctic - Fisheries & Harbour Management - Fisheries & Harbour Management - Fisheries Resource Man...",[]


In [6]:
# Null count per column, restricted to rows where everything matches.
df.loc[df['everything_matches']].isna().sum()

region_filter           8
sector_filter          59
everything_matches      0
new_sector_choices      0
old_sector_choices      0
new_section_choices     0
old_section_choices     0
sector_new_only        59
sector_old_only        59
section_new_only       59
section_old_only       59
dtype: int64

In [7]:
# Null count per column, restricted to rows where something does not match.
df.loc[~df['everything_matches']].isna().sum()

region_filter          1
sector_filter          0
everything_matches     0
new_sector_choices     0
old_sector_choices     0
new_section_choices    0
old_section_choices    0
sector_new_only        0
sector_old_only        0
section_new_only       0
section_old_only       0
dtype: int64

In [8]:
# It looks like any time there is a sector filter, the old version doesn't work and they don't match.
# We need to make sure A) that the new method is correct and B) that the new method filters correctly (matches SQL).

# We should also check the stuff that actually is filtered, not just the errors.
# Need to update the filter check script.

# First, let's convert these lists of (id, label) tuples to sorted lists of ids.
def _to_sorted_ids(choices):
    """Return the sorted ids taken from a list of (id, label) tuples.

    Entries that are not tuples/lists are assumed to be ids already and are
    kept as they are, so re-running this cell on converted data is harmless
    (the original `x[0]` would fail on an int).
    """
    return sorted(c[0] if isinstance(c, (tuple, list)) else c for c in choices)


columns = ['new_sector_choices', 'old_sector_choices', 'new_section_choices', 'old_section_choices', 'sector_new_only', 'sector_old_only', 'section_new_only', 'section_old_only']
for column in columns:
    # Only touch cells that hold a list; NaN cells stay NaN.
    has_choices = df[column].notnull()
    df.loc[has_choices, column] = df.loc[has_choices, column].apply(_to_sorted_ids)

In [11]:
# So many repeats: the new section choices list the same ids several times.
# Columns are selected by name rather than by position so the preview does not
# silently change if the column order of `df` ever does.
df.loc[
    ~df.everything_matches,
    ['region_filter', 'sector_filter', 'new_section_choices', 'old_section_choices'],
].head()

Unnamed: 0,region_filter,sector_filter,new_section_choices,old_section_choices
4,"[1, 6, 1, 4]","[6, 12, 7, 3]","[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 17, 17, 17, 17, 18, 18,...","[17, 18, 22, 23, 24, 25, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 46, 47, 48, 49, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 102, 105, 106, 120, 121, 122, 191, 232]"
5,"[5, 9]",[4],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, ...","[107, 154, 155, 156, 157, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 210, 211, 212, 213, 214]"
6,"[9, 5]","[34, 38]","[1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 11, 11, 12, 12, 13, 13, 14, 14, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 28, 28, 29, 29, 30, 30, 31, 3...",[]
7,"[4, 6]","[38, 16, 23, 7]","[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 17, 17, 17, 17, 18, 18,...","[28, 63, 64, 65, 66, 67, 68, 69, 102, 105, 116, 117, 118, 119, 208]"
10,"[8, 2, 1]","[3, 33, 34]","[1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22...",[191]


In [34]:
# NOTE(review): `query` is not defined in any cell above this one; it is only
# assigned in the cells further down. The execution count (34, versus 41/42
# below) shows this cell was run out of order, so a fresh Restart & Run All
# will presumably fail here with NameError.
# Compares the value at row position 3, column position 6 of `df` element-wise
# with the first column of the SQL result.
(df.iloc[3, 6] == sql(query).iloc[:, 0]).all()

True

In [41]:
# Check row 4 (row position 3 in `df`) for inconsistencies vs. a direct SQL query
# that filters on both region and sector.
region_filter, sector_filter = (1, 6, 1, 4), (6, 12, 7, 3)

# Render the id lists explicitly instead of interpolating the tuple repr:
# a one-element tuple formats as "(4,)", which is not valid SQL. int() also
# guarantees that only integers end up in the query text.
region_ids = ', '.join(str(int(i)) for i in region_filter)
sector_ids = ', '.join(str(int(i)) for i in sector_filter)

query=f"""
SELECT shared_models_section.id AS section_id FROM shared_models_section
	JOIN shared_models_division ON shared_models_division.id = shared_models_section.division_id
	JOIN shared_models_branch ON shared_models_branch.id = shared_models_division.branch_id
	JOIN shared_models_sector ON shared_models_sector.id = shared_models_branch.sector_id
	JOIN shared_models_region ON shared_models_region.id = shared_models_sector.region_id
WHERE 
    shared_models_region.id IN ({region_ids}) AND 
    shared_models_sector.id in ({sector_ids})
GROUP BY shared_models_section.id
ORDER BY shared_models_section.id
"""

# mine is wrong, the old one is... uh... a bit also wrong?
# First line printed: section ids from SQL. Second line: the value stored at
# row position 3, column position 6 of `df`.
print(sql(query).iloc[:, 0].tolist())
print(list(df.iloc[3, 6]))

[17, 18, 63, 64, 65, 66, 67, 68, 69, 102, 105, 120, 121, 122, 191]
[17, 18, 22, 23, 24, 25, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 46, 47, 48, 49, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 102, 105, 106, 120, 121, 122, 191, 232]


In [42]:
# Check row 4 (row position 3 in `df`) against a SQL query that filters on
# sector only; the region condition is commented out inside the SQL.
region_filter, sector_filter = (1, 6, 1, 4), (6, 12, 7, 3)

# Render the id lists explicitly instead of interpolating the tuple repr:
# a one-element tuple formats as "(4,)", which is not valid SQL. int() also
# guarantees that only integers end up in the query text.
region_ids = ', '.join(str(int(i)) for i in region_filter)
sector_ids = ', '.join(str(int(i)) for i in sector_filter)

query=f"""
SELECT shared_models_section.id AS section_id FROM shared_models_section
	JOIN shared_models_division ON shared_models_division.id = shared_models_section.division_id
	JOIN shared_models_branch ON shared_models_branch.id = shared_models_division.branch_id
	JOIN shared_models_sector ON shared_models_sector.id = shared_models_branch.sector_id
	JOIN shared_models_region ON shared_models_region.id = shared_models_sector.region_id
WHERE 
    -- shared_models_region.id IN ({region_ids}) AND 
    shared_models_sector.id in ({sector_ids})
GROUP BY shared_models_section.id
ORDER BY shared_models_section.id
"""

# in the old method, if there is a sector filter, it doesn't filter by region at all for some reason
# probably data validated to avoid this conflict
# Compare as plain lists: this yields False when the lengths differ, whereas
# comparing a list with a Series raises on a length mismatch.
df.iloc[3, 6] == sql(query).iloc[:, 0].tolist()

True

In [None]:
# need a new test for only the section of code that is wrong