In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

sns.set_theme()

# jupyter notebook full-width display
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# no text wrapping inside rendered dataframe cells
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>"))

# pandas formatting: 3-decimal floats, every column shown, up to 200 rows,
# and cell text up to 400 characters wide
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 400)

In [2]:
# Load every pickled frame used in this notebook in one pass.
# NOTE: unpickling executes arbitrary code, so these files must come from a
# trusted source.
(
    df_SD,
    df_FD,
    df_LF,
    df_LF_grouped,
    df_Site,
    df_TrapSupervisors,
) = map(
    pd.read_pickle,
    (
        'df_SD.pickle',
        'df_FD.pickle',
        'df_LF.pickle',
        'df_LF_grouped.pickle',
        'df_Site.pickle',
        'df_TrapSupervisors.pickle',
    ),
)

# checking stuff

### why don't these match?
* answer: they do, but there is no matching sample to link them

In [3]:
# Length-frequency rows recorded on 2019-05-28 whose site is exactly "47".
df_LF.loc[
    df_LF["yy"].eq(2019)
    & df_LF["mm"].eq(5)
    & df_LF["dd"].eq(28)
    & df_LF["site"].eq("47")
]

Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE
11195,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,215,1,215,2019-05-28,47,,,2019052847,21.5,,,,True
11196,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,220,6,220,2019-05-28,47,,,2019052847,22.0,,,,True
11197,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,225,16,225,2019-05-28,47,,,2019052847,22.5,,,,True
11198,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,230,26,230,2019-05-28,47,,,2019052847,23.0,,,,True
11199,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,235,39,235,2019-05-28,47,,,2019052847,23.5,,,,True
11200,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,240,51,240,2019-05-28,47,,,2019052847,24.0,,,,True
11201,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,245,35,245,2019-05-28,47,,,2019052847,24.5,,,,True
11202,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,250,27,250,2019-05-28,47,,,2019052847,25.0,,,,True
11203,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,255,17,255,2019-05-28,47,,,2019052847,25.5,,,,True
11204,2019,5,28,,MARGAREE,5,47,UPPER,PM,80.9,36.696,260,5,260,2019-05-28,47,,,2019052847,26.0,,,,True


In [4]:
# Every df_SD row whose DATETIME falls on 2019-05-28 (all sites).
df_SD.loc[
    df_SD["DATETIME"].dt.year.eq(2019)
    & df_SD["DATETIME"].dt.month.eq(5)
    & df_SD["DATETIME"].dt.day.eq(28)
]

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES
14946,2,SWMARG,Martin E Cameron,,81,12,1.0,2019,5,28,5,350.0,158.8,5,lower,Cameron,,,,,2019-05-28,12,,,2019052812,31.0,195.0,AM,71.9,,,,
14947,2,SWMARG,Robert Peters,,81,25,1.0,2019,5,28,5,25.0,11.3,8,lower,Peters,,,,,2019-05-28,25,,,2019052825,,,,,,,,
14948,2,SWMARG,Pierre Chiasson,,81,26,1.0,2019,5,28,5,200.0,90.7,7,lower,Chiasson,,,,,2019-05-28,26,,,2019052826,,,,,,,,
14949,2,SWMARG,Gerard MacFarlane,,81,28,1.0,2019,5,28,5,200.0,90.7,8,upper,MacFarlane,,,,1 perch,2019-05-28,28,,bycatch_other: 1 perch,2019052828,,,,,,,,
14950,2,SWMARG,Daniel Stewart,,81,33,1.0,2019,5,28,5,150.0,68.0,8,upper,Stewart,,,,,2019-05-28,33,,,2019052833,,,,,,,,
14951,2,SWMARG,Stewart Gillis,,81,41,1.0,2019,5,28,5,30.0,13.6,7,upper,Gillis,,,,,2019-05-28,41,,,2019052841,,,,,,,,
14952,2,SWMARG,Finley Stewart,,81,44,,2019,5,28,5,300.0,136.1,14,upper,Stewart,,,,,2019-05-28,44,,,2019052844,,,,,,,,
14953,2,SWMARG,Mary E Gillis,,81,49,1.0,2019,5,28,5,700.0,317.5,14,upper,Gillis,,,,,2019-05-28,49,,,2019052849,,,,,,,,
14954,2,SWMARG,Bruce MacLellan,,81,51,1.0,2019,5,28,5,15.0,6.8,6,upper,MacLellan,,,,,2019-05-28,51,,,2019052851,,,,,,,,
14955,2,SWMARG,Elizabeth MacKinnon,,81,62,1.0,2019,5,28,5,500.0,226.8,10,upper,MacKinnon,,,,,2019-05-28,62,,,2019052862,36.0,203.0,AM,74.1,,,,


### Kevin follow-up question: how many ambiguous matches lack a matching sample because there are multiple candidate samples?

In [5]:
def _base_id(sample_id):
    """Step an id down by 1000000000 until it is no larger than 2024000000."""
    while sample_id > 2024000000:
        sample_id -= 1000000000
    return sample_id


# Distinct base ids behind every df_SD id above the cutoff.
# NOTE(review): an id dated 2024 or later would also exceed 2024000000 and be
# stepped down here - confirm that no such ids are present in df_SD.
ambiguous = list({
    _base_id(raw_id)
    for raw_id in df_SD.loc[df_SD["id"] > 2024000000, "id"]
})

In [6]:
# EXACT MATCHES = NONE
# (conclusion recorded when df_FD.SITE was compared against an integer; re-run
#  to confirm it still holds now that sites are compared as text)

for ambiguous_sample in ambiguous:
    # the id is decoded two digits at a time from the right:
    # site, day, month, and the remaining leading digits as the year
    YEAR = ambiguous_sample // 1000000
    MONTH = ambiguous_sample // 10000 % 100
    DAY = ambiguous_sample // 100 % 100
    SITE = ambiguous_sample % 100
    print()
    print(YEAR, MONTH, DAY, SITE)
    # exact matches: same date and same site in length frequencies / fish details
    display(df_LF[(df_LF.yy==YEAR) & (df_LF.mm==MONTH) & (df_LF.dd==DAY) & (df_LF.site==str(SITE))])
    # df_FD.SITE holds text values (e.g. "47 or 62"), so compare text with text;
    # comparing against the integer SITE cannot match a string entry
    display(df_FD[(df_FD.YEAR==YEAR) & (df_FD.MM==MONTH) & (df_FD.DD==DAY) & (df_FD.SITE.astype(str)==str(SITE))])


2004 6 10 1


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE



1997 6 12 58


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE



1988 5 23 48


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE



1988 4 0 37


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


In [7]:
# POTENTIAL MATCHES WITH MESSY SITE NAMES = NONE

for ambiguous_sample in ambiguous:
    # decode the id two digits at a time from the right
    YEAR = ambiguous_sample // 1000000
    MONTH = ambiguous_sample // 10000 % 100
    DAY = ambiguous_sample // 100 % 100
    SITE = ambiguous_sample % 100
    print()
    print(YEAR, MONTH, DAY, SITE)
    # partial matches: the site digits only need to appear somewhere in the
    # recorded site text; entries that yield no result count as "no match"
    site_text = str(SITE)
    lf_hits = (
        (df_LF.yy == YEAR)
        & (df_LF.mm == MONTH)
        & (df_LF.dd == DAY)
        & df_LF.site.str.contains(site_text, na=False)
    )
    fd_hits = (
        (df_FD.YEAR == YEAR)
        & (df_FD.MM == MONTH)
        & (df_FD.DD == DAY)
        & df_FD.SITE.str.contains(site_text, na=False)
    )
    display(df_LF[lf_hits])
    display(df_FD[fd_hits])


2004 6 10 1


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE



1997 6 12 58


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE



1988 5 23 48


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE



1988 4 0 37


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


In [8]:
# A FEW MATCHES WHEN SITE IS NOT CONSIDERED
# maybe the site number was written down wrong: look for rows on the same
# date regardless of site

for ambiguous_sample in ambiguous:
    # decode the id two digits at a time from the right
    YEAR = ambiguous_sample // 1000000
    MONTH = ambiguous_sample // 10000 % 100
    DAY = ambiguous_sample // 100 % 100
    SITE = ambiguous_sample % 100
    print()
    print(YEAR, MONTH, DAY, SITE)
    # very partial matches: date only, first five rows of each frame
    lf_same_day = (df_LF.yy == YEAR) & (df_LF.mm == MONTH) & (df_LF.dd == DAY)
    fd_same_day = (df_FD.YEAR == YEAR) & (df_FD.MM == MONTH) & (df_FD.DD == DAY)
    display(df_LF[lf_same_day].head())
    display(df_FD[fd_same_day].head())


2004 6 10 1


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE



1997 6 12 58


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE
3024,1997,6,12,,SW MARGAREE,7,26,LOWER,AM,118.0,53.5,220,3,220,1997-06-12,26,,,1997061226,22.0,,,,
3025,1997,6,12,,SW MARGAREE,7,26,LOWER,AM,118.0,53.5,225,4,225,1997-06-12,26,,,1997061226,22.5,,,,
3026,1997,6,12,,SW MARGAREE,7,26,LOWER,AM,118.0,53.5,230,19,230,1997-06-12,26,,,1997061226,23.0,,,,
3027,1997,6,12,,SW MARGAREE,7,26,LOWER,AM,118.0,53.5,235,52,235,1997-06-12,26,,,1997061226,23.5,,,,
3028,1997,6,12,,SW MARGAREE,7,26,LOWER,AM,118.0,53.5,240,70,240,1997-06-12,26,,,1997061226,24.0,,,,


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE
15290,1997,6,12,7,26,AM,Frozen,1,,230,238.0,171.0,A,M,4,,,3,3,,,,,,,,,,,1997-06-12,3,3,,,,,26,26,,,230,SITE_notes: 26; AGE_notes_1: 3; FSP_notes_1: 3,1997061226,,,,,,,,,,,,,,
15291,1997,6,12,7,26,AM,Frozen,2,,224,232.0,166.0,A,M,4,,,3,3,,,,,,,,,,,1997-06-12,3,3,,,,,26,26,,,224,SITE_notes: 26; AGE_notes_1: 3; FSP_notes_1: 3,1997061226,,,,,,,,,,,,,,
15292,1997,6,12,7,26,AM,Frozen,3,,272,280.0,307.0,B,F,4,39.9,,7,3,,,,,,,,,,,1997-06-12,7,3,,,,,26,26,,,272,SITE_notes: 26; AGE_notes_1: 7; FSP_notes_1: 3,1997061226,,,,,,,,,,,,,,
15293,1997,6,12,7,26,AM,Frozen,4,,212,220.0,138.0,A,M,4,,,3,3,,,,,,,,,,,1997-06-12,3,3,,,,,26,26,,,212,SITE_notes: 26; AGE_notes_1: 3; FSP_notes_1: 3,1997061226,,,,,,,,,,,,,,
15294,1997,6,12,7,26,AM,Frozen,5,,220,228.0,143.0,A,M,4,,,3,3,,,,,,,,,,,1997-06-12,3,3,,,,,26,26,,,220,SITE_notes: 26; AGE_notes_1: 3; FSP_notes_1: 3,1997061226,,,,,,,,,,,,,,



1988 5 23 48


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE
4503,1988,5,23,,12,,Fresh,1,253,,253.0,208.0,A,M,4,,,4,4,,,,,,,,,,,1988-05-23,4,4,,,,,12,12,,,253,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 4,1988052312,,,,,,,,,,,,,,
4504,1988,5,23,,12,,Fresh,2,258,,258.0,236.0,A,M,4,,,4,4,,,,,,,,,,,1988-05-23,4,4,,,,,12,12,,,258,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 4,1988052312,,,,,,,,,,,,,,
4505,1988,5,23,,12,,Fresh,3,245,,245.0,,A,M,4,,,4,4,,,,,,,,,,,1988-05-23,4,4,,,,,12,12,,,245,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 4,1988052312,,,,,,,,,,,,,,
4506,1988,5,23,,12,,Fresh,4,250,,250.0,234.0,A,M,4,,,4,4,,,,,,,,,,,1988-05-23,4,4,,,,,12,12,,,250,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 4,1988052312,,,,,,,,,,,,,,
4507,1988,5,23,,12,,Fresh,5,245,,245.0,209.0,A,F,4,29.1,,3,3,,,,,,,,,,,1988-05-23,3,3,,,,,12,12,,,245,SITE_notes: 12; AGE_notes_1: 3; FSP_notes_1: 3,1988052312,,,,,,,,,,,,,,



1988 4 0 37


Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


Unnamed: 0,YEAR,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


### how do we flag and leave the potential to merge ghost/ambiguous samples?

In [9]:
# Example id. NOTE: the name `id` shadows Python's builtin; it is kept
# because later cells read it.
id = 1990061312
# split off the last two digits; what remains is the date part
date, _site_digits = divmod(id, 100)
date

19900613

In [10]:
from datetime import datetime

# Parse the digits of `id` (assigned in the previous cell) that remain after
# dropping its last two digits as a YYYYMMDD calendar date.
date_digits = str(id // 100)
datetime.strptime(date_digits, "%Y%m%d").date()

datetime.date(1990, 6, 13)

In [11]:
# Same digits as 1990061312 but with the leading digit raised from 1 to 4.
id = 4990061312
date, _site_digits = divmod(id, 100)
# step the date part down by 10,000,000 until it is no larger than 20240000
while date > 20_240_000:
    date -= 10_000_000
datetime.strptime(f"{date}", "%Y%m%d").date()

datetime.date(1990, 6, 13)

In [12]:
# Number of rows with a missing id in length frequencies and in fish details.
int(df_LF["id"].isna().sum()), int(df_FD["id"].isna().sum())

(0, 0)

In [13]:
# last two digits of `id`
id % 100

12

# When making ghost samples, will any of our ambiguous ids match between fish details and length frequencies?

In [14]:
# Distinct ids above 2024000000 in fish details, length frequencies and df_SD.
ambiguous_FD, ambiguous_LF, ambiguous_SD = (
    set(frame.loc[frame["id"] > 2024000000, "id"])
    for frame in (df_FD, df_LF, df_SD)
)
# yes, we need to further disambiguate: maybe add 20/40 to the months, since
# such a month number would never occur naturally

In [15]:
# let's make a summary of ambiguous dates

def _ambiguous_ids_by_date(ambiguous_ids, id_col, cutoff=20240000, step=10000000):
    """Lay out ambiguous ids as one row per date.

    Parameters
    ----------
    ambiguous_ids : iterable of int
        Ids to summarise (selected upstream as id > 2024000000).
    id_col : str
        Label given to every id column of the result, e.g. 'id_FD'.
    cutoff : int
        Largest value of ``id // 100`` that is left as-is; larger values are
        stepped down until they no longer exceed it.
    step : int
        Amount removed from ``id // 100`` on each pass.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date', with one ``id_col``-labelled column per id sharing
        a date (labels repeat); dates with fewer ids are padded with NaN.
    """
    frame = pd.DataFrame(list(ambiguous_ids), columns=[id_col])
    frame['date'] = frame[id_col] // 100
    while frame['date'].max() > cutoff:
        frame.loc[frame['date'] > cutoff, 'date'] -= step
    # number the ids within each date so they can be spread across columns
    frame['slot'] = frame.groupby('date').cumcount()
    wide = frame.pivot(index='date', columns='slot', values=id_col)
    wide.columns = [id_col] * wide.shape[1]
    return wide


df_A_FD = _ambiguous_ids_by_date(ambiguous_FD, 'id_FD')
df_A_LF = _ambiguous_ids_by_date(ambiguous_LF, 'id_LF')
df_A_SD = _ambiguous_ids_by_date(ambiguous_SD, 'id_SD')

# create a dataframe summarising ambiguous data (column order: FD, SD, LF)
df_AMBIGUOUS = pd.merge(
    df_A_FD,
    pd.merge(
        df_A_SD,
        df_A_LF,
        on='date',
        how='outer'
    ),
    on='date',
    how='outer'
).sort_index().astype('Int64')

# Flag each date that has at least one id from a given source. Flags are True
# or missing (never False) so that the overlap count below can use notna().
# Selecting with a list always yields a DataFrame, even when a source has
# only one id column, so .any(axis=1) is always valid.
for source in ('FD', 'LF', 'SD'):
    has_ids = df_AMBIGUOUS[['id_' + source]].notna().any(axis=1)
    df_AMBIGUOUS.loc[has_ids, 'FLAG_' + source + '_AMBIGUOUS'] = True

# number of sources (0-3) that have ambiguous ids on each date
df_AMBIGUOUS['FLAG_AMBIGUITY_OVERLAP'] = (
    df_AMBIGUOUS[['FLAG_FD_AMBIGUOUS', 'FLAG_LF_AMBIGUOUS', 'FLAG_SD_AMBIGUOUS']]
    .notna()
    .sum(axis=1)
)

In [16]:
# these are our problems if we are making ghost samples:
# dates where more than one source is flagged, keeping only fully populated
# columns and transposing so that each date becomes a column

overlap_rows = df_AMBIGUOUS["FLAG_AMBIGUITY_OVERLAP"].gt(1)
df_AMBIGUOUS.loc[overlap_rows].dropna(axis="columns").transpose()

date,19980501,20000607,20000609,20010516,20090520,20090522,20100511,20100519,20100527
id_FD,2998050105,3000060726,3000060926,3001051605,3009052026,3009052226,3010051126,3010051926,3010052726
id_LF,2998050105,4000060726,4000060926,3001051605,3009052026,3009052226,3010051126,3010051926,3010052726
id_LF,3998050105,3000060726,3000060926,4001051605,4009052026,4009052226,4010051126,4010051926,4010052726
FLAG_FD_AMBIGUOUS,True,True,True,True,True,True,True,True,True
FLAG_LF_AMBIGUOUS,True,True,True,True,True,True,True,True,True
FLAG_AMBIGUITY_OVERLAP,2,2,2,2,2,2,2,2,2


# Potential Sorting Issues? LF? FD?
* Yes, can't disambiguate without making guesses. Left flagged, and will be rejected on import.

In [17]:
# Show the first fish-details row for every ambiguous id.
preview_cols = ['YEAR', 'MM', 'DD', 'SITE', 'PERIOD', 'FISH_NO', 'id']
for i in ambiguous_FD:
    display(df_FD.loc[df_FD["id"].eq(i), preview_cols].head(1))

Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
5501,1989,5,14,12,AM,1,2989051412


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
5549,1989,5,14,12,PM,1,3989051412


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
11142,1993,5,29,52,AM,1,2993052952


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
18241,2000,6,7,26,AM,1,3000060726


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
18277,2000,6,7,26,PM,1,4000060726


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
24609,2010,5,11,26,AM,1,3010051126


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
15403,1998,5,1,5,PM,1,2998050105


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
24638,2010,5,11,26,PM,1,4010051126


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
29465,2014,5,30,47,AM,1,3014053047


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
29495,2014,5,30,47 or 62,PM,1,4014053047


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
25292,2010,5,29,25,PM,1,3010052925


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
11030,1993,5,27,37,AM,1,2993052737


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
23846,2009,5,22,26,AM,1,3009052226


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
23877,2009,5,22,26,PM,1,4009052226


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
11487,1993,6,9,33,AM,1,2993060933


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
5455,1989,5,13,35,PM,1,3989051335


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
5409,1989,5,13,35,AM,1,2989051335


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
23753,2009,5,20,26,AM,1,3009052026


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
18501,2001,5,16,526,AM,1,3001051605


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
18548,2001,5,16,5,PM,1,4001051605


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
18322,2000,6,9,26,AM,1,3000060926


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
24792,2010,5,19,26,AM,1,3010051926


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
24829,2010,5,19,26,PM,1,4010051926


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
25640,2010,6,11,41,AM,1,3010061141


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
24513,2010,5,5,41,PM,1,3010050541


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
25140,2010,5,27,26,AM,1,3010052726


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
25169,2010,5,27,26,PM,1,4010052726


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
23782,2009,5,20,26,PM,1,4009052026


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO,id
18356,2000,6,9,26,PM,1,4000060926


In [18]:
# CONFIRM
# some have AM / PM that could be used to match with LF manually, if that
# info is there as well???
fish_cols = ['YEAR', 'MM', 'DD', 'SITE', 'PERIOD', 'FISH_NO']
for i in ambiguous_FD:
    print('\n', i)
    display(df_FD.loc[df_FD["id"].eq(i), fish_cols].head())


 2989051412


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
5501,1989,5,14,12,AM,1
5502,1989,5,14,12,AM,2
5503,1989,5,14,12,AM,3
5504,1989,5,14,12,AM,4
5505,1989,5,14,12,AM,5



 3989051412


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
5549,1989,5,14,12,PM,1
5550,1989,5,14,12,PM,2
5551,1989,5,14,12,PM,3
5552,1989,5,14,12,PM,4
5553,1989,5,14,12,PM,5



 2993052952


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
11142,1993,5,29,52,AM,1
11143,1993,5,29,52,AM,2
11144,1993,5,29,52,AM,3
11145,1993,5,29,52,AM,4
11146,1993,5,29,52,AM,5



 3000060726


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
18241,2000,6,7,26,AM,1
18242,2000,6,7,26,AM,2
18243,2000,6,7,26,AM,3
18244,2000,6,7,26,AM,4
18245,2000,6,7,26,AM,5



 4000060726


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
18277,2000,6,7,26,PM,1
18278,2000,6,7,26,PM,2
18279,2000,6,7,26,PM,3
18280,2000,6,7,26,PM,4
18281,2000,6,7,26,PM,5



 3010051126


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
24609,2010,5,11,26,AM,1
24610,2010,5,11,26,AM,2
24611,2010,5,11,26,AM,3
24612,2010,5,11,26,AM,4
24613,2010,5,11,26,AM,5



 2998050105


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
15403,1998,5,1,5,PM,1
15404,1998,5,1,5,PM,1
15405,1998,5,1,5,PM,2
15406,1998,5,1,5,PM,2
15407,1998,5,1,5,PM,3



 4010051126


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
24638,2010,5,11,26,PM,1
24639,2010,5,11,26,PM,2
24640,2010,5,11,26,PM,3
24641,2010,5,11,26,PM,4
24642,2010,5,11,26,PM,5



 3014053047


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
29465,2014,5,30,47,AM,1
29466,2014,5,30,47,AM,2
29467,2014,5,30,47,AM,3
29468,2014,5,30,47,AM,4
29469,2014,5,30,47,AM,5



 4014053047


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
29495,2014,5,30,47 or 62,PM,1
29496,2014,5,30,47 or 62,PM,2
29497,2014,5,30,47 or 62,PM,3
29498,2014,5,30,47 or 62,PM,4
29499,2014,5,30,47 or 62,PM,5



 3010052925


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
25292,2010,5,29,25,PM,1
25293,2010,5,29,25,PM,2
25294,2010,5,29,25,PM,3
25295,2010,5,29,25,PM,4
25296,2010,5,29,25,PM,5



 2993052737


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
11030,1993,5,27,37,AM,1
11031,1993,5,27,37,AM,2
11032,1993,5,27,37,AM,3
11033,1993,5,27,37,AM,4
11034,1993,5,27,37,AM,5



 3009052226


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
23846,2009,5,22,26,AM,1
23847,2009,5,22,26,AM,2
23848,2009,5,22,26,AM,3
23849,2009,5,22,26,AM,4
23850,2009,5,22,26,AM,5



 4009052226


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
23877,2009,5,22,26,PM,1
23878,2009,5,22,26,PM,2
23879,2009,5,22,26,PM,3
23880,2009,5,22,26,PM,4
23881,2009,5,22,26,PM,5



 2993060933


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
11487,1993,6,9,33,AM,1
11488,1993,6,9,33,AM,2
11489,1993,6,9,33,AM,3
11490,1993,6,9,33,AM,4
11491,1993,6,9,33,AM,5



 3989051335


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
5455,1989,5,13,35,PM,1
5456,1989,5,13,35,PM,2
5457,1989,5,13,35,PM,3
5458,1989,5,13,35,PM,4
5459,1989,5,13,35,PM,5



 2989051335


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
5409,1989,5,13,35,AM,1
5410,1989,5,13,35,AM,2
5411,1989,5,13,35,AM,3
5412,1989,5,13,35,AM,4
5413,1989,5,13,35,AM,5



 3009052026


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
23753,2009,5,20,26,AM,1
23754,2009,5,20,26,AM,2
23755,2009,5,20,26,AM,3
23756,2009,5,20,26,AM,4
23757,2009,5,20,26,AM,5



 3001051605


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
18501,2001,5,16,526,AM,1
18502,2001,5,16,526,AM,2
18503,2001,5,16,526,AM,3
18504,2001,5,16,526,AM,4
18505,2001,5,16,526,AM,5



 4001051605


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
18548,2001,5,16,5,PM,1
18549,2001,5,16,5,PM,2
18550,2001,5,16,5,PM,3
18551,2001,5,16,5,PM,4
18552,2001,5,16,5,PM,5



 3000060926


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
18322,2000,6,9,26,AM,1
18323,2000,6,9,26,AM,2
18324,2000,6,9,26,AM,3
18325,2000,6,9,26,AM,4
18326,2000,6,9,26,AM,5



 3010051926


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
24792,2010,5,19,26,AM,1
24793,2010,5,19,26,AM,2
24794,2010,5,19,26,AM,3
24795,2010,5,19,26,AM,4
24796,2010,5,19,26,AM,5



 4010051926


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
24829,2010,5,19,26,PM,1
24830,2010,5,19,26,PM,2
24831,2010,5,19,26,PM,3
24832,2010,5,19,26,PM,4
24833,2010,5,19,26,PM,5



 3010061141


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
25640,2010,6,11,41,AM,1
25641,2010,6,11,41,AM,2
25642,2010,6,11,41,AM,3
25643,2010,6,11,41,AM,4
25644,2010,6,11,41,AM,5



 3010050541


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
24513,2010,5,5,41,PM,1
24514,2010,5,5,41,PM,2
24515,2010,5,5,41,PM,3
24516,2010,5,5,41,PM,4
24517,2010,5,5,41,PM,5



 3010052726


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
25140,2010,5,27,26,AM,1
25141,2010,5,27,26,AM,2
25142,2010,5,27,26,AM,3
25143,2010,5,27,26,AM,4
25144,2010,5,27,26,AM,5



 4010052726


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
25169,2010,5,27,26,PM,1
25170,2010,5,27,26,PM,2
25171,2010,5,27,26,PM,3
25172,2010,5,27,26,PM,4
25173,2010,5,27,26,PM,5



 4009052026


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
23782,2009,5,20,26,PM,1
23783,2009,5,20,26,PM,2
23784,2009,5,20,26,PM,3
23785,2009,5,20,26,PM,4
23786,2009,5,20,26,PM,5



 4000060926


Unnamed: 0,YEAR,MM,DD,SITE,PERIOD,FISH_NO
18356,2000,6,9,26,PM,1
18357,2000,6,9,26,PM,2
18358,2000,6,9,26,PM,3
18359,2000,6,9,26,PM,4
18360,2000,6,9,26,PM,5


In [19]:
# check ambiguous AM PM for LF and FD and look for extra matches (NOTE, none match samples/logbook)

def _flagged_ids(id_col):
    """Return every non-null id stored under the `id_col` columns of df_AMBIGUOUS."""
    return [x for x in df_AMBIGUOUS[[id_col]].values.ravel().tolist() if not pd.isna(x)]


def _id_period_pairs(frame, period_col, ids):
    """Return the distinct (id, period) pairs of `frame` for the given ids.

    Rows with a missing period are dropped, because groupby skips missing keys.
    """
    return (
        frame.loc[frame.id.isin(ids)]
        .groupby(['id', period_col])
        .count()
        .reset_index()[['id', period_col]]
    )


# pair up FD and LF periods that share the same (still offset) id
period_check = pd.merge(
    _id_period_pairs(df_FD, 'PERIOD', _flagged_ids('id_FD')),
    _id_period_pairs(df_LF, 'period', _flagged_ids('id_LF')),
    on='id',
    how='outer'
)

# Step every id down by 1000000000 until none exceeds 2024000000. All ids here
# start above that cutoff, so the first pass touches every row; looping rather
# than subtracting a fixed two times also normalises ids with larger offsets.
while period_check['id'].max() > 2024000000:
    period_check.loc[period_check['id'] > 2024000000, 'id'] -= 1000000000

period_check = period_check.sort_values(['id', 'PERIOD']).reset_index(drop=True)

In [20]:
# Base ids that appear on more than one fully populated row of period_check.
complete_rows = period_check.dropna()
repeated = complete_rows.duplicated('id', keep=False)
id_period_ambiguity = list(complete_rows.loc[repeated, 'id'].unique())

# Re-apply the +1000000000 and +2000000000 offsets to get back the stored ids
# (all +1e9 variants first, then all +2e9 variants).
check_ids = [
    base_id + offset
    for offset in (1000000000, 2000000000)
    for base_id in id_period_ambiguity
]

In [21]:
# First length-frequency row for each (id, period) among the ids to check,
# ordered by DATETIME.
(
    df_LF.loc[df_LF["id"].isin(check_ids)]
    .groupby(['id', 'period'])
    .first()
    .sort_values(by='DATETIME')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,yy,mm,dd,Time,river,week,site,loc,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE
id,period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3000060726,AM,2000,6,7,,SW MARGAREE,6,26,LOWER,158.0,71.7,215,1,215,2000-06-07,26,,,21.5,,,,
4000060726,PM,2000,6,7,,SW MARGAREE,6,26,LOWER,158.0,71.7,215,3,215,2000-06-07,26,,,21.5,,,,
3000060926,AM,2000,6,9,,SW MARGAREE,6,26,LOWER,146.0,66.2,215,1,215,2000-06-09,26,,,21.5,,,,
4000060926,PM,2000,6,9,,SW MARGAREE,6,26,LOWER,141.0,64.0,215,3,215,2000-06-09,26,,,21.5,,,,
3001051605,AM,2001,5,16,,SW MARGAREE,3,526,LOWER,140.0,63.5,220,1,220,2001-05-16,5,26.0,,22.0,True,,,
4001051605,PM,2001,5,16,,SW MARGAREE,3,5,LOWER,154.0,69.9,225,2,225,2001-05-16,5,,,22.5,,,,
3009052026,AM,2009,5,20,1255.0,SW MARGAREE,4,26,LOWER,139.0,63.0,220,1,220,2009-05-20,26,,,22.0,,,,
4009052026,PM,2009,5,20,1405.0,SW MARGAREE,4,26,LOWER,96.0,43.5,225,1,225,2009-05-20,26,,,22.5,,,,
3009052226,AM,2009,5,22,1310.0,SW MARGAREE,4,26,LOWER,128.0,58.1,225,2,225,2009-05-22,26,,,22.5,,,,
4009052226,PM,2009,5,22,1455.0,SW MARGAREE,4,26,LOWER,130.0,59.0,225,3,225,2009-05-22,26,,,22.5,,,,


In [22]:
# first non-null value of every FD column per (id, PERIOD) among check_ids, in date order
(
    df_FD.loc[df_FD.id.isin(check_ids)]
    .groupby(['id', 'PERIOD'])
    .first()
    .sort_values('DATETIME')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,YEAR,MM,DD,WEEK,SITE,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,fish_length,remarks,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE
id,PERIOD,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1
3000060726,AM,2000,6,7,6,26,Frozen,1,,250.0,258.0,220.0,B,M,4,32.1,,3,9,,,,,,,,,,,2000-06-07,3,9,,,,,26,26,,,250,SITE_notes: 26; FSP_notes_1: 9,,,,,,,,,,True,,,,
4000060726,PM,2000,6,7,6,26,Fresh,1,252.0,,252.0,226.0,B,F,4,28.8,,3,9,,,,,,,,,,,2000-06-07,3,9,,,,,26,26,,,252,SITE_notes: 26; FSP_notes_1: 9,,,,,,,,,,True,,,,
3000060926,AM,2000,6,9,6,26,Fresh,1,239.0,,239.0,168.0,B,M,4,32.7,,3,0,,,,,,,,,,,2000-06-09,3,0,,,,,26,26,,,239,SITE_notes: 26; FSP_notes_1: 0,,,,,,,,,,True,,,,
4000060926,PM,2000,6,9,6,26,Frozen,1,,266.0,274.0,284.5,A,F,4,39.1,,6,4,,,,,,,,,,,2000-06-09,6,4,,,,,26,26,,,266,SITE_notes: 26; AGE_notes_1: 6; FSP_notes_1: 4,,,,,,,,,,True,,,,
3001051605,AM,2001,5,16,3,526,Frozen,1,,271.0,279.0,288.0,A,M,4,48.3,,6,3,,,,,,,,,,,2001-05-16,6,3,,,,,526,5,26.0,,271,"SITE_notes: 5,26; AGE_notes_1: 6; FSP_notes_1: 3",True,,,,,,,,,True,,,,
4001051605,PM,2001,5,16,3,5,Frozen,1,,276.0,285.0,312.6,A,F,4,53.8,,6,4,,,,,,,,,,,2001-05-16,6,4,,,,,5,5,,,276,SITE_notes: 5; AGE_notes_1: 6; FSP_notes_1: 4,,,,,,,,,,True,,,,
3009052026,AM,2009,5,20,4,26,Fresh,1,245.0,269.0,245.0,202.5,A,M,4,30.2,,3,3,,,,,,,,,,,2009-05-20,3,3,,,,,26,26,,,245,SITE_notes: 26,,,,,,,,,,True,,,,
4009052026,PM,2009,5,20,4,26,Frozen,1,,243.0,251.0,200.8,A,M,4,44.4,,4,3,,,,,,,,,,,2009-05-20,4,3,,,,,26,26,,,243,SITE_notes: 26; AGE_notes_1: 4; FSP_notes_1: 3,,,,,,,,,,True,,,,
3009052226,AM,2009,5,22,4,26,Fresh,1,232.0,,232.0,156.0,A,M,4,38.8,,3,3,,,,,,,,,,,2009-05-22,3,3,,,,,26,26,,,232,SITE_notes: 26; AGE_notes_1: 3; FSP_notes_1: 3,,,,,,,,,,True,,,,
4009052226,PM,2009,5,22,4,26,Frozen,1,,226.0,234.0,160.2,A,M,4,33.0,,3,3,,,,,,,,,,,2009-05-22,3,3,,,,,26,26,,,226,SITE_notes: 26; AGE_notes_1: 3; FSP_notes_1: 3,,,,,,,,,,True,,,,


4 ambiguous samples may be linked:

    3010051926	AM
    3010051926	AM
    
    4010051926	PM
    4010051926	PM
    
    3010052726	AM
    3010052726	AM
    
    4010052726	PM
    4010052726	PM
    
Note: these are matched together correctly.

* Therefore, 

        if id.isin([3010051926, 4010051926, 3010052726, 4010052726]):
            don't add anything to months to disambiguate 
            # these id will match up correctly and link to the same ghost sample

* Other ids from this set do not match unambiguously using period. There are some inconsistencies.
* In fact, period may not be entirely useful because of the inconsistencies within ambiguous sets of fish details.

# More Sorting Issue Calcs and Checks

In [23]:
# Loop through the duplicated ids and inspect each one by eye - make sure they
# all look doubled (except 2018061364).
# The only one that looks like a typo is 2018061364
# (fixed at "CONFIRM: fix typo at 10851" in notebook 10.1).

# from notebook 10.1
duplicated_ids_LF = [
    1998050105, 2000060726, 2000060926, 2000061538, 2001051605, 2003052326,
    2009052026, 2009052226, 2010051126, 2010051926, 2010052726, 2018061364,
]

# work on a copy so the ids in df_LF are left untouched
temp = df_LF.copy()
# applied twice on purpose: each pass lowers ids above 2024000000 by 1e9,
# so an id that sits 2e9 above its base value needs both passes
temp.loc[temp.id > 2024000000, 'id'] -= 1000000000
temp.loc[temp.id > 2024000000, 'id'] -= 1000000000

LF_inspect_columns = ['id', 'DATETIME', 'site', 'period', 'lgth', 'wt_lbs']

for duplicate_id in duplicated_ids_LF:
    matching_rows = temp[temp.id == duplicate_id]
    first_label = matching_rows.index.values[0]
    # label slice starting at the first matching row and spanning as many
    # index labels as there are matching rows
    last_label = first_label + matching_rows.shape[0] - 1

    print('\n', matching_rows['id'].values[0])
    display(temp.loc[first_label:last_label][LF_inspect_columns])



 1998050105


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
3102,1998050105,1998-05-01,5,PM,235,10.0
3103,1998050105,1998-05-01,5,PM,240,10.0
3104,1998050105,1998-05-01,5,PM,245,10.0
3105,1998050105,1998-05-01,5,PM,250,10.0
3106,1998050105,1998-05-01,5,PM,255,10.0
3107,1998050105,1998-05-01,5,PM,260,10.0
3108,1998050105,1998-05-01,5,PM,265,10.0
3109,1998050105,1998-05-01,5,PM,270,10.0
3110,1998050105,1998-05-01,5,PM,275,10.0
3111,1998050105,1998-05-01,5,PM,280,10.0



 2000060726


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
4156,2000060726,2000-06-07,26,AM,215,158.0
4157,2000060726,2000-06-07,26,AM,220,158.0
4158,2000060726,2000-06-07,26,AM,225,158.0
4159,2000060726,2000-06-07,26,AM,230,158.0
4160,2000060726,2000-06-07,26,AM,235,158.0
4161,2000060726,2000-06-07,26,AM,240,158.0
4162,2000060726,2000-06-07,26,AM,245,158.0
4163,2000060726,2000-06-07,26,AM,250,158.0
4164,2000060726,2000-06-07,26,AM,255,158.0
4165,2000060726,2000-06-07,26,AM,260,158.0



 2000060926


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
4186,2000060926,2000-06-09,26,PM,215,141.0
4187,2000060926,2000-06-09,26,PM,220,141.0
4188,2000060926,2000-06-09,26,PM,225,141.0
4189,2000060926,2000-06-09,26,PM,230,141.0
4190,2000060926,2000-06-09,26,PM,235,141.0
4191,2000060926,2000-06-09,26,PM,240,141.0
4192,2000060926,2000-06-09,26,PM,245,141.0
4193,2000060926,2000-06-09,26,PM,250,141.0
4194,2000060926,2000-06-09,26,PM,255,141.0
4195,2000060926,2000-06-09,26,PM,260,141.0



 2000061538


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
4239,2000061538,2000-06-15,38,PM,205,139.0
4240,2000061538,2000-06-15,38,PM,210,139.0
4241,2000061538,2000-06-15,38,PM,215,139.0
4242,2000061538,2000-06-15,38,PM,220,139.0
4243,2000061538,2000-06-15,38,PM,225,139.0
4244,2000061538,2000-06-15,38,PM,230,139.0
4245,2000061538,2000-06-15,38,PM,235,139.0
4246,2000061538,2000-06-15,38,PM,240,139.0
4247,2000061538,2000-06-15,38,PM,245,139.0
4248,2000061538,2000-06-15,38,PM,250,139.0



 2001051605


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
4266,2001051605,2001-05-16,526,AM,220,140.0
4267,2001051605,2001-05-16,526,AM,225,140.0
4268,2001051605,2001-05-16,526,AM,230,140.0
4269,2001051605,2001-05-16,526,AM,235,140.0
4270,2001051605,2001-05-16,526,AM,240,140.0
4271,2001051605,2001-05-16,526,AM,245,140.0
4272,2001051605,2001-05-16,526,AM,250,140.0
4273,2001051605,2001-05-16,526,AM,255,140.0
4274,2001051605,2001-05-16,526,AM,260,140.0
4275,2001051605,2001-05-16,526,AM,265,140.0



 2003052326


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
5184,2003052326,2003-05-23,26,AM,230,30.0
5185,2003052326,2003-05-23,26,AM,235,30.0
5186,2003052326,2003-05-23,26,AM,240,30.0
5187,2003052326,2003-05-23,26,AM,245,30.0
5188,2003052326,2003-05-23,26,AM,250,30.0
5189,2003052326,2003-05-23,26,AM,255,30.0
5190,2003052326,2003-05-23,26,AM,260,30.0
5191,2003052326,2003-05-23,26,AM,265,30.0
5192,2003052326,2003-05-23,26,AM,270,30.0
5193,2003052326,2003-05-23,26,AM,275,30.0



 2009052026


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
6332,2009052026,2009-05-20,26,AM,220,139.0
6333,2009052026,2009-05-20,26,AM,225,139.0
6334,2009052026,2009-05-20,26,AM,230,139.0
6335,2009052026,2009-05-20,26,AM,235,139.0
6336,2009052026,2009-05-20,26,AM,240,139.0
6337,2009052026,2009-05-20,26,AM,245,139.0
6338,2009052026,2009-05-20,26,AM,250,139.0
6339,2009052026,2009-05-20,26,AM,255,139.0
6340,2009052026,2009-05-20,26,AM,260,139.0
6341,2009052026,2009-05-20,26,AM,265,139.0



 2009052226


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
6368,2009052226,2009-05-22,26,AM,225,128.0
6369,2009052226,2009-05-22,26,AM,230,128.0
6370,2009052226,2009-05-22,26,AM,235,128.0
6371,2009052226,2009-05-22,26,AM,240,128.0
6372,2009052226,2009-05-22,26,AM,245,128.0
6373,2009052226,2009-05-22,26,AM,250,128.0
6374,2009052226,2009-05-22,26,AM,255,128.0
6375,2009052226,2009-05-22,26,AM,260,128.0
6376,2009052226,2009-05-22,26,AM,265,128.0
6377,2009052226,2009-05-22,26,AM,270,128.0



 2010051126


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
6648,2010051126,2010-05-11,26,AM,230,43.0
6649,2010051126,2010-05-11,26,AM,235,43.0
6650,2010051126,2010-05-11,26,AM,240,43.0
6651,2010051126,2010-05-11,26,AM,245,43.0
6652,2010051126,2010-05-11,26,AM,250,43.0
6653,2010051126,2010-05-11,26,AM,255,43.0
6654,2010051126,2010-05-11,26,AM,260,43.0
6655,2010051126,2010-05-11,26,AM,265,43.0
6656,2010051126,2010-05-11,26,AM,270,43.0
6657,2010051126,2010-05-11,26,AM,275,43.0



 2010051926


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
6719,2010051926,2010-05-19,26,AM,225,129.0
6720,2010051926,2010-05-19,26,AM,230,129.0
6721,2010051926,2010-05-19,26,AM,235,129.0
6722,2010051926,2010-05-19,26,AM,240,129.0
6723,2010051926,2010-05-19,26,AM,245,129.0
6724,2010051926,2010-05-19,26,AM,250,129.0
6725,2010051926,2010-05-19,26,AM,255,129.0
6726,2010051926,2010-05-19,26,AM,260,129.0
6727,2010051926,2010-05-19,26,AM,265,129.0
6728,2010051926,2010-05-19,26,AM,270,129.0



 2010052726


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
6846,2010052726,2010-05-27,26,AM,225,117.0
6847,2010052726,2010-05-27,26,AM,230,117.0
6848,2010052726,2010-05-27,26,AM,235,117.0
6849,2010052726,2010-05-27,26,AM,240,117.0
6850,2010052726,2010-05-27,26,AM,245,117.0
6851,2010052726,2010-05-27,26,AM,250,117.0
6852,2010052726,2010-05-27,26,AM,255,117.0
6853,2010052726,2010-05-27,26,AM,260,117.0
6854,2010052726,2010-05-27,26,AM,265,117.0
6855,2010052726,2010-05-27,26,AM,270,117.0



 2018061364


Unnamed: 0,id,DATETIME,site,period,lgth,wt_lbs
10841,2018061364,2018-06-13,64,PM,215,92.0
10842,2018061364,2018-06-13,64,PM,220,92.0
10843,2018061364,2018-06-13,64,PM,225,92.0
10844,2018061364,2018-06-13,64,PM,230,92.0
10845,2018061364,2018-06-13,64,PM,235,92.0
10846,2018061364,2018-06-13,64,PM,240,92.0
10847,2018061364,2018-06-13,64,PM,245,92.0
10848,2018061364,2018-06-13,64,PM,250,92.0
10849,2018061364,2018-06-13,64,PM,255,92.0
10850,2018061364,2018-06-13,64,PM,260,92.0


In [24]:
# it looks like 4018061364 is our only potential error (fixed earlier, now 2018061364)
# non-null 'yy' count per id, smallest groups first
(
    df_LF.groupby('id')
    .count()
    .sort_values('yy')['yy']
    .head()
)

id
2007060235    1
1991051212    3
1994050848    4
2005050501    6
1991051312    6
Name: yy, dtype: int64

In [25]:
# Loop through the smallest id groups and look at each one with its neighbouring rows.
# These look fine, unless there were multiple combined typos.
smallgroups = list(df_LF.groupby('id').count().sort_values('yy')['yy'].head().index)

# window of index labels shown around the first row of each group
rows_before, rows_after = 2, 8
LF_context_columns = ['DATETIME', 'site', 'period', 'lgth', 'wt_lbs']

for group in smallgroups:
    group_rows = df_LF[df_LF.id == group]
    first_label = group_rows.index.values[0]

    print('\n', group_rows['id'].values[0])
    display(df_LF.loc[first_label - rows_before:first_label + rows_after][LF_context_columns])



 2007060235


Unnamed: 0,DATETIME,site,period,lgth,wt_lbs
5960,2007-06-01,47,PM,285,133.0
5961,2007-06-01,47,PM,295,133.0
5962,2007-06-02,35,PM,225,122.0
5963,2007-06-02,2,PM,230,122.0
5964,2007-06-02,2,PM,235,122.0
5965,2007-06-02,2,PM,240,122.0
5966,2007-06-02,2,PM,245,122.0
5967,2007-06-02,2,PM,250,122.0
5968,2007-06-02,2,PM,255,122.0
5969,2007-06-02,2,PM,260,122.0



 1991051212


Unnamed: 0,DATETIME,site,period,lgth,wt_lbs
946,1990-06-08,64,AM,285,
947,1990-06-08,64,AM,293,
948,1991-05-12,12,AM,265,1.0
949,1991-05-12,12,AM,270,1.0
950,1991-05-12,12,AM,275,1.0
951,1991-05-13,12,AM,265,11.0
952,1991-05-13,12,AM,270,11.0
953,1991-05-13,12,AM,275,11.0
954,1991-05-13,12,AM,280,11.0
955,1991-05-13,12,AM,285,11.0



 1994050848


Unnamed: 0,DATETIME,site,period,lgth,wt_lbs
10341,1994-06-01,41,AM,280,137.0
10342,1994-06-01,41,AM,300,137.0
10343,1994-05-08,48,AM,250,3.0
10344,1994-05-08,48,AM,255,3.0
10345,1994-05-08,48,AM,260,3.0
10346,1994-05-08,48,AM,265,3.0
10347,1994-06-02,49,PM,235,118.0
10348,1994-06-02,49,PM,240,118.0
10349,1994-06-02,49,PM,245,118.0
10350,1994-06-02,49,PM,250,118.0



 2005050501


Unnamed: 0,DATETIME,site,period,lgth,wt_lbs
5570,2004-06-16,5,PM,260,41.0
5571,2004-06-16,5,PM,265,41.0
5572,2005-05-05,1,PM,240,9.0
5573,2005-05-05,1,PM,250,9.0
5574,2005-05-05,1,PM,255,9.0
5575,2005-05-05,1,PM,260,9.0
5576,2005-05-05,1,PM,265,9.0
5577,2005-05-05,1,PM,270,9.0
5578,2005-05-07,26,PM,240,12.0
5579,2005-05-07,26,PM,245,12.0



 1991051312


Unnamed: 0,DATETIME,site,period,lgth,wt_lbs
949,1991-05-12,12,AM,270,1.0
950,1991-05-12,12,AM,275,1.0
951,1991-05-13,12,AM,265,11.0
952,1991-05-13,12,AM,270,11.0
953,1991-05-13,12,AM,275,11.0
954,1991-05-13,12,AM,280,11.0
955,1991-05-13,12,AM,285,11.0
956,1991-05-13,12,AM,290,11.0
957,1991-05-16,12,PM,255,11.0
958,1991-05-16,12,PM,260,11.0


# Trap Supervisors Linked to Sites

In [26]:
# looks correct

# one row per (NAME, SITE1) pair present in the sample data, ordered by site
RELEVANT_SUPERVISORS = (
    df_SD.groupby(['NAME', 'SITE1'])
    .count()
    .reset_index()[['SITE1', 'NAME']]
    .sort_values('SITE1')
    .reset_index(drop=True)
    .rename({'SITE1': 'site'}, axis=1)
)

# list the supervisor names recorded for each site
for site in RELEVANT_SUPERVISORS.site.unique():
    names_at_site = RELEVANT_SUPERVISORS.query('site == @site').NAME
    print('\nSite', site, '--', ', '.join(list(names_at_site)))


Site 1 -- Marilyn Gillis, Charles McDaniel

Site 2 -- Richard Gillis

Site 4 -- Charles McDaniel

Site 5 -- Anthony Cameron

Site 6 -- Irene MacIsaac

Site 7 -- Eleanor MacLellan

Site 8 -- Colin Gillis

Site 9 -- John A Chisholm

Site 11 -- Darlene Cameron, Josh Fraser, Joshua Fraser

Site 12 -- Martin E Cameron

Site 15 -- Pat Chisholm, John A Chisholm

Site 17 -- Pat Chisholm, John A Chisholm

Site 18 -- Eleanor McDaniel, Brian Doyle

Site 19 -- John A Chisholm

Site 20 -- John Coady

Site 21 -- Jim Coady

Site 23 -- John A Coady

Site 24 -- James A Hirtle

Site 25 -- Robert Peters

Site 26 -- Gerard V Chiasson, Pierre Chiasson

Site 27 -- Leo Chiasson

Site 28 -- Gerard MacFarlane

Site 29 -- Donald MacLeod

Site 30 -- Robert MacLeod, Donald MacLeod

Site 31 -- Robert MacLeod

Site 32 -- Robert MacLeod, Catherine MacLeod

Site 33 -- Daniel Stewart, Michael J Stewart

Site 34 -- Joan Ingram, Fred Ingram, Ronald J Stewart

Site 35 -- John MacLellan, Jack MacLellan

Site 37 -- Floren

In [27]:
# looks correct

# reverse view: the sites recorded for each supervisor name, names in sorted order
for supervisor in RELEVANT_SUPERVISORS.sort_values('NAME').NAME.unique():
    supervised_sites = RELEVANT_SUPERVISORS.query('NAME == @supervisor').site.unique()
    print(supervisor, '--', ', '.join(map(str, supervised_sites)))

Alexander Gillis -- 56
Alexander MacDonald -- 39
Allan B Gillis -- 46
Anthony Cameron -- 5
Brian Doyle -- 18
Brian MacFarlane -- 66
Bruce MacLellan -- 51
Bruce McLellan -- 51
Catherine MacFarlane -- 52
Catherine MacLeod -- 32
Charles McDaniel -- 1, 4
Chris MacLean -- 68
Colin Gillis -- 8, 90
Daniel Stewart -- 33
Darlene Cameron -- 11
David MacKinnon -- 39
Donald D Gillis -- 43
Donald J Gillis -- 37
Donald M Campbell -- 45
Donald MacEachern -- 63
Donald MacLeod -- 29, 30
Donelda M Gillis -- 47
Eleanor MacLellan -- 7
Eleanor McDaniel -- 18, 91
Elizabeth MacKinnon -- 62
Elizabeth/Vincent MacKinnon -- 62
Finley MacDonald -- 62
Finley Stewart -- 44
Florence Gillis -- 37
Fred Ingram -- 34
Gerard MacFarlane -- 28
Gerard V Chiasson -- 26
Harold MacFarlane -- 67
Hugh J Gillis -- 38
Hughie MacDonnell -- 61
Irene MacIsaac -- 6
Jack MacLellan -- 35
James A Hirtle -- 24
James MacFarlane -- 55, 65
Jim Coady -- 21
Jim MacFarlane -- 62
Joan Ingram -- 34
John A Chisholm -- 9, 15, 17, 19
John A Coady --

# check more stuff

In [28]:
# Spot-check one sample id in df_SD; the query expression reads `oldid` via the @ prefix.
oldid = 2019050749
df_SD.query('id == @oldid', engine='python')

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES
14862,2,SWMARG,Mary E Gillis,,81,49,1,2019,5,7,2,150.0,68.0,7,upper,Gillis,,,,,2019-05-07,49,,,2019050749,,,,,,,,


In [29]:
# Read the provided logbook master CSV with explicit dtypes for the columns listed
# below; every other column is left to pandas' type inference.
dtypes = {
    'code': 'object',
    'no_nets': 'object',
    'YEAR': 'Int32',
    'MM': 'Int32',
    'DD': 'Int32',
    'Week': 'Int32',
#     'hours_fished': 'Int32'  # some entries say 'maximum '
}
# Forward slashes keep this relative path valid on Windows and POSIX alike
# (the backslash form only resolves on Windows).
df_SD_orig = pd.read_csv('./provided data/Margaree Gaspereau logbooks_Master.csv', dtype=dtypes)

In [30]:
# Look up the 2019-05-07 record for SITE_NO "49" in df_SD_orig (SITE_NO is compared as a string).
df_SD_orig.query('YEAR == 2019 and MM == 5 and DD == 7 and SITE_NO == "49" ', engine='python')

Unnamed: 0,DIST,RIVER,NAME,code,GEAR,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other
14890,2,SWMARG,Mary E Gillis,,81,49,1,2019,5,7,2,150.0,68.0,7,upper,Gillis,,,,


# Comparing Comments with Provided Data

In [31]:
# Read the "revised with comments" CSVs using the dtypes of the current frames so the
# two versions can be compared column by column. DATETIME is excluded from the dtype
# maps and converted with pd.to_datetime after loading.
dtypes_FD = dict(df_FD.dtypes.drop('DATETIME'))
dtypes_LF = dict(df_LF.dtypes.drop('DATETIME'))
dtypes_SD = dict(df_SD.dtypes.drop('DATETIME'))

# Forward slashes keep these relative paths valid on Windows and POSIX alike
# (the backslash form only resolves on Windows).
df2FD = pd.read_csv('./provided data/revised with comments/gaspereau_fish_details_AD.csv', dtype=dtypes_FD)
df2LF = pd.read_csv('./provided data/revised with comments/gaspereau_length_frequencies_AD.csv', dtype=dtypes_LF)
df2SD = pd.read_csv('./provided data/revised with comments/gaspereau_sample_data_AD.csv', dtype=dtypes_SD)

df2FD['DATETIME'] = pd.to_datetime(df2FD['DATETIME'])
df2LF['DATETIME'] = pd.to_datetime(df2LF['DATETIME'])
df2SD['DATETIME'] = pd.to_datetime(df2SD['DATETIME'])

In [32]:
# ids are different because of the recent disambiguation for ghost samples:
# drop id columns when checking for changes.
# Here: do the ids still agree once their first character is ignored? (FD, LF, SD)
tuple(
    (current.id.astype(str).str[1:] == provided.id.astype(str).str[1:]).all()
    for current, provided in [(df_FD, df2FD), (df_LF, df2LF), (df_SD, df2SD)]
)

(False, True, False)

### Fish Details

In [33]:
# NOTE(review): the index name is set to '' here -- presumably for how the
# difference tables display; confirm it is still needed
df_FD.index = df_FD.index.rename('')

# columns present only in the current frame (new columns); dropped with id below
drop_FD = [col for col in df_FD.columns if col not in df2FD.columns]

# cast everything to a string, with the null spellings blanked, to compare differences
FD_current_text = (
    df_FD.drop(drop_FD + ['id'], axis=1)
    .astype(str)
    .replace({'TRUE':'True','NaN':'','<NA>':'','nan':''})
)
FD_provided_text = (
    df2FD.drop('id', axis=1)
    .astype(str)
    .replace({'TRUE':'True','NaN':'','<NA>':'','nan':''})
)
FD_diff = FD_current_text != FD_provided_text

# names of the columns holding at least one differing cell
FD_diff_columns = [field for field, differs in FD_diff.any().items() if differs]

In [34]:
# rows with at least one differing cell
FD_changed_rows = FD_diff.any(axis=1)

# left half: values from df2FD; right half: values from df_FD.
# Indexing with the boolean frame FD_diff blanks every cell that did not differ.
FD_differences = pd.concat(
    [
        df2FD[FD_diff][FD_changed_rows][FD_diff_columns],
        df_FD.drop(drop_FD, axis=1)[FD_diff][FD_changed_rows][FD_diff_columns],
    ],
    axis=1,
)

In [35]:
# show the differences as text with the null spellings blanked out
(
    FD_differences
    .astype(str)
    .replace({'TRUE':'True','<NA>':'','nan':''})
)

Unnamed: 0,MM,DD,WEEK,SITE,PERIOD,CONDITION,FISH_NO,FL_WET,FL_FROZEN,FL_STD,WEIGHT,SPECIES,SEX,MATURITY,GONAD_WEIGHT,Ager_1,AGE_1,FSP_1,Comments_1,Ager_2,AGE_2,FSP_2,Comments_2,Ager_3,AGE_3,FSP_3,Comments_3,Envelop.Comments,DATETIME,AGE_notes_1,FSP_notes_1,AGE_notes_2,FSP_notes_2,AGE_notes_3,FSP_notes_3,SITE_notes,SITE1,SITE2,SITE3,remarks,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_SEX,FLAG_MATURITY,FLAG_FSP_1,FLAG_FL_STD,FLAG_FL_WET_FROZEN,FLAG_WEIGHT_OUTLIER,FLAG_GONAD_OUTLIER,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_LEN_WT_RATIO_OUTLIER,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE,MM.1,DD.1,WEEK.1,SITE.1,PERIOD.1,CONDITION.1,FISH_NO.1,FL_WET.1,FL_FROZEN.1,FL_STD.1,WEIGHT.1,SPECIES.1,SEX.1,MATURITY.1,GONAD_WEIGHT.1,Ager_1.1,AGE_1.1,FSP_1.1,Comments_1.1,Ager_2.1,AGE_2.1,FSP_2.1,Comments_2.1,Ager_3.1,AGE_3.1,FSP_3.1,Comments_3.1,Envelop.Comments.1,DATETIME.1,AGE_notes_1.1,FSP_notes_1.1,AGE_notes_2.1,FSP_notes_2.1,AGE_notes_3.1,FSP_notes_3.1,SITE_notes.1,SITE1.1,SITE2.1,SITE3.1,remarks.1,FLAG_SITE.1,FLAG_AM_PM_PERIOD.1,FLAG_SEX.1,FLAG_MATURITY.1,FLAG_FSP_1.1,FLAG_FL_STD.1,FLAG_FL_WET_FROZEN.1,FLAG_WEIGHT_OUTLIER.1,FLAG_GONAD_OUTLIER.1,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES.1,FLAG_LEN_WT_RATIO_OUTLIER.1,FLAG_AM_PM_PERIOD_DISCREPANCIES.1,FLAG_NO_MATCHING_SAMPLE.1
0,,12,,,,,,288,,288.0,363.0,,,,,,5,,,,,,,,,,,,1983-05-12,5,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 5; FSP_notes_1: 3,,,,,,,,,,,,,,,9,,,,,,285,,285.0,336.0,,,,,,4,,,,,,,,,,,,1983-05-09,4,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 3,,,,,,,,,,,,,
1,,12,,,,,10,251,,251.0,227.0,,,,,,3,,,,,,,,,,,,1983-05-12,3,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 3; FSP_notes_1: 3,,,,,,,,,,,,,,,9,,,,,2,288,,288.0,334.0,,,,,,5,,,,,,,,,,,,1983-05-09,5,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 5; FSP_notes_1: 3,,,,,,,,,,,,,
2,,12,,,,,11,247,,247.0,214.0,,M,,,,3,3,,,,,,,,,,,1983-05-12,3,3,,,,,,,,,SITE_notes: 12; AGE_notes_1: 3; FSP_notes_1: 3,,,,,,,,,,,,,,,9,,,,,3,280,,280.0,364.0,,F,,,,4,4,,,,,,,,,,,1983-05-09,4,4,,,,,,,,,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 4,,,,,,,,,,,,,
3,,12,,,,,12,287,,287.0,374.0,,,,,,5,4,,,,,,,,,,,1983-05-12,5,4,,,,,,,,,SITE_notes: 12; AGE_notes_1: 5; FSP_notes_1: 4,,,,,,,,,,,,,,,9,,,,,4,286,,286.0,353.0,,,,,,4,3,,,,,,,,,,,1983-05-09,4,3,,,,,,,,,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 3,,,,,,,,,,,,,
4,,12,,,,,13,264,,264.0,243.0,,M,,,,4,,,,,,,,,,,,1983-05-12,4,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 4; FSP_notes_1: 4,,,,,,,,,,,,,,,9,,,,,5,297,,297.0,370.0,,F,,,,6,,,,,,,,,,,,1983-05-09,6,,,,,,,,,,SITE_notes: 12; AGE_notes_1: 6; FSP_notes_1: 4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36786,,,,25,PM,,32,220,,,135.7,,M,,,,,,,,4,4,small for age,,,,,,NaT,,,4,4,,,25,25,,,SITE_notes: 25; Ager_2: LF; Comments_2: small for age; AGE_notes_2: 4; FSP_notes_2: 4,,,,,,,,,,,,,,,,,Eric Mac,AM,,33,206,,,103.2,,F,,13.1,,,,,,3,3,,,,,,,NaT,,,3,3,,,Eric Mac,92,,,SITE_notes: Eric Mac; Ager_2: LF; AGE_notes_2: 3; FSP_notes_2: 3,True,,,,,,,,,,,,
36787,,,,25,PM,,33,242,,,188.1,A,M,,,,,,,,3,3,,,,,,,NaT,,,3,3,,,25,25,,,SITE_notes: 25; Ager_2: LF; AGE_notes_2: 3; FSP_notes_2: 3,,,,,,,,,,,,,,,,,Eric Mac,AM,,34,234,,,164.9,B,F,,23.3,,,,,,4,4,,,,,,,NaT,,,4,4,,,Eric Mac,92,,,SITE_notes: Eric Mac; Ager_2: LF; AGE_notes_2: 4; FSP_notes_2: 4,True,,,,,,,,,,,,
36788,,,,25,PM,,34,251,,,184.6,A,,,25.1,,,,,,6,,one scale is cross-contaminated from another fish,,,,,,NaT,,,6,,,,25,25,,,SITE_notes: 25; Ager_2: LF; Comments_2: one scale is cross-contaminated from another fish; AGE_notes_2: 6; FSP_notes_2: 3,,,,,,,,,,,,,,,,,Eric Mac,AM,,35,241,,,164.7,B,,,18.2,,,,,,4,,,,,,,,NaT,,,4,,,,Eric Mac,92,,,SITE_notes: Eric Mac; Ager_2: LF; AGE_notes_2: 4; FSP_notes_2: 3,True,,,,,,,,,,,,
36789,,,,25,PM,,35,228,,,124.7,,M,,,,,,,,,,,,,,,,NaT,,,,,,,25,25,,,SITE_notes: 25; Ager_2: LF; AGE_notes_2: 3; FSP_notes_2: 3,,,,,,,,,,,,,,,,,Eric Mac,AM,,36,240,,,201.4,,F,,25.0,,,,,,,,Sootie,,,,,,NaT,,,,,,,Eric Mac,92,,,SITE_notes: Eric Mac; Ager_2: LF; Comments_2: Sootie; AGE_notes_2: 3; FSP_notes_2: 3,True,,,,,,,,,,,,


### Length Frequencies

In [36]:
# cast both versions to strings (null spellings blanked, id dropped) and compare cell by cell
LF_current_text = (
    df_LF.drop('id', axis=1)
    .astype(str)
    .replace({'TRUE':'True','NaN':'','<NA>':'','nan':''})
)
LF_provided_text = (
    df2LF.drop('id', axis=1)
    .astype(str)
    .replace({'TRUE':'True','NaN':'','<NA>':'','nan':''})
)
LF_diff = LF_current_text != LF_provided_text

# names of the columns holding at least one differing cell
LF_diff_columns = [field for field, differs in LF_diff.any().items() if differs]

In [37]:
# rows with at least one differing cell
LF_changed_rows = LF_diff.any(axis=1)

# left half: values from df2LF; right half: values from df_LF.
# Indexing with the boolean frame LF_diff blanks every cell that did not differ.
LF_differences = pd.concat(
    [
        df2LF[LF_diff][LF_changed_rows][LF_diff_columns],
        df_LF[LF_diff][LF_changed_rows][LF_diff_columns],
    ],
    axis=1,
)

In [38]:
# Perfect. Already got this one.
(
    LF_differences
    .astype(str)
    .replace({'TRUE':'True','<NA>':'','nan':''})
)

Unnamed: 0,lgth,Flbin,DATETIME,length_bin_id,lgth.1,Flbin.1,DATETIME.1,length_bin_id.1
0,,,1990-05-07 00:00:00,,,,1990-05-07,
1,,,1990-05-07 00:00:00,,,,1990-05-07,
2,,,1990-05-07 00:00:00,,,,1990-05-07,
3,,,1990-05-07 00:00:00,,,,1990-05-07,
4,,,1990-05-07 00:00:00,,,,1990-05-07,
...,...,...,...,...,...,...,...,...
11435,,,2019-06-28 00:00:00,,,,2019-06-28,
11436,,,2019-06-28 00:00:00,,,,2019-06-28,
11437,,,2019-06-28 00:00:00,,,,2019-06-28,
11438,,,2019-06-28 00:00:00,,,,2019-06-28,


### Samples

In [39]:
# columns that exist in df2SD but not in df_SD
[col for col in df2SD.columns if col not in df_SD.columns]

['period']

In [40]:
# columns that exist in df_SD but not in df2SD
[col for col in df_SD.columns if col not in df2SD.columns]

['AM_PM_PERIOD']

In [41]:
# slightly better merge with newer version (AM_PM_PERIOD)
# => drop these columns

# missing periods become 0 so that two missing values compare as equal
SD_period_current = df_SD['AM_PM_PERIOD'].fillna(0)
SD_period_provided = df2SD['period'].fillna(0)

# rows where the two period columns disagree
pd.concat([SD_period_current, SD_period_provided], axis=1)[SD_period_current != SD_period_provided]

Unnamed: 0,AM_PM_PERIOD,period
3050,AM,0
3063,PM,0
3072,AM,0
3080,AM,0
3102,AM,0
...,...,...
15187,0,AM
15188,0,AM
15198,0,PM
15202,0,AM


In [42]:
# NOTE(review): the index name is set to '' here -- presumably for how the
# difference tables display; confirm it is still needed
df_SD.index = df_SD.index.rename('')

# columns present only in the current frame (not used further in this cell;
# the period columns are dropped by name below)
drop_SD = [col for col in df_SD.columns if col not in df2SD.columns]

# cast everything to a string, with the null spellings blanked, to compare differences
SD_current_text = (
    df_SD.drop(['id', 'AM_PM_PERIOD'], axis=1)
    .astype(str)
    .replace({'TRUE':'True','NaN':'','<NA>':'','nan':''})
)
SD_provided_text = (
    df2SD.drop(['id', 'period'], axis=1)
    .astype(str)
    .replace({'TRUE':'True','NaN':'','<NA>':'','nan':''})
)
SD_diff = SD_current_text != SD_provided_text

# names of the columns holding at least one differing cell
SD_diff_columns = [field for field, differs in SD_diff.any().items() if differs]

In [43]:
# rows with at least one differing cell
SD_changed_rows = SD_diff.any(axis=1)

# left half: values from df2SD; right half: values from df_SD.
# Indexing with the boolean frame SD_diff blanks every cell that did not differ.
SD_differences = pd.concat(
    [
        df2SD[SD_diff][SD_changed_rows][SD_diff_columns],
        df_SD[SD_diff][SD_changed_rows][SD_diff_columns],
    ],
    axis=1,
)

In [44]:
# nothing new here
(
    SD_differences
    .astype(str)
    .replace({'TRUE':'True','<NA>':'','nan':''})
)

Unnamed: 0,NAME,SITE_NO,no_nets,YEAR,MM,DD,Week,catch_lbs,catch_kg,hours_fished,zone,last_name,comments,bycatch_sbass,bycatch_shad,bycatch_other,DATETIME,SITE1,SITE2,remarks,total_fish_preserved,total_fish_measured,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES,NAME.1,SITE_NO.1,no_nets.1,YEAR.1,MM.1,DD.1,Week.1,catch_lbs.1,catch_kg.1,hours_fished.1,zone.1,last_name.1,comments.1,bycatch_sbass.1,bycatch_shad.1,bycatch_other.1,DATETIME.1,SITE1.1,SITE2.1,remarks.1,total_fish_preserved.1,total_fish_measured.1,wt_lbs.1,FLAG_DATETIME.1,FLAG_HOURS_FISHED.1,FLAG_SITE.1,FLAG_AM_PM_PERIOD_DISCREPANCIES.1
0,Darlene Cameron,11,,,5,17,,18038.0,8181.9,14,lower,Cameron,,,,,1983-05-17,11.0,,,,,,,,,,Michael J Stewart,33,,,4,18,,0.0,0.0,10,upper,Stewart,,,,,1983-04-18,33.0,,,,,,,,,
1,Martin E Cameron,12,,,5,16,,18038.0,8181.9,15,lower,Cameron,,,,,1983-05-16,12.0,,,100.0,,,,,,,Michael J Stewart,33,,,4,19,,0.0,0.0,9,upper,Stewart,,,,,1983-04-19,33.0,,,,,,,,,
2,Martin E Cameron,12,,,5,11,,17036.0,7727.4,14,lower,Cameron,,,,,1983-05-11,12.0,,,,,,,,,,Michael J Stewart,33,,,4,20,,0.0,0.0,12,upper,Stewart,,,,,1983-04-20,33.0,,,,,,,,,
3,Martin E Cameron,12,,,5,18,,15032.0,6818.4,15,lower,Cameron,,,,,1983-05-18,12.0,,,,,,,,,,Michael J Stewart,33,,,4,21,,301.0,136.5,14,upper,Stewart,,,,,1983-04-21,33.0,,,,,,,,,
4,Martin E Cameron,12,,,5,17,,13027.0,5908.9,15,lower,Cameron,,,,,1983-05-17,12.0,,,50.0,,,,,,,Michael J Stewart,33,,,4,22,,301.0,136.5,10,upper,Stewart,,,,,1983-04-22,33.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15208,Elizabeth MacKinnon,62,1,2019,6,29,9,50.0,22.7,6,,MacKinnon,,,,,2019-06-29,62.0,,,,,,,,,,Donald J Gillis,37,,1988,4,,,1002.0,454.5,14,,Gillis,,,,,NaT,37.0,,,,,,True,,,
15209,Pierre Chiasson,26,1,2019,6,29,9,2000.0,907.2,7,lower,Chiasson,,,,,2019-06-29,26.0,,,,,,,,,,Donald J Gillis,37,,1988,4,,,1002.0,454.5,14,upper,Gillis,,,,,NaT,37.0,,,,,,True,,,
15210,Bruce MacLellan,51,1,2019,6,29,9,100.0,45.4,10,,MacLellan,,,,,2019-06-29,51.0,,,,,,,,,,Donald J Gillis,37,,1988,4,,,601.0,272.6,14,,Gillis,,,,,NaT,37.0,,,,,,True,,,
15211,Robert Peters,25,1,2019,6,29,9,275.0,124.7,5,lower,Peters,,,,,2019-06-29,25.0,,,,,,,,,,Donald J Gillis,37,,1988,4,,,0.0,0.0,14,upper,Gillis,,,,,NaT,37.0,,,,,,True,,,


In [45]:
# per column: True when at least one cell differs between the two sample-data versions
SD_diff.any()

DIST                               False
RIVER                              False
NAME                                True
code                               False
GEAR                               False
SITE_NO                             True
no_nets                             True
YEAR                                True
MM                                  True
DD                                  True
Week                                True
catch_lbs                           True
catch_kg                            True
hours_fished                        True
zone                                True
last_name                           True
comments                            True
bycatch_sbass                       True
bycatch_shad                        True
bycatch_other                       True
DATETIME                            True
SITE1                               True
SITE2                               True
remarks                             True
total_fish_prese

# COMMENT:

    CONVERT E.MACFARLANE, Eric Mac, Eric MacFarlane, J McFarlane to John Eric MacFarlane.

    JA Coady keep as is

    Multiple sites, keep as is. 

    I see 118 samples from 1989 that are associated with ‘blank’. Can you convert ‘blank’ to Unknown and upload.

In [46]:
# Load the cleaned sample data from a local pickle.
# NOTE(review): presumably written by an earlier notebook -- confirm provenance;
# pickles should only be loaded from trusted sources.
df_SD_cleaned = pd.read_pickle('df_SD_cleaned.pickle')

In [47]:
# distinct Fish Details ids whose SITE is missing
list(df_FD.loc[df_FD.SITE.isna(), 'id'].unique())

[1989060100, 1989060200, 1989060700]

In [48]:
# Yes, these are imported as ghost samples
# (remarks of the cleaned samples whose id belongs to Fish Details rows with no SITE)
FD_ids_missing_site = list(df_FD[df_FD.SITE.isna()].id.unique())
df_SD_cleaned.loc[df_SD_cleaned.id.isin(FD_ids_missing_site), 'remarks']

15243    SITE_notes: nan; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15244    SITE_notes: nan; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
15246    SITE_notes: nan; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies
Name: remarks, dtype: object

### let's make a list of all slightly dubious matches (inexact sites)

In [49]:
# FD ambiguous site names
# A SITE counts as ambiguous when its text form is not purely numeric and it is
# neither '1A' nor '1B'. Missing SITE values are filled with 0 first, so they pass
# the numeric test and are not selected here.
fd_site_as_text = df_FD['SITE'].fillna(0).astype(str)
fd_site_not_numeric = ~fd_site_as_text.str.isnumeric()
fd_site_not_1a_1b = (df_FD['SITE'] != '1A') & (df_FD['SITE'] != '1B')
FD_sites_id = df_FD.loc[fd_site_not_numeric & fd_site_not_1a_1b, 'id'].unique()

# here are our troublesome sites
fd_rows_with_ambiguous_id = df_FD.loc[df_FD['id'].isin(FD_sites_id)]
fd_rows_with_ambiguous_id.groupby('id').first()[['DATETIME', 'SITE', 'SITE1']]

Unnamed: 0_level_0,DATETIME,SITE,SITE1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1993052060,1993-05-20,"60, 52",60
1993052301,1993-05-23,"1, 8",1
1993052437,1993-05-24,373835,37
1993052660,1993-05-26,6052,60
1993052701,1993-05-27,18,1
1993060305,1993-06-03,58,5
1994051012,1994-05-10,1217,12
1994051115,1994-05-11,1517,15
1994051133,1994-05-11,334849,33
1994051411,1994-05-14,1112,11


In [50]:
# LF ambiguous site names
# Same rule as for fish details: non-numeric site text that is neither '1A' nor '1B'
# (missing sites are filled with 0 first and therefore pass the numeric test).
lf_site_as_text = df_LF['site'].fillna(0).astype(str)
lf_site_not_numeric = ~lf_site_as_text.str.isnumeric()
lf_site_not_1a_1b = (df_LF['site'] != '1A') & (df_LF['site'] != '1B')
LF_sites_id = df_LF.loc[lf_site_not_numeric & lf_site_not_1a_1b, 'id'].unique()

# first row per ambiguous id, limited to the date and site columns
lf_rows_with_ambiguous_id = df_LF.loc[df_LF['id'].isin(LF_sites_id)]
lf_rows_with_ambiguous_id.groupby('id').first()[['DATETIME', 'site', 'SITE1']]

# list of the distinct sites that are ambiguous
# df_LF[df_LF.id.isin(LF_sites_id)].groupby('id').first()[['DATETIME', 'site', 'SITE1']].replace({'Jimmy MacFarlane':'Eric McFarlane', 93:92}).groupby(['site', 'SITE1']).count().reset_index().drop('DATETIME', axis=1).T

Unnamed: 0_level_0,DATETIME,site,SITE1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1991060129,1991-06-01,2930,29
1993051117,1993-05-11,1733,17
1993052301,1993-05-23,18,1
1993052435,1993-05-24,353738,35
1993052701,1993-05-27,18,1
1993060305,1993-06-03,58,5
1993060352,1993-06-03,5260,52
1994051012,1994-05-10,1217,12
1994051115,1994-05-11,1517,15
1994051149,1994-05-11,4933,49


In [51]:
# SD
# Sample ids whose SITE_NO is not purely numeric and is neither '1A' nor '1B'
# (missing SITE_NO is filled with 0 first and therefore passes the numeric test).
sd_site_as_text = df_SD['SITE_NO'].fillna(0).astype(str)
sd_site_not_numeric = ~sd_site_as_text.str.isnumeric()
sd_site_not_1a_1b = (df_SD['SITE_NO'] != '1A') & (df_SD['SITE_NO'] != '1B')
SD_sites_id = df_SD.loc[sd_site_not_numeric & sd_site_not_1a_1b, 'id'].unique()

In [52]:
# how many dubious sites are there?
# union of the ambiguous ids found in LF, FD and SD, de-duplicated and sorted ascending
all_dubious_ids = set(LF_sites_id) | set(FD_sites_id) | set(SD_sites_id)
dubious_sites_id = sorted(all_dubious_ids)
len(dubious_sites_id)

103

##### confirm correctly included in ghost samples (yes)

In [53]:
# have these all been made into ghost samples?
# Build the set of ids present in df_SD_cleaned once; the previous form recomputed
# .unique() and scanned the resulting array for every candidate id.
cleaned_sample_ids = set(df_SD_cleaned.id.unique())
why_not_ghost_samples_ids = [x for x in dubious_sites_id if x not in cleaned_sample_ids]
why_not_ghost_samples_ids

[4014053047]

In [54]:
# no ambiguous LF missing from ghost samples
# (the selection below is expected to come back empty)
lf_id_missing_from_cleaned = df_LF['id'].isin(why_not_ghost_samples_ids)
df_LF.loc[lf_id_missing_from_cleaned]

Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME,SITE1,SITE2,SITE3,id,length_bin_id,FLAG_SITE,FLAG_AM_PM_PERIOD,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_NO_MATCHING_SAMPLE


In [55]:
# one FD that may be missing, check disambiguated id (+200000, see the lookup in the next cell)
# show the first non-null value of each flag per id
fd_flag_columns = [
    'FLAG_SITE',
    'FLAG_MULTIPLE_SAMPLE_POSSIBILITIES',
    'FLAG_MISNUMBERED_FISH_DETAILS',
    'FLAG_NO_MATCHING_SAMPLE',
]
fd_id_missing_from_cleaned = df_FD['id'].isin(why_not_ghost_samples_ids)
df_FD.loc[fd_id_missing_from_cleaned, ['id'] + fd_flag_columns].groupby('id').first()

Unnamed: 0_level_0,FLAG_SITE,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_MISNUMBERED_FISH_DETAILS,FLAG_NO_MATCHING_SAMPLE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4014053047,True,True,,


In [56]:
# 4014053047 would be in df_SD_cleaned as 4014253047 to assure matching with LF could not occur accidentally
# therefore, all ambiguous FD and LF are included in ghost sample data correctly
fd_ids_missing_from_cleaned = df_FD.loc[df_FD['id'].isin(why_not_ghost_samples_ids), 'id'].unique()
# shift each id by 200000 to get the id it would carry in df_SD_cleaned
shifted_ids = [fd_id + 200000 for fd_id in fd_ids_missing_from_cleaned]
df_SD_cleaned.loc[df_SD_cleaned['id'].isin(shifted_ids), 'DATETIME':]

Unnamed: 0,DATETIME,SITE1,SITE2,remarks,id,total_fish_preserved,total_fish_measured,AM_PM_PERIOD,wt_lbs,FLAG_DATETIME,FLAG_HOURS_FISHED,FLAG_SITE,FLAG_AM_PM_PERIOD_DISCREPANCIES,FLAG_GHOST_SAMPLE
15384,2014-05-30,47,,"SITE_notes: 47 or 62; Ager_1: JM; Ager_2: LF; Envelop.Comments: parasites; round worms, less than 20.; AGE_notes_1: 4; FSP_notes_1: 4; AGE_notes_2: 4; FSP_notes_2: 4; GHOST SAMPLE: created to match with unmatched Fish Details and/or Length Frequencies",4014253047,30.0,,PM,,,,,,True


##### confirm df_SD properly notes ambiguity / potential false positives

In [57]:
# check df_SD for dubious site name matches (unambiguous, if SITE1 is correct)
    # NOTES
    # the only ambiguous site for samples is 1A,8
    # half of these are ghost samples
    # the other half are unflagged, should potentially be flagged for followup (ambiguous type?)
# inspection line kept commented out; uncomment to list the df_SD rows whose id is in dubious_sites_id
# df_SD[df_SD.id.isin(dubious_sites_id)]

In [58]:
# merge all df together with inconsistent sites
# see if the inconsistencies are dubious or obvious

# samples restricted to the dubious ids
sd_dubious = df_SD.loc[
    df_SD['id'].isin(dubious_sites_id),
    ['DATETIME', 'id', 'SITE_NO', 'FLAG_SITE'],
].rename(columns={'FLAG_SITE': 'FLAG_SITE_SD'})

# fish details: first non-null value of each column per dubious id
fd_dubious_first = (
    df_FD.loc[
        df_FD['id'].isin(dubious_sites_id),
        ['id', 'SITE', 'PERIOD', 'FLAG_SITE', 'FLAG_MULTIPLE_SAMPLE_POSSIBILITIES', 'FLAG_NO_MATCHING_SAMPLE'],
    ]
    .rename(columns={'FLAG_SITE': 'FLAG_SITE_FD', 'FLAG_NO_MATCHING_SAMPLE': 'FLAG_NO_MATCHING_SAMPLE_FD'})
    .groupby('id')
    .first()
    .reset_index()
)

# length frequencies: first non-null value of each column per dubious id
lf_dubious_first = (
    df_LF.loc[
        df_LF['id'].isin(dubious_sites_id),
        ['id', 'site', 'period', 'FLAG_SITE', 'FLAG_NO_MATCHING_SAMPLE'],
    ]
    .rename(columns={'FLAG_SITE': 'FLAG_SITE_LF', 'FLAG_NO_MATCHING_SAMPLE': 'FLAG_NO_MATCHING_SAMPLE_LF'})
    .groupby('id')
    .first()
    .reset_index()
)

# left joins anchored on the samples: only care about 3 way merge that include samples
sd_fd_dubious = sd_dubious.merge(fd_dubious_first, on='id', how='left').sort_values('id')

report_columns = [
    'DATETIME', 'id', 'SITE_NO', 'site', 'SITE', 'period', 'PERIOD',
    'FLAG_SITE_SD', 'FLAG_SITE_FD', 'FLAG_MULTIPLE_SAMPLE_POSSIBILITIES',
    'FLAG_NO_MATCHING_SAMPLE_FD', 'FLAG_SITE_LF', 'FLAG_NO_MATCHING_SAMPLE_LF',
]
dubious_matches_SD_LF_FD = (
    sd_fd_dubious
    .merge(lf_dubious_first, on='id', how='left')
    .sort_values('id')
    .loc[:, report_columns]
)

# dubious_matches_SD_LF_FD

In [59]:
# flag these and add a remark in SD about the differences in sites between tables
# (astype(str) renders a missing site as text, e.g. 'nan', inside the note)
sd_site_text = dubious_matches_SD_LF_FD['SITE_NO'].astype(str)
fd_site_text = dubious_matches_SD_LF_FD['SITE'].astype(str)
lf_site_text = dubious_matches_SD_LF_FD['site'].astype(str)

dubious_matches_SD_LF_FD['SITE_AMBIGUITIES'] = (
    'SITE AMBIGUITIES: Samples - ' + sd_site_text
    + '; Fish details - ' + fd_site_text
    + '; Length frequencies - ' + lf_site_text
)

In [60]:
# no need to note period - these all match
# rows where both the LF period and the FD PERIOD are present but differ
lf_period = dubious_matches_SD_LF_FD['period']
fd_period = dubious_matches_SD_LF_FD['PERIOD']
periods_differ = lf_period != fd_period
both_periods_present = fd_period.notnull() & lf_period.notnull()
dubious_matches_SD_LF_FD[periods_differ & both_periods_present]

Unnamed: 0,DATETIME,id,SITE_NO,site,SITE,period,PERIOD,FLAG_SITE_SD,FLAG_SITE_FD,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_NO_MATCHING_SAMPLE_FD,FLAG_SITE_LF,FLAG_NO_MATCHING_SAMPLE_LF,SITE_AMBIGUITIES


In [61]:
# display the merged SD / LF / FD table, including the SITE_AMBIGUITIES note column, for review
dubious_matches_SD_LF_FD

Unnamed: 0,DATETIME,id,SITE_NO,site,SITE,period,PERIOD,FLAG_SITE_SD,FLAG_SITE_FD,FLAG_MULTIPLE_SAMPLE_POSSIBILITIES,FLAG_NO_MATCHING_SAMPLE_FD,FLAG_SITE_LF,FLAG_NO_MATCHING_SAMPLE_LF,SITE_AMBIGUITIES
0,1993-05-20,1993052060,60,60,"60, 52",AM,AM,,True,,,,,"SITE AMBIGUITIES: Samples - 60; Fish details - 60, 52; Length frequencies - 60"
1,1993-05-23,1993052301,1,18,"1, 8",PM,PM,,True,,,True,,"SITE AMBIGUITIES: Samples - 1; Fish details - 1, 8; Length frequencies - 1,8"
2,1993-05-24,1993052437,37,,373835,,AM,,True,,,,,"SITE AMBIGUITIES: Samples - 37; Fish details - 37,38,35; Length frequencies - nan"
3,1993-05-26,1993052660,60,60,6052,PM,PM,,True,,,,,"SITE AMBIGUITIES: Samples - 60; Fish details - 60,52; Length frequencies - 60"
4,1993-05-27,1993052701,1,18,18,PM,PM,,True,,,True,,"SITE AMBIGUITIES: Samples - 1; Fish details - 1,8; Length frequencies - 1,8"
5,1993-06-03,1993060305,5,58,58,AM,AM,,True,,,True,,"SITE AMBIGUITIES: Samples - 5; Fish details - 5,8; Length frequencies - 5,8"
6,1993-06-03,1993060352,52,5260,,PM,,,,,,True,,"SITE AMBIGUITIES: Samples - 52; Fish details - nan; Length frequencies - 52,60"
7,1994-05-10,1994051012,12,1217,1217,PM,PM,,True,,,True,,"SITE AMBIGUITIES: Samples - 12; Fish details - 12,17; Length frequencies - 12,17"
8,1994-05-14,1994051411,11,,1112,,AM,,True,,,,,"SITE AMBIGUITIES: Samples - 11; Fish details - 11,12; Length frequencies - nan"
9,1994-05-16,1994051605,5,,51117,,AM,,True,,,,,"SITE AMBIGUITIES: Samples - 5; Fish details - 5,11,17; Length frequencies - nan"


In [62]:
# keep only the id and its note
# NOTE: this rebinds the name to a two-column frame, so the cells above that read
# SITE_NO / site / SITE cannot be re-run after this one without rebuilding the merge
dubious_matches_SD_LF_FD = dubious_matches_SD_LF_FD.loc[:, ['id', 'SITE_AMBIGUITIES']]

##### now check ambiguous matches with LF and FD

In [63]:
# see if the inconsistencies are dubious or obvious

# fish details: first non-null value of each column per dubious id
fd_first_per_id = (
    df_FD.loc[
        df_FD['id'].isin(dubious_sites_id),
        ['id', 'SITE', 'PERIOD', 'FLAG_SITE', 'FLAG_MULTIPLE_SAMPLE_POSSIBILITIES', 'FLAG_NO_MATCHING_SAMPLE'],
    ]
    .rename(columns={'FLAG_SITE': 'FLAG_SITE_FD', 'FLAG_NO_MATCHING_SAMPLE': 'FLAG_NO_MATCHING_SAMPLE_FD'})
    .groupby('id')
    .first()
    .reset_index()
)

# length frequencies: first non-null value of each column per dubious id
lf_first_per_id = (
    df_LF.loc[
        df_LF['id'].isin(dubious_sites_id),
        ['id', 'site', 'period', 'FLAG_SITE', 'FLAG_NO_MATCHING_SAMPLE'],
    ]
    .rename(columns={'FLAG_SITE': 'FLAG_SITE_LF', 'FLAG_NO_MATCHING_SAMPLE': 'FLAG_NO_MATCHING_SAMPLE_LF'})
    .groupby('id')
    .first()
    .reset_index()
)

# inner join: only ids present in both fish details and length frequencies
dubious_matches_LF_FD = (
    fd_first_per_id
    .merge(lf_first_per_id, on='id', how='inner')
    .sort_values('id')
    .loc[:, ['id', 'site', 'SITE', 'period', 'PERIOD']]
)

dubious_matches_LF_FD

Unnamed: 0,id,site,SITE,period,PERIOD
0,1993052060,60,"60, 52",AM,AM
1,1993052301,18,"1, 8",PM,PM
2,1993052660,60,6052,PM,PM
3,1993052701,18,18,PM,PM
4,1993060305,58,58,AM,AM
5,1994051012,1217,1217,PM,PM
6,1994051115,1517,1517,PM,PM
7,1994051737,3760,3760,PM,PM
8,1994052538,3852,3852,PM,PM
9,1994052617,175,17,AM,AM


In [64]:
# hand-entered ids to annotate
# NOTE(review): presumably picked from the table above where the LF and FD site labels disagree - confirm
dubious_matches_LF_FD_id = [1993052060, 1993052660, 1994052617, 1995052902]

# now remove duplicates from SD
# (ids that already carry a note from the SD / LF / FD merge are skipped)
ids_already_noted = list(dubious_matches_SD_LF_FD.id)
lf_fd_only_ids = [x for x in dubious_matches_LF_FD_id if x not in ids_already_noted]
dubious_matches_LF_FD = dubious_matches_LF_FD[dubious_matches_LF_FD.id.isin(lf_fd_only_ids)].copy()

In [65]:
# note text for the LF / FD-only matches, same wording as the sample notes minus the "Samples" part
fd_only_site_text = dubious_matches_LF_FD['SITE'].astype(str)
lf_only_site_text = dubious_matches_LF_FD['site'].astype(str)

dubious_matches_LF_FD['SITE_AMBIGUITIES'] = (
    'SITE AMBIGUITIES: Fish details - ' + fd_only_site_text
    + '; Length frequencies - ' + lf_only_site_text
)

In [66]:
# keep only the id and its note so the frame lines up with dubious_matches_SD_LF_FD for concatenation
dubious_matches_LF_FD = dubious_matches_LF_FD.loc[:, ['id', 'SITE_AMBIGUITIES']]

# SAVE NOTES TO ADD BEFORE IMPORT

In [67]:
# stack both sets of notes, order by id, and renumber the index from 0
ambiguity_note_frames = [dubious_matches_SD_LF_FD, dubious_matches_LF_FD]
df_ambiguous_match_notes = (
    pd.concat(ambiguity_note_frames)
    .sort_values('id')
    .reset_index(drop=True)
)

In [68]:
# toggle: set to False to skip overwriting the pickle on re-run
save_ambiguous_notes_pickle = True
if save_ambiguous_notes_pickle:
    df_ambiguous_match_notes.to_pickle('df_ambiguous_match_notes.pickle')

In [69]:
# none of these will end up being disambiguated by [the original comment is cut off here]
# NOTE(review): presumably this refers to the offset applied to disambiguated ids - confirm and finish the sentence
df_ambiguous_match_notes['id'].max()

2003051625

In [70]:
# every id that will receive a SITE AMBIGUITIES note
df_ambiguous_match_notes['id']

0     1993052060
1     1993052301
2     1993052437
3     1993052660
4     1993052701
5     1993060305
6     1993060352
7     1994051012
8     1994051411
9     1994051605
10    1994051617
11    1994051648
12    1994051737
13    1994052352
14    1994052360
15    1994052617
16    1994052726
17    1994052741
18    1994052801
19    1994053001
20    1994053101
21    1994053160
22    1995052190
23    1995052205
24    1995052290
25    1995052390
26    1995052490
27    1995052590
28    1995052690
29    1995052790
30    1995052890
31    1995052902
32    1995052990
33    1995053090
34    1995053190
35    1995060190
36    1995060290
37    1995060390
38    1995060490
39    1995060590
40    1995060690
41    1995060790
42    1995060990
43    1995061090
44    1995061149
45    1995061190
46    1995061290
47    1995061390
48    1996052212
49    1997052205
50    2001060615
51    2002050915
52    2002051715
53    2002052905
54    2003051625
Name: id, dtype: Int64