In [435]:
from __future__ import print_function
from __future__ import division
from collections import Counter
import datetime as dt
import hdbscan
import logging
import matplotlib.pyplot as plt
import matplotlib.animation as mpl_animation
import numpy as np
import pandas as pd
from skimage import color
from IPython.display import HTML
from fleet_clustering import bq
from fleet_clustering import filters
from fleet_clustering import distances
from fleet_clustering import animation

## Fleet Coherence Time

One thing this current implementation doesn't take into account is 
the coherence time of a fleet. A vessel might be part of one fleet 
this season, but move to another fleet the next season. A way to
deal with this is to group fleets over shorter time periods (6 months
for instance) and then match fleets across groupings by seeing what
previous fleets have the largest overlap with the current set of
fleets.

In [169]:
ais_by_date = bq.load_ais_by_date('drifting_longlines', dt.date(2017, 1, 1), dt.date(2017, 12, 31),
                                 fishing_only=False, min_km_from_shore=10)

2017-01-01

    SELECT ssvid,
           year,
           month,
           day,
           lon,
           lat,
           iscarrier
    FROM (
        SELECT a.ssvid, 
               EXTRACT(YEAR FROM timestamp) year,
               EXTRACT(MONTH FROM timestamp) month,
               EXTRACT(DAY FROM timestamp) day,
               a.lon AS lon,
               a.lat AS lat,
               ROW_NUMBER() OVER(PARTITION BY a.ssvid,  TIMESTAMP_TRUNC(a.timestamp, DAY)
                                 ORDER BY ABS(TIMESTAMP_DIFF(a.timestamp , 
                                              TIMESTAMP_TRUNC(a.timestamp, DAY), SECOND) - 12 * 60 * 60 ) ASC) AS rk,
               c.iscarriervessel AND c.confidence = 3 AS iscarrier
        FROM 
        `world-fishing-827.pipe_production_b.messages_scored_*` a
            JOIN
        `world-fishing-827.gfw_research.vessel_info_allyears_20181002` b
            ON a.ssvid = CAST(b.mmsi AS STRING)
            JOIN 
        `world-fishing-827.vessel_d

In [172]:
pruned_by_date = {k : filters.remove_chinese_coast(v) for (k, v) in ais_by_date.items()}

In [173]:
valid_ssvid = sorted(filters.find_valid_ssvid(pruned_by_date))
C = distances.create_composite_lonlat_array(pruned_by_date, valid_ssvid)

In [174]:
dists = distances.compute_distances(C, clip=1000)
clusterer = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=12)
clusterer.fit(dists)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(cachedir=None),
    metric='precomputed', min_cluster_size=12, min_samples=None, p=None,
    prediction_data=False)

In [175]:
imp.reload(distances)
dists = distances.compute_distances_2(C, days=90, clip=1000)

In [176]:
imp.reload(distances)
dists_3 = distances.compute_distances_3(C, days=180, clip=1000)

In [191]:
clusterer = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=12,
                            min_samples=12)
clusterer.fit(dists)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(cachedir=None),
    metric='precomputed', min_cluster_size=12, min_samples=12, p=None,
    prediction_data=False)

In [192]:
imp.reload(animation)
anim = animation.make_anim(valid_ssvid, clusterer.labels_, pruned_by_date, interval=5)
HTML(anim.to_html5_video())

In [179]:
anim = animation.make_anim(valid_ssvid, clusterer.labels_, pruned_by_date, interval=1)
Writer = mpl_animation.writers['ffmpeg']
writer = Writer(fps=10, metadata=dict(artist='Me'), bitrate=1800)
anim.save('fleet_longliners.mp4', writer=writer)

In [134]:
pruned_by_date['20170101'].iscarrier.values

ZeroDivisionError: division by zero

In [None]:
distsbydate = distances.compute_distances_by_date(C)
distsbydate.shape

In [None]:
import numpy as np
composite = []
dates = sorted(pruned_by_date)
n = 6
m = 90
sz = len(distsbydate)
for i in range(n):
    print(i)
    j0 = i * m
    j1 = (i + 2) * m
    d2 = distsbydate[:, :, j0:j1] ** 2
    d = np.sqrt(np.nanmean(d2s, axis=2))
    expanded = np.zeros([sz, n * sz])
    expanded[:, sz * i: sz * (i + 1)] = d
    composite.append(expanded)
composite = np.concatenate(composite, axis=0)

In [None]:
clusterercomp = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=8,
                            min_samples=2)
clusterercomp.fit(composite)

In [None]:
anim = animation.make_anim(valid_ssvid, clusterer.labels_, pruned_by_date, interval=5)
HTML(anim.to_html5_video())

In [193]:
import imp
imp.reload(animation)
imp.reload(distances)
dir(distances)

['__builtins__',
 '__doc__',
 '__file__',
 '__name__',
 '__package__',
 'compute_distances',
 'compute_distances_',
 'compute_distances_2',
 'compute_distances_3',
 'compute_distances_by_date',
 'create_composite_lonlat_array',
 'create_lonlat_array',
 'infclip',
 'np']

In [194]:
import imp
imp.reload(bq)
carriers_by_date = bq.load_carriers_by_year(2017, 2018)

In [195]:
pruned_carriers_by_date = {k : filters.remove_chinese_coast(v) for (k, v) in carriers_by_date.items()}

In [198]:
query = """
               SELECT CAST(mmsi AS STRING) FROM
               `world-fishing-827.vessel_database.all_vessels_20190102`
               WHERE  iscarriervessel AND confidence = 3
        """
valid_carrier_ssvid_df = pd.read_gbq(query, dialect='standard', project_id='world-fishing-827')
valid_carrier_ssvid = valid_carrier_ssvid_df.f0_
valid_carrier_ssvid_set = set(valid_carrier_ssvid)

In [200]:
imp.reload(bq)
all_ais_by_date = bq.load_ais_by_date('drifting_longlines', dt.date(2017, 1, 1), dt.date(2017, 12, 31),
                                 fishing_only=False, min_km_from_shore=-1, include_carriers=True)

2017-01-01

    SELECT ssvid,
           year,
           month,
           day,
           lon,
           lat,
           iscarrier
    FROM (
        SELECT a.ssvid, 
               EXTRACT(YEAR FROM timestamp) year,
               EXTRACT(MONTH FROM timestamp) month,
               EXTRACT(DAY FROM timestamp) day,
               a.lon AS lon,
               a.lat AS lat,
               ROW_NUMBER() OVER(PARTITION BY a.ssvid,  TIMESTAMP_TRUNC(a.timestamp, DAY)
                                 ORDER BY ABS(TIMESTAMP_DIFF(a.timestamp , 
                                              TIMESTAMP_TRUNC(a.timestamp, DAY), SECOND) - 12 * 60 * 60 ) ASC) AS rk,
               c.iscarriervessel AND c.confidence = 3 AS iscarrier
        FROM 
        `world-fishing-827.pipe_production_b.messages_scored_*` a
            JOIN
        `world-fishing-827.gfw_research.vessel_info_allyears_20181002` b
            ON a.ssvid = CAST(b.mmsi AS STRING)
            JOIN 
        `world-fishing-827.vessel_d

In [201]:
all_pruned_by_date = {k : filters.remove_chinese_coast(v) for (k, v) in all_ais_by_date.items()}

In [202]:
# joint_ssvid = sorted(set(valid_ssvid) | set(valid_carrier_ssvid))
# joint_ssvid = sorted(set(valid_ssvid) | set(['412440493']))
# Cc = distances.create_composite_lonlat_array(joint_by_date, valid_ssvid)

In [203]:
# dists_c = distances.compute_distances(Cc, clip=1000)

In [217]:
clusterer = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=8,
                            min_samples=8)
clusterer_c.fit(dists)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(cachedir=None),
    metric='precomputed', min_cluster_size=20, min_samples=None, p=None,
    prediction_data=False)

In [205]:
imp.reload(bq)
encounters = bq.load_carriers(2017, 2017)

In [214]:
all_fleet_ssvid_set = set([s for (s, f) in zip(valid_ssvid, clusterer_c.labels_) if f >= 0])
valid_ssvid_set = set(valid_ssvid)
all_longline_reefer_ssvid_set = set()
for x in encounters.itertuples():
    if x.ssvid_1 in all_fleet_ssvid and x.ssvid_2 in valid_carrier_ssvid_set:
        all_longline_reefer_ssvid_set.add(x.ssvid_2)
    if x.ssvid_2 in all_fleet_ssvid and x.ssvid_1 in valid_carrier_ssvid_set:
        all_longline_reefer_ssvid_set.add(x.ssvid_1)
all_longline_reefer_ssvid = sorted(all_longline_reefer_ssvid_set)

In [218]:
imp.reload(animation)
valid_ssvid_set = set(valid_ssvid)
carrier_ids = [x for x in all_longline_reefer_ssvid if x not in valid_ssvid_set]
joint_ssvid = valid_ssvid + sorted(carrier_ids) 
labels = list(clusterer_c.labels_) + [max(clusterer_c.labels_) + 1] * len(carrier_ids) 
anim = animation.make_anim(joint_ssvid, labels, all_pruned_by_date, interval=5)
HTML(anim.to_html5_video())
# anim = animation.make_anim(joint_ssvid, labels, all_pruned_by_date, interval=1)
# Writer = mpl_animation.writers['ffmpeg']
# writer = Writer(fps=10, metadata=dict(artist='Me'), bitrate=1800)
# anim.save('pacific_longliners_w_carriers.mp4', writer=writer)

In [None]:
imp.reload(animation)
valid_ssvid_set = set(valid_ssvid)
carrier_ids = [x for x in all_longline_reefer_ssvid if x not in valid_ssvid_set]
joint_ssvid = valid_ssvid + sorted(carrier_ids) 
labels = list(clusterer_c.labels_) + [max(clusterer_c.labels_) + 1] * len(carrier_ids) 
anim = animation.make_anim(joint_ssvid, labels, all_pruned_by_date, interval=5)
HTML(anim.to_html5_video())
# anim = animation.make_anim(joint_ssvid, labels, all_pruned_by_date, interval=1)
# Writer = mpl_animation.writers['ffmpeg']
# writer = Writer(fps=10, metadata=dict(artist='Me'), bitrate=1800)
# anim.save('pacific_longliners_w_carriers.mp4', writer=writer)

In [400]:
imp.reload(distances)
# dists_2 = distances.compute_distances(C, clip=500, soft_clip=True)
dists_3 = distances.compute_distances_3(C, days=180, min_clip=1)

In [428]:
# clusterer = hdbscan.HDBSCAN(metric='precomputed', 
#                             min_cluster_size=20)
clusterer = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=11,
                           )
clusterer.fit(dists_3)
imp.reload(animation)
valid_ssvid_set = set(valid_ssvid)
carrier_ids = [x for x in all_longline_reefer_ssvid if x not in valid_ssvid_set]
joint_ssvid = valid_ssvid + sorted(carrier_ids) 
labels = list(clusterer.labels_) + [max(clusterer.labels_) + 1] * len(carrier_ids) 
anim = animation.make_anim(joint_ssvid, labels, all_pruned_by_date, interval=1, max_fleets=32)
# HTML(anim.to_html5_video())
Writer = mpl_animation.writers['ffmpeg']
writer = Writer(fps=10, metadata=dict(artist='Me'), bitrate=1800)
anim.save('longliner_fleets_w_carriers.mp4', writer=writer)

In [540]:
counts = []
skip = [1, 6, 7, 9, 10, 23, 25]
for i in range(max(labels) + 1):
    if i in skip:
        counts.append(0)
    else:
        counts.append((np.array(labels) == i).sum())
        
fleet_ids = [x for x in np.argsort(counts)[::-1] if counts[x] > 0]

print(len(fleet_ids))
fleets = {}
n_hues = (len(fleet_ids) + 3) // 4
for i, fid in enumerate(fleet_ids):
    sat = [0.5, 1][i % 2]
    val = 1
    hue = np.linspace(0, 1, n_hues)[(i // 4) % n_hues]
    [[clr]] = color.hsv2rgb([[(hue, sat, val)]])
    fg = [(0, 0, 0), clr][(i // 2) % 2]
    sz = [5, 4][(i // 2) % 2]
    fleets[fid] = ('o', tuple(fg), tuple(clr), sz, 1, str(i + 1))
fleets[max(labels)] = ('1', 'k', 'k', 8, 2, 'Carrier Vessel')
fleets

23


{0: ('o', (1.0, 0.5, 0.5), (1.0, 0.5, 0.5), 4, 1, '23'),
 2: ('o', (1.0, 0.0, 0.0), (1.0, 0.0, 0.0), 4, 1, '4'),
 3: ('o', (1.0, 0.5, 0.5), (1.0, 0.5, 0.5), 4, 1, '3'),
 4: ('o', (0, 0, 0), (1.0, 0.0, 0.0), 5, 1, '22'),
 5: ('o',
  (0.8999999999999999, 1.0, 0.5),
  (0.8999999999999999, 1.0, 0.5),
  4,
  1,
  '7'),
 8: ('o', (0, 0, 0), (1.0, 0.5, 0.5), 5, 1, '21'),
 11: ('o', (0, 0, 0), (0.8999999999999999, 1.0, 0.5), 5, 1, '5'),
 12: ('o',
  (0.0, 1.0, 0.40000000000000036),
  (0.0, 1.0, 0.40000000000000036),
  4,
  1,
  '12'),
 13: ('o',
  (0.8000000000000007, 0.0, 1.0),
  (0.8000000000000007, 0.0, 1.0),
  4,
  1,
  '20'),
 14: ('o', (0, 0, 0), (0.5, 0.6999999999999997, 1.0), 5, 1, '13'),
 15: ('o',
  (0.9000000000000004, 0.5, 1.0),
  (0.9000000000000004, 0.5, 1.0),
  4,
  1,
  '19'),
 16: ('o',
  (0.5, 1.0, 0.7000000000000002),
  (0.5, 1.0, 0.7000000000000002),
  4,
  1,
  '11'),
 17: ('o',
  (0.5, 0.6999999999999997, 1.0),
  (0.5, 0.6999999999999997, 1.0),
  4,
  1,
  '15'),
 18: ('o

In [542]:
imp.reload(animation)
anim = animation.make_anim(joint_ssvid, 
                           labels, 
                           all_pruned_by_date, 
#                           interval=10,
                            interval=1,
#                             fleets=fleets, 
                            show_ungrouped=True,
                            alpha=1,
                            legend_cols=8,
                            fleets=fleets,
                            ungrouped_legend="Ungrouped")
# HTML(anim.to_html5_video())
Writer = mpl_animation.writers['ffmpeg']
writer = Writer(fps=10, metadata=dict(artist='Me'), bitrate=1800)
anim.save('longliner_fleets_w_carriers_and_ungrouped.mp4', writer=writer)

In [530]:
query = """
SELECT code, iso3 FROM `world-fishing-827.gfw_research.country_codes`"""
country_codes_df = pd.read_gbq(query, dialect='standard', project_id='world-fishing-827')
iso3_map = {x.code : x.iso3 for x in country_codes_df.itertuples()}

In [487]:
for fid, v in fleets.items():
    _, _, _, label = v
    mask = (fid == np.array(labels))
    ssvids = np.array(joint_ssvid)[mask]
    mids = [x[:3] for x in ssvids]
    countries = [iso3_map.get(float(x), x) for x in mids]
    c = Counter(countries)
    print('Fleet'.format(fid), label, ':')
    for country, count in c.most_common():
        print('\t', country, ':', count)

Fleet 23 :
	 TWN : 7
	 ESP : 4
Fleet 4 :
	 JPN : 23
	 CHN : 22
	 TWN : 15
	 ESP : 4
	 302 : 1
	 CIV : 1
	 KOR : 1
	 CPV : 1
Fleet 3 :
	 USA : 48
	 CAN : 46
Fleet 22 :
	 ESP : 11
Fleet 7 :
	 ESP : 38
	 FRA : 2
	 ITA : 2
	 MLT : 2
	 MHL : 1
	 PRT : 1
Fleet 21 :
	 NZL : 13
	 ESP : 1
Fleet 5 :
	 USA : 64
Fleet 12 :
	 TWN : 20
	 FJI : 1
Fleet 20 :
	 CHN : 9
	 FSM : 4
	 TWN : 2
Fleet 13 :
	 JPN : 19
	 CHN : 1
Fleet 19 :
	 CHN : 15
Fleet 11 :
	 USA : 11
	 VUT : 8
	 TWN : 4
	 KIR : 1
	 COK : 1
Fleet 15 :
	 NCL : 16
Fleet 17 :
	 TWN : 10
	 CHN : 5
	 VUT : 1
Fleet 9 :
	 KOR : 38
	 TWN : 1
Fleet 14 :
	 ZAF : 20
Fleet 2 :
	 CHN : 67
	 FJI : 25
	 KOR : 14
	 452 : 2
	 600 : 1
	 AUS : 1
Fleet 1 :
	 CHN : 130
	 DEU : 5
	 415 : 4
	 KIR : 1
	 556 : 1
	 700 : 1
	 421 : 1
	 KOR : 1
	 PNG : 1
Fleet 8 :
	 CHN : 20
	 TWN : 9
	 MHL : 4
	 FSM : 4
	 JPN : 2
	 200 : 1
Fleet 18 :
	 TWN : 11
	 MYS : 5
Fleet 10 :
	 TWN : 31
	 SYC : 4
	 TZA : 1
	 CHN : 1
Fleet 16 :
	 REU : 13
	 MUS : 2
	 FRA : 1
Fleet Carrier Vessel