In [1]:
from __future__ import print_function
import tensorflow as tf
from classification.models import model
from classification import metadata
import numpy as np
from datetime import datetime
import pytz
from classification import run_inference
import json
import pandas
import matplotlib.pyplot as plt
import subprocess
import pandas as pd
%matplotlib inline

In [2]:
# Port-event types listed in rank order. Where PORT_GAP sits is somewhat
# arbitrary; it should not matter as long as it falls between PORT_ENTRY
# and PORT_EXIT.
_PORT_EVENT_TYPES = (
    'PORT_ENTRY',
    'PORT_GAP',
    'PORT_STOP_BEGIN',
    'PORT_STOP_END',
    'PORT_EXIT',
)

# Map every event type to its position in the sequence above.
TYPE_ORDER = dict(zip(_PORT_EVENT_TYPES, range(len(_PORT_EVENT_TYPES))))

TYPE_ORDER['PORT_GAP']

1

In [3]:
# Load the `det_info` and `det_ranges` CSVs (both stamped v20190520).
# Later cells read `id`, `label` and `split` from fishing_df, and `id` and
# `is_fishing` from ranges_df.
fishing_df = pandas.read_csv('classification/data/det_info_v20190520.csv')
ranges_df = pandas.read_csv('classification/data/det_ranges_v20190520.csv')

In [4]:
# IDs that have at least one range with is_fishing > 0.5. Selected in one
# vectorized step instead of iterating over the rows with itertuples().
not_transit_only = set(ranges_df.loc[ranges_df.is_fishing > 0.5, 'id'])

# Boolean numpy array aligned with the rows of fishing_df: True where the
# row's id has no range above the 0.5 threshold.
transit_only_mask = ~fishing_df.id.isin(not_transit_only).values

In [5]:
# Sorted list of ids that occur on more than one row of fishing_df.
# A single value_counts() pass replaces building a full-column mask for every
# unique id, which was quadratic in the number of rows. value_counts() skips
# NaN ids, matching the old `==` comparison, which never matched NaN either.
id_counts = fishing_df.id.value_counts()
has_duplicates = sorted(id_counts.index[id_counts > 1])

In [6]:
# Eyeball every tenth duplicated id: print all of its rows, then a blank line.
for x in has_duplicates[::10]:
    mask = fishing_df.id.eq(x)
    print(fishing_df.loc[mask])
    print()

In [7]:
# Display the column names of fishing_df.
fishing_df.columns

Index([u'id', u'confidence', u'transit_only', u'length', u'tonnage',
       u'engine_power', u'label', u'crew_size', u'split'],
      dtype='object')

In [8]:
from classification.metrics import compute_fishing_metrics as cfm

# cfm.coarse_mapping is iterated as (key, value) pairs; index it by key.
coarse_mapping = {coarse: fine for coarse, fine in cfm.coarse_mapping}
train_mask = (fishing_df.split == 'Training')
for atm in coarse_mapping['seiners']:
    mask = (fishing_df.label == atm)
    in_training = mask & train_mask
    not_in_training = mask & ~train_mask
    # First row: counts for this label (Training split, every other split).
    print(atm, in_training.sum(), not_in_training.sum())
    # Second row: the same counts restricted to ids that are not transit-only.
    print(atm, (in_training & ~transit_only_mask).sum(),
               (not_in_training & ~transit_only_mask).sum())

other_seines 0 0
other_seines 0 0
tuna_purse_seines 19 7
tuna_purse_seines 7 6
other_purse_seines 2 1
other_purse_seines 1 1


In [9]:
# Scratch notes kept as bare string literals; nothing here is assigned or
# executed. The two non-empty strings hold counts in the same layout as the
# seiner table printed by the previous cell.
# NOTE(review): the leading "20" / "16" tags are not explained anywhere in
# this notebook — confirm what they denote.
'''
'''
''' 20
other_seines 0 0
other_seines 0 0
tuna_purse_seines 11 4
tuna_purse_seines 4 4
other_purse_seines 1 0
other_purse_seines 1 0
'''
''' 16
other_seines 0 0
other_seines 0 0
tuna_purse_seines 14 4
tuna_purse_seines 5 4
other_purse_seines 1 1
other_purse_seines 1 1
'''

' 16\nother_seines 0 0\nother_seines 0 0\ntuna_purse_seines 14 4\ntuna_purse_seines 5 4\nother_purse_seines 1 1\nother_purse_seines 1 1\n'

In [10]:
# Load the training-classes CSV.
# NOTE(review): the recorded output of this cell is an IOError saying the file
# does not exist, so cells that use training_df cannot run until the path is
# fixed.
training_df = pandas.read_csv('classification/data/training_classes_vessel_id.csv')

IOError: [Errno 2] File classification/data/training_classes_vessel_id.csv does not exist: 'classification/data/training_classes_vessel_id.csv'

In [None]:
# Select a fixed set of columns, write them to a local CSV, then copy that
# CSV to the dev bucket with gsutil (check_call raises if gsutil fails).
subset_columns = ['mmsi', 'label', 'length', 'tonnage', 'engine_power', 'crew_size', 'split']
local_csv = 'training_classes_vessel_id_subset.csv'
remote_csv = 'gs://machine-learning-dev-ttl-120d/training_classes_vessel_id_v20181025.csv'

df_subset = training_df[subset_columns]
df_subset.to_csv(local_csv, index=False)
subprocess.check_call(['gsutil', 'cp', local_csv, remote_csv])

In [None]:
# Scratch cell: names the builtin without calling it, so it has no effect.
getattr

In [None]:
# Pull up to 100 inference rows whose vessel_id also appears as `mmsi` in the
# label table.
inference_table = 'world-fishing-827.machine_learning_dev_ttl_120d.smoke_test_vessel_inference_v20181024_20170701'
label_table = "world-fishing-827.machine_learning_dev_ttl_120d.training_classes_vessel_id_v20181025"
query_template = """
SELECT a.* FROM 
`{}` a
JOIN
`{}` b
ON a.vessel_id = b.mmsi 
LIMIT 100
"""
query = query_template.format(inference_table, label_table)
inference_df = pd.read_gbq(query, project_id='world-fishing-827', dialect='standard')

In [None]:
# Download the whole label table (no LIMIT) into a DataFrame.
label_df = pd.read_gbq("select * from `{}`".format(label_table), project_id='world-fishing-827', dialect='standard')

In [None]:
# Show the first rows of the joined inference results.
inference_df.head()

In [None]:
# Shell command for classification.metrics.compute_vessel_metrics, kept as an
# unassigned string literal; this cell does not run it.
'''
python -m classification.metrics.compute_vessel_metrics \
     --inference-table  machine_learning_dev_ttl_120d.smoke_test_vessel_inference_v20181024_ \
     --label-table world-fishing-827.machine_learning_dev_ttl_120d.training_classes_vessel_id_v20181025 \
     --dest-path test_new_vessel_inference.html
'''

In [None]:
class MyModel(model.ModelBase):
    """Minimal ``ModelBase`` subclass for exercising the input pipeline.

    Both network builders are stubs; only the windowing properties and the
    input-function factories are implemented.
    """

    def build_inference_net(self):
        """Stub: builds nothing."""
        pass

    def build_training_net(self):
        """Stub: builds nothing."""
        pass

    @property
    def max_window_duration_seconds(self):
        # A fixed-length rather than fixed-duration window.
        return 0

    @property
    def window_max_points(self):
        return 1024

    def make_input_fn(self, base_feature_path, split, num_parallel_reads):
        """Return a zero-argument callable that builds the batched dataset.

        Args:
            base_feature_path: passed to ``build_training_file_list``.
            split: passed to ``build_training_file_list``.
            num_parallel_reads: forwarded to
                ``fishing_feature_generation.input_fn``.
        """
        # Imported here so the method does not depend on a later notebook
        # cell having already imported the module.
        from classification.feature_generation import fishing_feature_generation

        def training_input_fn():
            dataset = fishing_feature_generation.input_fn(
                self.vessel_metadata,
                self.build_training_file_list(base_feature_path, split),
                self.num_feature_dimensions + 1,
                self.max_window_duration_seconds,
                self.window_max_points,
                self.min_viable_timeslice_length,
                select_ranges=self.use_ranges_for_training,
                num_parallel_reads=num_parallel_reads)
            # Prefetch is applied before batching, as in the original chain.
            return dataset.prefetch(self.batch_size).batch(self.batch_size)

        return training_input_fn

    def make_training_input_fn(self, base_feature_path, num_parallel_reads):
        """Return the input function for the training split."""
        # NOTE(review): `utility` is not imported anywhere in the visible
        # notebook, so this call raises NameError as written — confirm the
        # module path and import it.
        return self.make_input_fn(base_feature_path, utility.TRAINING_SPLIT, num_parallel_reads)
    
# Locations of the feature files, the fishing-range CSV and the vessel
# metadata CSV.
# NOTE(review): metadata_file is the same path that an earlier cell's
# recorded output reports as missing — confirm it exists before running.
root_feature_path = "gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features"
fishing_range_file = "classification/data/combined_fishing_ranges_vessel_id.csv"
metadata_file = "classification/data/training_classes_vessel_id.csv"

# Read the fishing ranges and list the ids available under the feature path.
fishing_ranges = metadata.read_fishing_ranges(fishing_range_file)
all_available_mmsis = metadata.find_available_mmsis(root_feature_path)

# Build the metadata object used by every model below.
# NOTE(review): the meaning of the trailing positional `1` is not visible in
# this notebook — check the read_metadata signature.
vessel_metadata = MyModel.read_metadata(
        all_available_mmsis, metadata_file,
        fishing_ranges, 1)

# mdl = MyModel(14, vessel_metadata)
# files = mdl.build_training_file_list(root_feature_path , utility.TRAINING_SPLIT)

In [None]:
# Show the ten smallest keys of metadata_by_mmsi (in sort order).
sorted(vessel_metadata.metadata_by_mmsi)[:10]

In [None]:
# Re-import the project modules and reload them so on-disk edits are picked
# up without restarting the kernel. Each module is reloaded exactly once; the
# earlier version of this cell repeated the vessel_characterization pair.
import imp
import pytz
import classification.models.objectives
imp.reload(classification.models.objectives)
from classification.feature_generation import vessel_feature_generation
imp.reload(vessel_feature_generation)
from classification.feature_generation import fishing_feature_generation
imp.reload(fishing_feature_generation)
import classification.models.vessel_characterization
imp.reload(classification.models.vessel_characterization)
# Bind Model only after the reload so it refers to the fresh class.
from classification.models.vessel_characterization import Model
import classification.models.fishing_detection
import classification.feature_generation.feature_utilities
imp.reload(classification.feature_generation.feature_utilities)
from classification.feature_generation.feature_utilities import EPOCH_DT
# imp.reload(classification.models.prod.fishing_detection)
# from classification.models.prod.fishing_detection import Model

mdl = Model(14, vessel_metadata, 'minimal')

# (start, end) pairs given as integer timestamps.
time_ranges = [
    (1328083200, 1343635200), (1343808000, 1359360000),
    (1359705600, 1375257600), (1375344000, 1390896000),
    (1391241600, 1406793600), (1406880000, 1422432000),
    (1422777600, 1438329600), (1438416000, 1453968000),
    (1454313600, 1469865600), (1470038400, 1485590400),
    (1485936000, 1501488000), (1501574400, 1517126400),
]
# time_ranges = [((datetime(2015, 6, 1, tzinfo=pytz.utc) - EPOCH_DT).total_seconds(), (datetime(2015, 7, 1, tzinfo=pytz.utc) - EPOCH_DT).total_seconds())]
template = "gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/{}.tfrecord"
# list() makes the slice valid where keys() returns a view, not a list.
vessel_ids = list(vessel_metadata.metadata_by_mmsi.keys())[:10]
paths = [template.format(x) for x in vessel_ids]

# paths = """
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/000007c49-9673-3128-434d-6937d3400dd3.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/000008a58-83b9-f7c6-e3af-148f962497f4.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/000020b39-9a62-1a09-60fe-d65a9e39c2cf.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/000027eb4-4bec-76cc-062b-6ce5cdcac685.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/000073b65-58c6-abce-7fb1-a842a589aa96.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/0000747d7-71d7-a018-9e87-d0fbab2adac6.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/00009b2f0-0822-cdfc-ba1e-a4b05787f7b6.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/00009f31c-ca68-b94d-11cc-f47ebd7390b8.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/0000a8ba8-8c4c-255c-59e6-de6c90f79862.tfrecord
# gs://machine-learning-dev-ttl-120d/features/ppb_features_through2017/features/0000b163b-b6ee-1b90-63d2-a2a1c6864e5a.tfrecord
# """.strip().split()

# NOTE(review): range_info is not referenced again in this cell.
range_info = (datetime(2017,1,1), datetime(2017, 7, 1))

# Prediction pipeline: build the dataset, then a one-shot iterator over it.
input_fn_p = mdl.make_prediction_input_fn(paths, time_ranges, 32)
dataset_p = input_fn_p()
iter_p = dataset_p.make_one_shot_iterator()
el_p = iter_p.get_next()

# Training pipeline, built the same way.
input_fn_t = mdl.make_training_input_fn(root_feature_path, 32)
dataset_t = input_fn_t()
iter_t = dataset_t.make_one_shot_iterator()
el_t = iter_t.get_next()

# Evaluate each pipeline once (prediction first, then training) and print
# what came back.
with tf.Session() as sess:
    x_p = sess.run(el_p)
    x_t = sess.run(el_t)
    print(x_p)
    print(x_t)


In [None]:
# Shape of the prediction batch, then its mean and standard deviation reduced
# over the first two axes. The shape is printed explicitly: as a bare
# expression it was not the cell's last statement, so it was never shown.
print(x_p['features'].shape)
print(x_p['features'].mean(axis=(0, 1)))
print(x_p['features'].std(axis=(0, 1)))

In [None]:
# Same statistics for the training batch; its features sit under the first
# element of x_t.
for stat in ('mean', 'std'):
    print(getattr(x_t[0]['features'], stat)(axis=(0, 1)))

In [None]:
# Checkpoint directory of the vessel_char_v20181023A run, and an estimator
# built from it by the model.
chkpt_path = 'gs://world-fishing-827-dev-ttl30d/data-production/classification/timothyhochberg/vessel_char_v20181023A/models/vessel_characterization'
estimator = mdl.make_estimator(chkpt_path)

In [None]:
# NOTE(review): `results` is not assigned by any earlier cell in this
# notebook — presumably it came from a prediction call that has since been
# removed; confirm before re-running.
for result in results:
    vid = vessel_metadata.mmsi_map_int2str[int(result['mmsi'])]
    cls = metadata.VESSEL_CLASS_DETAILED_NAMES[np.argmax(result['Vessel-class'])]
    vessel_info = vessel_metadata.metadata_by_mmsi[vid][0]
    # Columns: id prefix, metadata label, arg-max class, metadata length,
    # exp() of the 'Vessel-length' output.
    print(vid[:5],
          vessel_info['label'], cls,
          vessel_info['length'], np.exp(result['Vessel-length']))

In [None]:
# So running predictions "by hand" works. What about using an Inferer?

inferer = run_inference.Inferer(mdl, chkpt_path, root_feature_path)

# UTC bounds handed to run_inference.
inference_start = datetime(2015,1,1, tzinfo=pytz.UTC)
inference_end = datetime(2017, 12, 31, tzinfo=pytz.UTC)

# Collected one at a time so results gathered before a failure are kept.
all_results = []
for results in inferer.run_inference(vessel_ids, 6, inference_start, inference_end):
    all_results.append(results)

In [None]:
# Print the metadata label and length next to the Inferer's values.
for result in all_results:
    vid = vessel_metadata.mmsi_map_int2str[int(result['mmsi'])]
    cls = result['Multiclass']['max_label']
    vessel_info = vessel_metadata.metadata_by_mmsi[vid][0]
    print(vid[:5],
          vessel_info['label'], cls,
          vessel_info['length'], result['length']['value'])

In [None]:
# Build (and only print) the docker-compose command for the vessel-inference
# job; the two {} slots take the feature path and the checkpoint path.
# --max_num_workers is listed once; the earlier text repeated the flag.
cmd = r'''
docker-compose run vessel_inference  \
                --feature_path {}   \
                --checkpoint_path  {}    \
                --feature_dimensions 14   \
                --results_table=world-fishing-827:machine_learning_dev_ttl_120d.smoke_test_vessel_inference_v20181024_   \
                --start_date 2017-01-01   \
                --end_date 2017-12-31   \
                --project world-fishing-827   \
                --temp_location gs://machine-learning-dev-ttl-30d/scratch/inference   \
                --job_name smoke-test-vessel-inference   \
                --max_num_workers 100   \
                --setup_file ./setup.py   \
                --requirements_file requirements.txt   \
                --runner DataflowRunner  \
                --worker_machine_type=custom-1-13312-ext
'''.format(root_feature_path, chkpt_path)
print(cmd)

In [None]:
# Draw 2000 ids from the 'Test' split (np.random.choice samples with
# replacement by default); seeded so the draw is repeatable.
# list() keeps this working where keys() returns a view rather than a list.
test_vessel_ids = list(vessel_metadata.metadata_by_split['Test'].keys())
np.random.seed(888)
test_vessel_ids = np.random.choice(test_vessel_ids, 2000)

# The draw can repeat ids; de-duplicate them so the IN (...) list carries each
# id once. The set of matching rows is unchanged.
quoted_ids = ['"{}"'.format(x) for x in sorted(set(test_vessel_ids))]

query = """
SELECT * FROM `world-fishing-827.machine_learning_dev_ttl_120d.smoke_test_vessel_inference_v20181024_*` 
WHERE _TABLE_SUFFIX >= "20170101" AND
vessel_id  in ({})
LIMIT 1000
""".format(', '.join(quoted_ids))



results_df = pandas.read_gbq(query, project_id='world-fishing-827', dialect='standard')

results_df.head()

In [None]:
# Print the metadata label and length next to the values stored in the
# results table.
for result in results_df.itertuples():
    vid = result.vessel_id
    cls = result.max_label
    vessel_info = vessel_metadata.metadata_by_mmsi[vid][0]
    print(vid[:5],
          vessel_info['label'], cls,
          vessel_info['length'], result.length)

In [None]:
# Label vocabulary, read from the score entries of the first result row.
keys = [x['label'] for x in results_df.iloc[0].label_scores]
x = []  # vocabulary index of the metadata label
y = []  # vocabulary index of the row's max_label
for result in results_df.itertuples():
    predicted_index = keys.index(result.max_label)
    y.append(predicted_index)
    vid = result.vessel_id
    lbl = vessel_metadata.metadata_by_mmsi[vid][0]['label']
    x.append(keys.index(lbl))

# Translucent markers so overlapping points show up darker.
plt.figure(figsize=(12, 12))
plt.plot(x, y, '.', markersize=20, alpha=0.1)
# keys

In [None]:
# Pair the metadata length (x) with the table's length (y), skipping vessels
# whose metadata length is falsy.
x = []
y = []
for result in results_df.itertuples():
    vid = result.vessel_id
    length = vessel_metadata.metadata_by_mmsi[vid][0]['length']
    if not length:
        continue
    y.append(result.length)
    x.append(length)

plt.figure(figsize=(12, 12))
plt.plot(x, y, '.', markersize=5)
len(x)

In [None]:
# Pull a single element from the model's test input pipeline.
input_fn = mdl.make_test_input_fn(root_feature_path, 32, prefetch=1)
dataset = input_fn()
# Named `iterator` so the builtin `iter` is not shadowed for later cells.
iterator = dataset.make_one_shot_iterator()
next_el = iterator.get_next()
with tf.Session() as sess:
    el = sess.run(next_el)

In [None]:
# Rebinds the notebook-level name `Model` to the fishing_detection class; an
# earlier cell imported `Model` from vessel_characterization.
from classification.models.fishing_detection import Model
real_mdl = Model(14, vessel_metadata, 'minimal')

In [None]:
# Pull a single element from real_mdl's test input pipeline and show the
# shapes of its pieces.
dataset = real_mdl.make_test_input_fn(root_feature_path, 4)()
# Named `iterator` so the builtin `iter` is not shadowed for later cells.
iterator = dataset.make_one_shot_iterator()
next_el = iterator.get_next()
with tf.Session() as sess:
    el = sess.run(next_el)
el[0][0].shape, el[1].shape

In [None]:
# pandas is already imported as `pd` in the first cell; the import here is
# redundant but harmless.
import pandas as pd
# NOTE(review): this is the same path whose read fails with IOError in an
# earlier cell's recorded output.
train_df = pd.read_csv('classification/data/training_classes_vessel_id.csv')
train_df.head()

In [None]:
# Read whitespace-separated ids from mmsis.txt. The context manager closes
# the file handle, which the bare open(...).read() never did.
with open('mmsis.txt') as mmsi_file:
    mmsis = mmsi_file.read().strip().split()

In [None]:
# Count the ids present both in mmsis.txt and in train_df.mmsi.
# NOTE(review): mmsis holds strings (it comes from str.split); if
# train_df.mmsi was parsed as numbers the intersection is empty — confirm the
# dtypes match.
in_both = set(mmsis) & set(train_df.mmsi)
len(in_both)

In [None]:
# First ten ids from mmsis.txt in sort order.
sorted(mmsis)[:10]

In [None]:
# First ten distinct train_df.mmsi values in sort order.
sorted(set(train_df.mmsi))[:10]

In [None]:
# Load the ssvid -> vessel_id mapping table.
mapper = pd.read_csv('train/ssvid_to_vessel_id.csv')

In [None]:
# First ten distinct mapper.vessel_id values in sort order.
sorted(set(mapper.vessel_id))[:10]

In [None]:
def fakerator():
    """Yield 0, 1, 2 and then finish.

    The generator ends with a plain ``return``. Raising ``StopIteration``
    inside a generator body is turned into ``RuntimeError`` under PEP 479,
    and the surrounding ``while True`` never ran a second pass anyway.
    """
    for i in range(3):
        yield i
    return
        
# Iterate the same generator object twice: the first loop prints its values,
# the second prints nothing because the generator is already exhausted.
f = fakerator()

print('a')
for x in f:
    # print() call form: the first cell enables print_function, under which
    # the `print x` statement form is a syntax error.
    print(x)

print('b')
for x in f:
    print(x)

In [None]:
# Scratch cell: trivial arithmetic expression with no side effects.
2 + 2