## Importing the LonelyBoy Library (github.com/insert-generic-name-here/lonelyboy)

In [4]:
import os, sys
sys.path.append(os.path.join(os.path.expanduser('~'), 'Documents/Insert-Generic-Name-Here/'))
# sys.path

In [5]:
from lonelyboy.geospatial import plots as gsplt
from lonelyboy.geospatial import preprocessing as gspp
from lonelyboy.timeseries import lbtimeseries as tspp
from lonelyboy.geospatial import group_patterns as gsgp

## Importing all other Essential Libraries
#### (DO NOT FORGET TO EXECUTE THE FUNCTIONS IN THE BOTTOM CELLS)

In [6]:
import psycopg2
import numpy as np
import configparser
import pandas as pd
import geopandas as gpd
import contextily as ctx
from random import choice
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, MeanShift
from sklearn.preprocessing import MinMaxScaler
from shapely.geometry import Point, LineString, shape
from haversine import haversine

In [7]:
from multiprocessing import cpu_count, Pool
from functools import partial
import datetime

## Import Libraries for Visualizations

In [8]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
import PyQt5
import matplotlib.pyplot as plt
from matplotlib import style;  style.use('ggplot')
get_ipython().magic('matplotlib qt')

In [10]:
from tqdm import tqdm, tqdm_notebook

## Importing the Server Credentials 

In [11]:
properties = configparser.ConfigParser()
properties.read(os.path.join('.','sql_server.ini'))
properties = properties['SERVER']

['./sql_server.ini']

## Connectiing to Server and Fetch 48hrs of Trajectory Data

In [12]:
host    = properties['host']
db_name = properties['db_name']
uname   = properties['uname']
pw      = properties['pw']
port    = properties['port']

traj_sql = 'SELECT * FROM ais_data.dynamic_ships WHERE ts>1456802710 AND ts<1456975510  '
ports_sql = 'SELECT * FROM ports.ports_of_brittany'

con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port = port)

traj = gpd.GeoDataFrame.from_postgis(traj_sql, con, geom_col='geom' )

ports = gpd.GeoDataFrame.from_postgis(ports_sql, con, geom_col='geom' )
ports.geom = ports.geom.apply(lambda x: x[0])

print(f'Fetched {sizeof_fmt(traj.memory_usage().sum())}')
print(f'Fetched {sizeof_fmt(ports.memory_usage().sum())}')

con.close()

Fetched 38.1MiB
Fetched 14.0KiB


In [13]:
ports.head(2)
traj.head(2)

Unnamed: 0,gid,gml_id,por_id,libelle_po,insee_comm,por_x,por_y,geom
0,1,port.1,1,Le Vivier-sur-Mer,35361,297025.0,2408370.0,POINT (-1.771798868659233 48.60274269672541)
1,2,port.10,10,Saint-Samson sur Rance,22327,279335.0,2396060.0,POINT (-2.001990119062326 48.48369993456267)


Unnamed: 0,id,mmsi,status,turn,speed,course,heading,lon,lat,ts,geom
0,17515086,227300000,7.0,-126.0,2.8,34.2,346,-4.631805,48.11133,1456802713,POINT (-4.631805 48.11133)
1,17515175,226263000,5.0,0.0,0.1,256.2,133,-4.485797,48.37982,1456803199,POINT (-4.4857965 48.37982)


## (Hopefully) Doing something Useful

* ### Select some mmsi's
* ### Denoise them (per mmsi)
* ### Resample them (per mmsi)

In [14]:
### SELECT SOME MMSI'S
# mmsis = np.array([228186700, 477115900, 227002330, 227270000, 227369960, 227298110,\
#                    228190600, 227408710, 228849000, 227730220, 228762000, 227612860,\
#                    227592820, 227590030, 227654220, 227578460, 220364000, 636092323,\
#                    227322690, 227702670, 228021700, 219118000, 227312180, 273348830,\
#                    275457000, 226084000, 244976000, 224130870, 228203800, 228167900,\
#                    227327000, 228144000, 636016457, 215477000, 226318000, 314207000,\
#                    247087700, 563187000, 477612300, 227588930, 258316000, 228919000])
mmsis = traj.mmsi.unique()

In [15]:
### INSTEAD OF THIS CELL, EXECUTE THE ONE BELOW
# mmsis = np.array([227590030, 305476000, 235005980, 226084000, 227088590])
# sample_trajectories = gspp.pick_random_group(traj, 'mmsi', group_size=50)
# len(sample_trajectories)
# sample_trajectories

In [16]:
### DENOISE THEM
#### DROP TIMESTAMP DUPLICATES PER MMSI
sample_trajectories = traj.loc[traj.mmsi.isin(mmsis)]
sample_trajectories = sample_trajectories.drop_duplicates(subset=['mmsi', 'ts']).sort_values('ts').reset_index(drop=True)
sample_trajectories.head()

Unnamed: 0,id,mmsi,status,turn,speed,course,heading,lon,lat,ts,geom
0,12293630,227941000,7.0,0.0,0.0,285.0,8,-4.327213,48.100086,1456802711,POINT (-4.3272133 48.100086)
1,12293631,227705102,15.0,-127.0,0.0,261.8,511,-4.496568,48.382435,1456802711,POINT (-4.496568 48.382435)
2,17515086,227300000,7.0,-126.0,2.8,34.2,346,-4.631805,48.11133,1456802713,POINT (-4.631805 48.11133)
3,12293632,227016100,0.0,0.0,0.0,264.3,174,-4.481568,48.381393,1456802713,POINT (-4.4815683 48.381393)
4,12293633,227008170,0.0,0.0,0.0,135.0,144,-4.486115,48.381565,1456802713,POINT (-4.486115 48.381565)


In [17]:
### DROP OUTLIERS IN SAMPLE_TRAJECTORIES BASED ON TIMESTAMP ---- WIP; IDK IF I'LL APPLY THIS OR NOT (SO FAR: NOT APPLIED)
for mmsi in tqdm_notebook(sample_trajectories.mmsi.unique()):
    mmsi_ts_outliers = gspp.get_outliers(sample_trajectories.loc[sample_trajectories.mmsi == mmsi].ts, alpha=1.5)
#     sample_trajectories = sample_trajectories.drop(mmsi_ts_outliers)
    print (mmsi_ts_outliers)

HBox(children=(IntProgress(value=0, max=197), HTML(value='')))

Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([117413, 117422, 117432, 117437, 117459, 117465, 117483, 117500,
            119040, 119057, 119075, 119512, 119534, 119967, 119977, 119983,
            120011, 120028, 120043, 120058, 120063, 129542, 174651, 179445,
            179467, 179505, 181825, 181858, 181882, 182175, 182185, 182205,
            182457, 183057, 183064, 183094, 183118, 183352, 183361, 183371,
            183382, 183412, 185721, 187276, 187284, 189800, 189813, 190085,
            190091, 192714, 192726, 192741, 192779, 192797, 193055, 193062,
            193080, 193088, 193344, 193395, 193691, 193697, 193717, 193726,
            193732, 193741, 193746, 193752, 194052, 194059, 194066, 194076,
            194083, 194092, 194103, 194105, 194118, 194422, 194439, 194444,
            194451, 195988, 195999, 196025, 196031, 196040, 196219, 196236,
            196242, 196271, 196277, 196503, 196512, 196557],
           dtype='int64')
Int64Index([], dt

Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([67654, 67832, 68148, 68165], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([73514, 73950, 74756, 75153, 75207, 75550, 75626], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([202100, 202111, 202896, 205949, 209366, 213062, 222190], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([84741, 84771, 86031, 95224, 95231, 95244, 99193, 99225], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Inde

In [18]:
### DROP OUTLIERS IN SAMPLE_TRAJECTORIES BASED ON VELOCITY (POTENTIAL-AREA-OF-ACTIVITY)
sample_trajectories['velocity'] = np.nan

for mmsi in tqdm_notebook(sample_trajectories.mmsi.unique()):
    try:
        sample_trajectories.loc[sample_trajectories.mmsi == mmsi] = gspp.calculate_velocity(sample_trajectories.loc[sample_trajectories.mmsi == mmsi], smoothing=True, window=5, center=True)
#         mmsi_vel_outliers = gspp.get_outliers(sample_trajectories.loc[sample_trajectories.mmsi == mmsi].velocity, alpha=3)
        mmsi_vel_outliers = sample_trajectories.iloc[(sample_trajectories.mmsi == mmsi) & (sample_trajectories.velocity >= 102.2)]
        print(mmsi_vel_outliers)
        sample_trajectories = sample_trajectories.drop(mmsi_vel_outliers)
    except:
        continue

sample_trajectories = sample_trajectories.reset_index(drop=True)
sample_trajectories = sample_trajectories.fillna(0)

HBox(children=(IntProgress(value=0, max=197), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  gdf['current_loc'] = gdf.geom.apply(lambda x: (x.x,x.y))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  gdf['next_loc'] = gdf.geom.shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  gdf['next_loc'] = gdf.next_loc.apply(lambda x : (x.x,x.y))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index




In [19]:
sample_trajectories

Unnamed: 0,id,mmsi,status,turn,speed,course,heading,lon,lat,ts,geom,velocity
0,12293630.0,227941000.0,7.0,0.0,0.0,285.0,8.0,-4.327213,48.100086,1.456803e+09,POINT (-4.3272133 48.100086),0.000000
1,12293631.0,227705102.0,15.0,-127.0,0.0,261.8,511.0,-4.496568,48.382435,1.456803e+09,POINT (-4.496568 48.382435),0.000000
2,17515086.0,227300000.0,7.0,-126.0,2.8,34.2,346.0,-4.631805,48.111330,1.456803e+09,POINT (-4.631805 48.11133),0.000000
3,12293632.0,227016100.0,0.0,0.0,0.0,264.3,174.0,-4.481568,48.381393,1.456803e+09,POINT (-4.4815683 48.381393),0.000000
4,12293633.0,227008170.0,0.0,0.0,0.0,135.0,144.0,-4.486115,48.381565,1.456803e+09,POINT (-4.486115 48.381565),0.000000
5,12293634.0,227574020.0,15.0,-127.0,0.0,241.7,511.0,-4.496673,48.382454,1.456803e+09,POINT (-4.496673 48.382454),0.000000
6,17515087.0,256494000.0,5.0,0.0,0.0,344.0,217.0,-4.451149,48.383625,1.456803e+09,POINT (-4.4511485 48.383625),0.000000
7,12293637.0,227006750.0,0.0,127.0,0.0,266.1,267.0,-4.484478,48.381172,1.456803e+09,POINT (-4.4844785 48.381172),0.000000
8,12293635.0,228186700.0,15.0,-127.0,102.3,360.0,511.0,-4.512498,48.370834,1.456803e+09,POINT (-4.5124984 48.370834),0.000000
9,12293636.0,228394000.0,7.0,-127.0,1.7,77.0,511.0,-4.654577,48.123035,1.456803e+09,POINT (-4.654577 48.123035),0.000000


In [None]:
plot_idx = 0
for mmsi in sample_trajectories.mmsi.unique():
    plt.figure(plot_idx)
    pois = gspp.get_outliers(sample_trajectories.loc[sample_trajectories.mmsi == mmsi].velocity, alpha=2)
    
    print (pois)
    sample_trajectories.loc[sample_trajectories.mmsi == mmsi].velocity.plot(figsize=(20,10), c=(0,0,0))
    for poi in pois:
        plt.axvline(x=poi, c='r')
    
    plt.show()
    plot_idx += 1

In [20]:
mmsi_resampled = []
for mmsi in tqdm_notebook(sample_trajectories.mmsi.unique()):
    tmp = gspp.resample_geospatial(sample_trajectories.loc[sample_trajectories.mmsi == mmsi], rule = '60S', method='linear', crs = {'init': 'epsg:4326'}, drop_lon_lat = True)
    tmp = tmp.drop(['id', 'status', 'turn', 'speed', 'ts'], axis=1)
    mmsi_resampled.append(tmp)
        
sample_trajectories_resampled = pd.concat(mmsi_resampled)
sample_trajectories_resampled = sample_trajectories_resampled.sort_values('datetime').reset_index(drop=True)

HBox(children=(IntProgress(value=0, max=198), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  sample_ves['datetime'] = pd.to_datetime(sample_ves['ts'], unit='s')





In [None]:
plot_idx = 0
for mmsi in sample_trajectories_resampled.mmsi.unique():
    plt.figure(plot_idx)
    sample_trajectories_resampled.loc[sample_trajectories_resampled.mmsi == mmsi].velocity.plot(figsize=(20,10), c=(0,0,0))
    plt.show()
    plot_idx += 1 

In [21]:
sample_trajectories_resampled.head(20)

Unnamed: 0,mmsi,course,heading,geom,velocity,datetime
0,0.0,0.0,0.0,POINT (0 0),0.0,1970-01-01 00:00:00
1,227941000.0,285.0,8.0,POINT (-4.3272133 48.100086),0.0,2016-03-01 03:25:00
2,227300000.0,34.2,346.0,POINT (-4.631805 48.11133),0.0,2016-03-01 03:25:00
3,227016100.0,264.3,174.0,POINT (-4.4815683 48.381393),0.0,2016-03-01 03:25:00
4,227008170.0,135.0,144.0,POINT (-4.486115 48.381565),0.0,2016-03-01 03:25:00
5,227574020.0,241.7,511.0,POINT (-4.496673 48.382454),0.0,2016-03-01 03:25:00
6,256494000.0,344.0,217.0,POINT (-4.4511485 48.383625),0.0,2016-03-01 03:25:00
7,227006750.0,266.1,267.0,POINT (-4.4844785 48.381172),0.0,2016-03-01 03:25:00
8,228186700.0,360.0,511.0,POINT (-4.5124984 48.370834),0.0,2016-03-01 03:25:00
9,228394000.0,77.0,511.0,POINT (-4.654577 48.123035),0.0,2016-03-01 03:25:00


## Plot the Preprocessed Trajectories (just to be sure)

In [21]:
ax = sample_trajectories_resampled.to_crs(epsg=3857).plot(figsize=(10, 10))
ctx.add_basemap(ax, zoom=11)
plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7f787f1eb9b0>

## Search for Flocks
* #### 1. The Clustering will take place in time slices 
* #### 2. Possible Features: (X_coord, Y_coord, course)

* #### Set up a Color Map (for the Plots)

In [22]:
LABEL_COLOR_MAP = {-1 : 'black', 0 : 'white', 1 : 'r', 2 : 'g',\
                   3 : 'b', 4 : 'm', 5 : 'y', 6 : 'maroon', 7 : 'pink',\
                   8 : 'sienna', 9 : 'darkslategray', 10 : 'purple', 
                   11 : 'darkgoldenrod', 12: 'chocolate'}

* #### Getting a Sample Time Frame (to test the clustering algorithms)

In [23]:
# sample_datetime = np.datetime64('2016-03-02T10:04:00.000000000') 
# sample_datetime = np.datetime64('2016-03-01T19:13:00.000000000')
# THE BEST ONE SO FAR
sample_datetime = np.datetime64('2016-03-01T16:32:00.000000000')
## 
# while True:
#     sample_datetime = np.random.choice(sample_trajectories_resampled.datetime)
#     sample_timeFrame = sample_trajectories_resampled.loc[sample_trajectories_resampled.datetime == sample_datetime].drop_duplicates(subset=['mmsi', 'datetime'])
#     sample_timeFrame = sample_timeFrame.sort_values('datetime').reset_index(drop=True)
#     if (mean_distance_to_nearest_port(sample_timeFrame, ports) > 0.11): break
sample_timeFrame = sample_trajectories_resampled.loc[sample_trajectories_resampled.datetime == sample_datetime].drop_duplicates(subset=['mmsi', 'datetime'])
sample_timeFrame

Unnamed: 0,mmsi,course,heading,geom,velocity,datetime
51846,228211900.0,185.687052,511.000000,POINT (-4.778695697065673 48.36124802235678),5.795790,2016-03-01 16:32:00
51847,228064900.0,192.200000,206.666667,POINT (-4.511742766666667 48.364703),0.037871,2016-03-01 16:32:00
51848,477115900.0,182.000000,38.000000,POINT (-4.453365 48.38168866666667),0.019633,2016-03-01 16:32:00
51849,275457000.0,264.822857,239.714286,POINT (-5.522254757142857 48.22897728571429),6.800780,2016-03-01 16:32:00
51850,227612860.0,90.221221,511.000000,POINT (-4.495466119767442 48.38341180959303),2.319420,2016-03-01 16:32:00
51851,227696930.0,341.857460,511.000000,POINT (-4.484825078830645 48.38040994556452),4.604099,2016-03-01 16:32:00
51852,227006750.0,294.900000,254.000000,POINT (-4.4844966 48.381138),0.423957,2016-03-01 16:32:00
51853,227941000.0,174.649110,215.793594,POINT (-4.326646832740214 48.09851185053381),1.943659,2016-03-01 16:32:00
51854,228762000.0,223.400000,223.000000,POINT (-4.5063834 48.165085),11.182687,2016-03-01 16:32:00
51855,227114300.0,9.300000,511.000000,POINT (-4.8058414 48.31592),10.210144,2016-03-01 16:32:00


In [24]:
ax = sample_timeFrame.to_crs(epsg=3857).plot(figsize=(10, 10))
ctx.add_basemap(ax, zoom=11)
plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7f6453764080>

* ## K-Means Clustering 

In [24]:
# gdf_tmp = sample_trajectories_resampled.loc[sample_trajectories_resampled.datetime == sample_datetime].drop_duplicates(subset=['mmsi', 'datetime'])  
flocks = gsgp.flock_mining(sample_timeFrame, doi=None, init='k-means++', n_init=10, n_jobs=-1, precompute_distances=True, random_state=0, verbose=0) 

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/66 [00:00<?, ?it/s][A
 12%|█▏        | 8/66 [00:00<00:00, 72.49it/s][A
 20%|█▉        | 13/66 [00:00<00:00, 60.13it/s][A
 26%|██▌       | 17/66 [00:00<00:01, 48.33it/s][A
 30%|███       | 20/66 [00:00<00:01, 39.67it/s][A
 35%|███▍      | 23/66 [00:00<00:01, 33.25it/s][A
 39%|███▉      | 26/66 [00:00<00:01, 28.71it/s][A
 44%|████▍     | 29/66 [00:00<00:01, 24.31it/s][A
 48%|████▊     | 32/66 [00:01<00:01, 21.71it/s][A
 53%|█████▎    | 35/66 [00:01<00:01, 19.40it/s][A
 56%|█████▌    | 37/66 [00:01<00:01, 17.17it/s][A
 59%|█████▉    | 39/66 [00:01<00:01, 15.61it/s][A
 62%|██████▏   | 41/66 [00:01<00:01, 14.82it/s][A
 65%|██████▌   | 43/66 [00:01<00:01, 13.78it/s][A
 68%|██████▊   | 45/66 [00:02<00:01, 13.37it/s][A
 71%|███████   | 47/66 [00:02<00:01, 12.97it/s][A
 74%|███████▍  | 49/66 [00:02<00:01, 12.53it/s][A
 77%|███████▋  | 51/66 [00:02<00:01, 12.12it/s][A
 80%|████████  | 53/66 [00:02<00:01, 11.80it/s][A
 83

In [25]:
flocks

Unnamed: 0,flocks,start_time,end_time
0.0,"[235108972.0, 227300000.0, 227366000.0, 228394...",2016-03-01 16:32:00,
1.0,"[228064900.0, 477115900.0, 227612860.0, 227696...",2016-03-01 16:32:00,
2.0,"[244925000.0, 311043200.0]",2016-03-01 16:32:00,
3.0,[224130870.0],2016-03-01 16:32:00,
4.0,"[228126000.0, 227318040.0, 228258000.0]",2016-03-01 16:32:00,
5.0,"[275457000.0, 228228800.0]",2016-03-01 16:32:00,
6.0,"[227941000.0, 227162950.0, 227114630.0, 227142...",2016-03-01 16:32:00,
7.0,"[228211900.0, 227114300.0, 226263000.0, 228210...",2016-03-01 16:32:00,
8.0,[304087000.0],2016-03-01 16:32:00,
9.0,"[228853000.0, 226216000.0]",2016-03-01 16:32:00,


* #### Plotting the Results of Flock Mining...

In [27]:
label_color = pd.DataFrame([], index=sample_timeFrame.index, columns=['color'])

cluster_indices = [sample_timeFrame.loc[sample_timeFrame.mmsi.isin(flock)].index for flock in flocks.flocks] 
for color_idx, cluster in enumerate(cluster_indices):
    label_color.loc[label_color.index.isin(cluster), 'color'] = color_idx

ax = sample_timeFrame.to_crs(epsg=3857).plot(figsize=(10, 10), c=label_color.color.values)
ctx.add_basemap(ax, zoom=11)
plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7f645340e400>

### __Q:__ sklearn KMeans uses Euclidean Distance; Which is not Good on GeoSpatial Data. What if we could incorporate the Haversine Formula to get a bit more Accurate Clusters?

### __A:__ Re-project the Data to a Mercator in which Euclidean Distance can be Applied

## Search for Convoys (via DBSCAN Clustering; Using the Haversine Distance as a Distance Metric)

In [217]:
convoys = gsgp.convoy_mining(sample_timeFrames, time_threshold=5, min_samples=3, eps=2.5, metric=haversine, metric_params=None, algorithm='auto', leaf_size=50, p=None, n_jobs=-1) 

100%|██████████| 2/2 [00:00<00:00,  7.70it/s]


In [218]:
convoys

Unnamed: 0,convoys,start_time,end_time
0.0,"[228211900.0, 228336000.0, 226084000.0]",2016-03-01 16:33:00,
1.0,"[228064900.0, 477115900.0, 227612860.0, 227696...",2016-03-01 16:33:00,
2.0,"[227941000.0, 227162950.0, 227114630.0, 227142...",2016-03-01 16:33:00,
3.0,"[227300000.0, 227366000.0, 228394000.0]",2016-03-01 16:33:00,
0.0,"[227114630.0, 227941000.0, 227162950.0, 227142...",2016-03-01 16:33:00,
1.0,"[227222000.0, 227592820.0, 227008170.0, 228155...",2016-03-01 16:33:00,
2.0,"[227003050.0, 226084000.0, 228210800.0, 228336...",2016-03-01 16:33:00,
3.0,"[227300000.0, 228394000.0, 227366000.0]",2016-03-01 16:33:00,


* #### Plotting the Results of Convoy Mining...

In [30]:
label_color = pd.DataFrame([], index=sample_timeFrame.index, columns=['color'])

cluster_indices = [sample_timeFrame.loc[sample_timeFrame.mmsi.isin(flock)].index for flock in convoys.convoys] 
for color_idx, cluster in enumerate(cluster_indices):
    label_color.loc[label_color.index.isin(cluster), 'color'] = color_idx

ax = sample_timeFrame.to_crs(epsg=3857).plot(figsize=(10, 10), c=label_color.color.values)
ctx.add_basemap(ax, zoom=11)
plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7f64533f06a0>

In [173]:
from datetime import datetime, timedelta

In [229]:
def convoy_mining_v2(gdf, time_threshold=5, min_samples=3, eps=2.5, metric=haversine, metric_params=None, algorithm='auto', leaf_size=50, p=None, n_jobs=-1): 
    gdf[['lon', 'lat']] = gdf['geom'].apply(lambda x: pd.Series({'lon':x.x, 'lat':x.y})) 
    gdf = gdf.drop('geom', axis=1)

    convoys = pd.DataFrame([], columns=['convoys', 'start_time', 'end_time'])
    
    for datetime_of_interest in tqdm(gdf['datetime'].unique()):      
        # Get the Useful Features
        timeFrame = gdf[['lon', 'lat']].loc[gdf['datetime'] == datetime_of_interest]
        # Normalize
        scaler = MinMaxScaler()
        X = scaler.fit_transform(timeFrame.values)
        # Cluster
        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, metric_params=metric_params,\
                            algorithm=algorithm, leaf_size=leaf_size, p=p, n_jobs=n_jobs).fit(X)
        cluster_n = clustering.labels_
        # Create the DataFrame (Structure: <INDEX_OF_CLUSTER>, <LIST_OF_TIMEFRAME_INDICES>)
        tmp = pd.DataFrame(np.array([gdf.loc[timeFrame.index]['mmsi'], cluster_n]).T, columns=['convoys', 'cnv_idx'])
        tmp = tmp.loc[tmp.cnv_idx != -1].groupby('cnv_idx')['convoys'].apply(list)
        
        # Append to Convoy History
        convoys = convoys.append(pd.DataFrame({'convoys':tmp, 'start_time':np.array([datetime_of_interest]*len(tmp))}, columns=['convoys', 'start_time', 'end_time'])).reset_index(drop=True)  
        
    return convoys

In [230]:
def convoy_mining_v3(gdf, time_threshold=5, min_samples=3, eps=2.5, rate=60, metric=haversine, metric_params=None, algorithm='auto', leaf_size=50, p=None, n_jobs=-1): 
    gdf[['lon', 'lat']] = gdf['geom'].apply(lambda x: pd.Series({'lon':x.x, 'lat':x.y})) 
    gdf = gdf.drop('geom', axis=1)

    convoys = pd.DataFrame([], columns=['convoys', 'start_time', 'end_time'])
    
    for datetime_of_interest in tqdm(gdf['datetime'].unique()):      
        # Get the Useful Features
        timeFrame = gdf[['lon', 'lat']].loc[gdf['datetime'] == datetime_of_interest]
        # Normalize
        scaler = MinMaxScaler()
        X = scaler.fit_transform(timeFrame.values)
        # Cluster
        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, metric_params=metric_params,\
                            algorithm=algorithm, leaf_size=leaf_size, p=p, n_jobs=n_jobs).fit(X)
        cluster_n = clustering.labels_
        # Create the DataFrame (Structure: <INDEX_OF_CLUSTER>, <LIST_OF_TIMEFRAME_INDICES>)
        tmp = pd.DataFrame(np.array([gdf.loc[timeFrame.index]['mmsi'], cluster_n]).T, columns=['convoys', 'cnv_idx'])
        tmp = tmp.loc[tmp.cnv_idx != -1].groupby('cnv_idx')['convoys'].apply(list).apply(set)
        tmp = pd.DataFrame({'convoys':tmp, 'start_time':np.array([datetime_of_interest]*len(tmp))}, columns=['convoys', 'start_time', 'end_time'])
        
        # Append to Convoy History
        convoys = join_convoys(convoys, tmp)
        
    return convoys

In [33]:
sample_datetimes = [np.datetime64('2016-03-01T16:32:00.000000000'), np.datetime64('2016-03-01T16:33:00.000000000')]

sample_timeFrames = sample_trajectories_resampled.loc[sample_trajectories_resampled.datetime.isin(sample_datetimes)].drop_duplicates(subset=['mmsi', 'datetime'])
sample_timeFrames

Unnamed: 0,mmsi,course,heading,geom,velocity,datetime
51846,228211900.0,185.687052,511.000000,POINT (-4.778695697065673 48.36124802235678),5.795790,2016-03-01 16:32:00
51847,228064900.0,192.200000,206.666667,POINT (-4.511742766666667 48.364703),0.037871,2016-03-01 16:32:00
51848,477115900.0,182.000000,38.000000,POINT (-4.453365 48.38168866666667),0.019633,2016-03-01 16:32:00
51849,275457000.0,264.822857,239.714286,POINT (-5.522254757142857 48.22897728571429),6.800780,2016-03-01 16:32:00
51850,227612860.0,90.221221,511.000000,POINT (-4.495466119767442 48.38341180959303),2.319420,2016-03-01 16:32:00
51851,227696930.0,341.857460,511.000000,POINT (-4.484825078830645 48.38040994556452),4.604099,2016-03-01 16:32:00
51852,227006750.0,294.900000,254.000000,POINT (-4.4844966 48.381138),0.423957,2016-03-01 16:32:00
51853,227941000.0,174.649110,215.793594,POINT (-4.326646832740214 48.09851185053381),1.943659,2016-03-01 16:32:00
51854,228762000.0,223.400000,223.000000,POINT (-4.5063834 48.165085),11.182687,2016-03-01 16:32:00
51855,227114300.0,9.300000,511.000000,POINT (-4.8058414 48.31592),10.210144,2016-03-01 16:32:00


In [246]:
# convoys = convoy_mining_v2(sample_timeFrames, time_threshold=5, min_samples=3, eps=2, metric=haversine, metric_params=None, algorithm='auto', leaf_size=50, p=None, n_jobs=-1) 
convoys = convoy_mining_v3(sample_timeFrames, time_threshold=5, rate=60, min_samples=3, eps=2, metric=haversine, metric_params=None, algorithm='auto', leaf_size=50, p=None, n_jobs=-1) 

100%|██████████| 2/2 [00:00<00:00,  7.00it/s]


In [248]:
convoys

Unnamed: 0,convoys,start_time,end_time
0,"{227941000.0, 227162950.0, 227142200.0, 227114...",2016-03-01 16:32:00,
1,"{228064900.0, 227574020.0, 412069000.0, 227635...",2016-03-01 16:32:00,
2,"{228336000.0, 228211900.0, 226084000.0}",2016-03-01 16:32:00,
3,"{227941000.0, 227142200.0, 227114630.0, 227162...",2016-03-01 16:32:00,2016-03-01 16:33:00
4,"{227300000.0, 228394000.0, 227366000.0}",2016-03-01 16:33:00,NaT


In [236]:
# convoys_t1 = convoys.loc[convoys.start_time == sample_datetimes[0]]
# convoys_t1.convoys = convoys_t1.convoys.apply(set)
# convoys_t2 = convoys.loc[convoys.start_time == sample_datetimes[1]]
# convoys_t2.convoys = convoys_t2.convoys.apply(set)

# convoys_t1
# convoys_t2
# for idxA, convoyA in enumerate(convoys_t1.convoys.apply(set)):
#     for idxB, convoyB in enumerate(convoys_t2.convoys.apply(set)):
#         if (convoyA & convoyB):
#             print (f'from {idxA} to {idxB}')

# convoys_t1.join(convoys_t2, (convoys_t2.convoys.intersections(convoys_t1.convoys)), "inner")

In [245]:
def join_convoys(convoys_prev, convoys_next, rate=60):
    df_joined = pd.DataFrame([], columns=['convoys', 'start_time', 'end_time'])
    df_left, df_right = (convoys_prev, convoys_next) if (len(convoys_prev) > len(convoys_next)) else (convoys_next, convoys_prev)

    try:
        for idx_left, convoy_left in enumerate(df_left.convoys):
            start_time = df_left.iloc[idx_left].start_time
            foundFlag = False
            end_time = df_right.iloc[0].start_time
            for idx_right, convoy_right in enumerate(df_right.convoys):
                end_time = df_right.iloc[idx_right].start_time
                if convoy_left.intersection(convoy_right) != set():
                    df_joined = df_joined.append({'convoys':convoy_left.intersection(convoy_right), 'start_time':start_time if start_time < end_time else end_time}, ignore_index=True)
                    foundFlag = True
                    break
                    
            if not foundFlag:
                df_left_isOlder = start_time < end_time
                if df_left_isOlder:
                    res_join = pd.DataFrame([{'convoys':convoy_left, 'start_time':start_time, 'end_time': end_time},
                                             {'convoys':df_right.iloc[-1].convoys, 'start_time':start_time}], columns=['convoys', 'start_time', 'end_time'])
                else:
                    res_join = pd.DataFrame([{'convoys':df_right.iloc[-1].convoys, 'start_time':end_time, 'end_time': start_time},
                                             {'convoys':convoy_left, 'start_time':start_time}], columns=['convoys', 'start_time', 'end_time'])
                df_joined = df_joined.append(res_join)
    except IndexError: 
        if len(df_left) == 0: 
            return df_joined
        df_left.loc[df_left.end_time == np.nan].end_time = df_left.loc[df_left.end_time == np.nan].start_time + timedelta(seconds=60)
        df_joined = df_joined.append(df_left)

    return df_joined.reset_index(drop=True)

In [249]:
for doi in convoys.start_time.unique():
    label_color = pd.DataFrame([], index=sample_timeFrames.loc[sample_timeFrames.datetime == doi].index, columns=['color'])
    
    cluster_indices = [sample_timeFrames.loc[sample_timeFrames.mmsi.isin(convoy)].index for convoy in convoys.loc[convoys.start_time == doi].convoys] 
    for color_idx, cluster in enumerate(cluster_indices):
        label_color.loc[label_color.index.isin(cluster), 'color'] = color_idx
    
    ax = sample_timeFrames.loc[sample_timeFrames.datetime == doi].to_crs(epsg=3857).plot(figsize=(10, 10), c=label_color.color.values)
    ctx.add_basemap(ax, zoom=11)
    plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7f4a46212198>

<matplotlib.axes._subplots.AxesSubplot at 0x7f4a45f8d0f0>

# FUNCTIONS

In [1]:
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

In [2]:
def make_lines(gdf, df_out, i, geometry = 'geometry'):
    geom0 = gdf.loc[i][geometry]
    geom1 = gdf.loc[i + 1][geometry]
    
    start, end = [(geom0.x, geom0.y), (geom1.x, geom1.y)]
    line = LineString([start, end])
    
    # Create a DataFrame to hold record
    data = {'id': i,
            'geometry': [line]}
    df_line = pd.DataFrame(data, columns = ['id', 'geometry'])
    
    # Add record DataFrame of compiled records
    df_out = pd.concat([df_out, df_line])
    return df_out

In [3]:
def mean_distance_to_nearest_port(gdf, ports):
    '''
    Calculates the minimum distance between the point and the lists of ports. Can be used to determine if the ship is sailing or not
    '''
    counter = 0
    for point in tqdm_notebook(gdf.geom):
        counter += ports.geom.distance(point).min()
    
    return counter/len(gdf)