## Importing the LonelyBoy Library (github.com/insert-generic-name-here/lonelyboy)

In [4]:
import os, sys
sys.path.append(os.path.join(os.path.expanduser('~'), 'Documents/Insert-Generic-Name-Here/'))
# sys.path

from lonelyboy.geospatial import plots as gsplt
from lonelyboy.geospatial import preprocessing as gspp
from lonelyboy.timeseries import lbtimeseries as tspp
from lonelyboy.geospatial import group_patterns as gsgp

## Importing all other Essential Libraries
#### (DO NOT FORGET TO EXECUTE THE FUNCTIONS IN THE BOTTOM CELLS)

In [30]:
import psycopg2
import numpy as np
import configparser
import pandas as pd
import geopandas as gpd
import contextily as ctx
from random import choice
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.cluster import DBSCAN, KMeans, MeanShift
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from shapely.geometry import Point, LineString, shape
from haversine import haversine
from datetime import datetime, timedelta

In [6]:
from multiprocessing import cpu_count, Pool
from functools import partial
import datetime

## Import Libraries for Visualizations

In [7]:
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

import PyQt5
import matplotlib.pyplot as plt
from matplotlib import style;  style.use('ggplot')
# Select the interactive Qt backend. `InteractiveShell.magic()` is the deprecated
# entry point for line magics; `run_line_magic(name, args)` is the supported call.
get_ipython().run_line_magic('matplotlib', 'qt')

## Importing the Server Credentials, Connecting to the Server and Fetching 48hrs of Trajectory Data

In [8]:
# Load the connection settings from the [SERVER] section of ./sql_server.ini.
# ConfigParser.read() silently skips unreadable files, so fail loudly here instead
# of surfacing a confusing KeyError on the section lookup below.
config_path = os.path.join('.','sql_server.ini')
config = configparser.ConfigParser()
if not config.read(config_path):
    raise FileNotFoundError(f'Could not read the server configuration file: {config_path}')
properties = config['SERVER']

host    = properties['host']
db_name = properties['db_name']
uname   = properties['uname']
pw      = properties['pw']
port    = properties['port']

# The two `ts` bounds are 172800 s apart, i.e. a 48-hour window of AIS records.
traj_sql = 'SELECT * FROM ais_data.dynamic_ships WHERE ts>1456802710 AND ts<1456975510'
ports_sql = 'SELECT * FROM ports.ports_of_brittany'

con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port = port)
try:
    traj = gpd.GeoDataFrame.from_postgis(traj_sql, con, geom_col='geom' )
    ports = gpd.GeoDataFrame.from_postgis(ports_sql, con, geom_col='geom' )
finally:
    # Release the connection even when one of the queries raises.
    con.close()

# Keep only the first member of every port geometry. `.geoms[0]` is used instead of
# `x[0]` because Shapely 2 no longer allows indexing a multi-part geometry directly.
# NOTE(review): assumes every port geometry is multi-part (e.g. MultiPoint) -- confirm.
ports.geom = ports.geom.apply(lambda x: x.geoms[0])

# NOTE: `sizeof_fmt` is defined in the "FUNCTIONS" section at the bottom of the
# notebook; that cell has to be executed before this one.
print(f'Fetched {sizeof_fmt(traj.memory_usage().sum())}')
print(f'Fetched {sizeof_fmt(ports.memory_usage().sum())}')

Fetched 38.1MiB
Fetched 14.0KiB


In [9]:
# Only the last bare expression of a cell is rendered, so the ports preview was
# being silently discarded. Display both previews explicitly.
from IPython.display import display

display(ports.head(2), traj.head(2))

Unnamed: 0,id,mmsi,status,turn,speed,course,heading,lon,lat,ts,geom
0,17515114,227300000,7.0,-126.0,1.1,36.4,309,-4.631512,48.11188,1456802793,POINT (-4.6315117 48.11188)
1,17515086,227300000,7.0,-126.0,2.8,34.2,346,-4.631805,48.11133,1456802713,POINT (-4.631805 48.11133)


## Let's get dangerous (got the reference?)

* ### Step 1: Denoising

In [10]:
#### DROP TIMESTAMP DUPLICATES PER MMSI
# Keep one record per (vessel, timestamp) pair, order the result chronologically
# and rebuild a clean 0..n-1 index.
sample_trajectories = (
    traj
    .drop_duplicates(subset=['mmsi', 'ts'])
    .sort_values('ts')
    .reset_index(drop=True)
)
sample_trajectories.head()

Unnamed: 0,id,mmsi,status,turn,speed,course,heading,lon,lat,ts,geom
0,12293630,227941000,7.0,0.0,0.0,285.0,8,-4.327213,48.100086,1456802711,POINT (-4.3272133 48.100086)
1,12293631,227705102,15.0,-127.0,0.0,261.8,511,-4.496568,48.382435,1456802711,POINT (-4.496568 48.382435)
2,12293634,227574020,15.0,-127.0,0.0,241.7,511,-4.496673,48.382454,1456802713,POINT (-4.496673 48.382454)
3,17515086,227300000,7.0,-126.0,2.8,34.2,346,-4.631805,48.11133,1456802713,POINT (-4.631805 48.11133)
4,17515087,256494000,5.0,0.0,0.0,344.0,217,-4.451149,48.383625,1456802713,POINT (-4.4511485 48.383625)


In [11]:
### DROP OUTLIERS IN SAMPLE_TRAJECTORIES BASED ON TIMESTAMP ---- IDK IF I'LL APPLY THIS OR NOT (SO FAR: NOT APPLIED)
# indices = [item for sublist in [x for x in gdf.groupby(['mmsi'])['ts'].apply(lambda x: get_outliers(x)[0]) if x != []] for item in sublist]
# sample_trajectories.drop(indices, inplace=True)

In [12]:
### CALCULATE VELOCITIES BASED ON THE POINTS
# Create the column up front, then let the library fill it in vessel by vessel.
sample_trajectories['velocity'] = np.nan
sample_trajectories = (
    sample_trajectories
    .groupby(['mmsi'], as_index=False)
    .apply(gspp.calculate_velocity, smoothing=False, window=15, center=False)
    .reset_index(drop=True)
)

### DENOISE SAMPLE_TRAJECTORIES BASED ON A VELOCITY THRESHOLD (POTENTIAL-AREA-OF-ACTIVITY)
# NOTE(review): 102.2 presumably mirrors the highest valid AIS speed-over-ground
# value (in knots) -- confirm against gspp.PotentialAreaOfActivity.
sample_trajectories = gspp.PotentialAreaOfActivity(
    sample_trajectories,
    velocity_threshold = 102.2,
)
sample_trajectories

Unnamed: 0,id,mmsi,status,turn,speed,course,heading,lon,lat,ts,geom,velocity
0,17552952,212228000,0.0,-126.0,5.5,37.0,17,-5.628232,48.721220,1456930088,POINT (-5.6282315 48.72122),5.500000
1,17516776,212518000,0.0,0.0,10.6,155.2,155,-5.398723,48.202198,1456810995,POINT (-5.398723 48.202198),11.876852
2,17517323,212518000,0.0,127.0,10.9,153.6,158,-5.283112,48.040985,1456814595,POINT (-5.2831116 48.040985),0.000000
3,12372258,215477000,0.0,0.0,11.3,119.3,124,-5.098250,48.289906,1456865914,POINT (-5.09825 48.289906),15.360736
4,12372320,215477000,0.0,-126.0,11.0,125.0,122,-5.095583,48.288727,1456865955,POINT (-5.0955834 48.288727),15.424732
5,12372433,215477000,0.0,0.0,11.6,123.1,125,-5.090328,48.286488,1456866035,POINT (-5.090328 48.286488),15.458734
6,12372554,215477000,0.0,0.0,11.4,120.0,126,-5.084340,48.283928,1456866126,POINT (-5.08434 48.283928),15.371931
7,12372583,215477000,0.0,-126.0,11.6,126.2,124,-5.083053,48.283320,1456866146,POINT (-5.083053 48.28332),15.701477
8,17533416,215477000,0.0,-126.0,11.2,123.2,112,-5.077209,48.280720,1456866234,POINT (-5.0772085 48.28072),15.609761
9,17533420,215477000,0.0,-126.0,11.0,115.3,111,-5.076662,48.280533,1456866242,POINT (-5.0766616 48.280533),15.824664


In [13]:
### RESAMPLE TRAJECTORIES
# Resample every vessel's track with the '60S' rule (linear interpolation), then
# order the combined frame chronologically.
sample_trajectories_resampled = (
    sample_trajectories
    .groupby(['mmsi'], as_index=False)
    .apply(
        gspp.resample_geospatial,
        rule = '60S',
        method='linear',
        crs = {'init': 'epsg:4326'},
        drop_lon_lat = True,
    )
    .reset_index(drop=True)
    .sort_values(['datetime'], ascending=True)
)

## Plot the Preprocessed Trajectories (just to be sure)

In [None]:
# Re-project to EPSG:3857 before plotting so the basemap tiles can be drawn underneath.
resampled_projected = sample_trajectories_resampled.to_crs(epsg=3857)
ax = resampled_projected.plot(figsize=(10, 10))
ctx.add_basemap(ax, zoom=11)
plt.show()

## Flock/Convoy Mining Algorithm Testing will take place on 2h of data

In [14]:
# Start from the first timestamp of the (chronologically sorted) resampled frame...
timeWindow = sample_trajectories_resampled.datetime.unique()[0]
# ...and build the 120 one-minute slots that follow it (2H).
# Use range(1, 61) for 1H or range(1, 31) for 30MIN instead.
timeWindow = [timeWindow + np.timedelta64(60*minute, 's') for minute in range(1, 121)]

# Keep only the records that fall on one of those slots.
in_window = sample_trajectories_resampled.datetime.isin(timeWindow)
sample_timeFrame = (
    sample_trajectories_resampled
    .loc[in_window]
    .sort_values('datetime')
    .reset_index(drop=True)
)
sample_timeFrame

Unnamed: 0,id,mmsi,status,turn,speed,course,heading,ts,geom,velocity,datetime
0,1.257696e+07,227006750.0,0.0,0.000000,0.000000,268.500000,268.000000,1.456803e+09,POINT (-4.484462 48.381145),0.307190,2016-03-01 03:26:00
1,1.751510e+07,227300000.0,7.0,-126.000000,1.500000,21.200000,322.000000,1.456803e+09,POINT (-4.6316385 48.11174),1.395731,2016-03-01 03:26:00
2,1.754235e+07,256494000.0,5.0,0.000000,0.000000,344.000000,217.000000,1.456803e+09,POINT (-4.451143 48.383625),0.019396,2016-03-01 03:26:00
3,1.257696e+07,227705102.0,15.0,-127.000000,0.000000,282.600000,511.000000,1.456803e+09,POINT (-4.496595 48.38244),0.056880,2016-03-01 03:26:00
4,1.754237e+07,220417000.0,1.0,0.000000,0.066667,170.400000,242.333333,1.456803e+09,POINT (-4.484997766666667 48.35269333333333),0.072627,2016-03-01 03:26:00
5,1.257697e+07,227574020.0,15.0,-127.000000,0.000000,237.500000,511.000000,1.456803e+09,POINT (-4.496685 48.382427),0.186221,2016-03-01 03:26:00
6,1.257696e+07,228186700.0,15.0,-127.000000,102.300000,360.000000,511.000000,1.456803e+09,POINT (-4.5125766 48.370872),0.059529,2016-03-01 03:26:00
7,1.257696e+07,227008170.0,0.0,0.000000,0.000000,135.000000,143.000000,1.456803e+09,POINT (-4.486115 48.38155),0.000000,2016-03-01 03:26:00
8,1.257696e+07,228394000.0,7.0,-127.000000,0.600000,77.800000,511.000000,1.456803e+09,POINT (-4.6542068 48.12314),1.905561,2016-03-01 03:26:00
9,1.754236e+07,227222000.0,5.0,0.000000,0.000000,167.400000,167.000000,1.456803e+09,POINT (-4.477003 48.38212),0.000000,2016-03-01 03:26:00


In [None]:
# Persist the selected time frame as CSV (relative path, so it lands in the current
# working directory); the DataFrame index is not written.
sample_timeFrame.to_csv('sample_timeFrame.csv', index=False)

## Search for Flocks/Convoys
* #### 1. The Clustering will take place in time slices 
* #### 2. Possible Features: (X_coord, Y_coord, course)

* #### Set up a Color Map (for the Plots)

In [15]:
# Matplotlib colour for each cluster label; -1 is mapped to black and 0 to white.
LABEL_COLOR_MAP = {
    -1: 'black',
    0: 'white',
    1: 'r',
    2: 'g',
    3: 'b',
    4: 'm',
    5: 'y',
    6: 'maroon',
    7: 'pink',
    8: 'sienna',
    9: 'darkslategray',
    10: 'purple',
    11: 'darkgoldenrod',
    12: 'chocolate',
}

* ### Search for Flocks (via KMeans Clustering; Using the Euclidean Distance as a Distance Metric)

In [47]:
# The three pluggable stages handed to the miner: feature scaling, per-slice
# clustering and the verification step for flocks.
normalizing_algorithm = partial(gsgp.MinMax_Scaler, feature_range=(0, 1), copy=True)
# NOTE(review): if these keyword arguments are forwarded to sklearn.cluster.KMeans,
# `n_jobs` and `precompute_distances` are not accepted by recent scikit-learn
# releases -- confirm against gsgp.KMeans_Clustering.
clustering_algorithm = partial(
    gsgp.KMeans_Clustering,
    init='k-means++',
    n_init=10,
    n_jobs=-1,
    precompute_distances=True,
    random_state=0,
    verbose=0,
)
verification_process = gsgp.cmc_flock_verification

flocks = gsgp.group_patterns_mining(
    sample_timeFrame,
    normalizing_algorithm,
    clustering_algorithm,
    verification_process,
    mode='flocks',
    time_threshold=5,
    min_samples=2,
    resampling_rate=60,
)
flocks

Datetime of Interest: 2016-03-01 05:25:00

Unnamed: 0,flocks,start_time,end_time
0,"{636015106.0, 228064900.0, 227574020.0, 412069...",2016-03-01 05:07:00,2016-03-01 05:25:00
1,"{227311000.0, 228190600.0}",2016-03-01 05:07:00,2016-03-01 05:25:00
2,"{226216000.0, 228236600.0}",2016-03-01 05:09:00,2016-03-01 05:25:00
3,"{227941000.0, 227114630.0, 227162950.0}",2016-03-01 05:09:00,2016-03-01 05:25:00
4,"{228849000.0, 228236700.0}",2016-03-01 05:09:00,2016-03-01 05:25:00
5,"{227300000.0, 228394000.0}",2016-03-01 05:09:00,2016-03-01 05:25:00


* #### Plotting the Results of Flock Mining...

In [32]:
# One figure per distinct flock start time ("datetime of interest").
for doi in flocks.start_time.unique():
    # Records of the time slice being drawn.
    slice_at_doi = sample_timeFrame.loc[sample_timeFrame.datetime == doi]
    # Colour table aligned with the slice; rows outside every flock keep a missing value.
    label_color = pd.DataFrame([], index=slice_at_doi.index, columns=['color'])

    # Number the flocks that start at this time and tag their members' rows.
    flocks_at_doi = flocks.loc[flocks.start_time == doi].flocks
    for color_idx, flock in enumerate(flocks_at_doi):
        member_rows = sample_timeFrame.loc[sample_timeFrame.mmsi.isin(flock)].index
        label_color.loc[label_color.index.isin(member_rows), 'color'] = color_idx

    ax = slice_at_doi.to_crs(epsg=3857).plot(figsize=(10, 10), c=label_color.color.values)
    ctx.add_basemap(ax, zoom=11)
    plt.show()

In [81]:
# Draw one figure per vessel, labelled with its MMSI.
# Grouping by the plain column name (not a one-element list) keeps `mmsi` a scalar
# key on every pandas version instead of a 1-tuple.
for mmsi, gdf in sample_timeFrame.groupby('mmsi'):
#     gsplt.map_plot(gdf)
    # `gdf.plot()` opens a new figure and returns its Axes, so the label has to be
    # set on that Axes afterwards; calling `plt.xlabel` first labelled whichever
    # figure happened to be current instead of this vessel's plot.
    ax = gdf.plot()
    ax.set_xlabel(f'mmsi: {mmsi}')
    plt.show()



### __Q:__ sklearn KMeans uses Euclidean Distance; Which is not Good on GeoSpatial Data. What if we could incorporate the Haversine Formula to get a bit more Accurate Clusters?

### __A:__ Re-project the Data to a Mercator Projection, in which the Euclidean Distance can be Applied

* ### Search for Convoys (via DBSCAN Clustering; Using the Haversine Distance as a Distance Metric)

In [49]:
# Same three-stage set-up as before, this time with DBSCAN and the convoy verifier.
normalizing_algorithm = partial(gsgp.MinMax_Scaler, feature_range=(0, 1), copy=True)
# NOTE(review): `haversine` is passed as the distance metric while the features go
# through a (0, 1) MinMax scaler first -- confirm how gsgp.DBSCAN_Clustering
# combines the two and in which space `eps=2.5` is expressed.
clustering_algorithm = partial(
    gsgp.DBSCAN_Clustering,
    eps=2.5,
    min_samples=3,
    metric=haversine,
    metric_params=None,
    algorithm='auto',
    leaf_size=50,
    p=None,
    n_jobs=-1,
)
verification_process = gsgp.cmc_convoy_verification

convoys = gsgp.group_patterns_mining(
    sample_timeFrame,
    normalizing_algorithm,
    clustering_algorithm,
    verification_process,
    mode='convoys',
    time_threshold=5,
    resampling_rate=60,
    min_samples=3,
)
convoys

Datetime of Interest: 2016-03-01 05:25:00

Unnamed: 0,convoys,start_time,end_time
0,"{227016100.0, 227003050.0, 227008170.0, 244630...",2016-03-01 03:26:00,2016-03-01 05:25:00
1,"{249297000.0, 477115900.0, 256494000.0}",2016-03-01 03:26:00,2016-03-01 05:25:00
2,"{227148000.0, 227574020.0, 227705102.0}",2016-03-01 03:26:00,2016-03-01 05:25:00
3,"{234056000.0, 228064900.0, 228186700.0}",2016-03-01 03:28:00,2016-03-01 05:25:00
4,"{227941000.0, 227114630.0, 227162950.0}",2016-03-01 03:43:00,2016-03-01 05:25:00


* #### Plotting the Results of Convoy Mining...

In [53]:
# One figure per distinct convoy start time ("datetime of interest").
for doi in convoys.start_time.unique():
    # Records of the time slice being drawn.
    slice_at_doi = sample_timeFrame.loc[sample_timeFrame.datetime == doi]
    # Colour table aligned with the slice; rows outside every convoy keep a missing value.
    label_color = pd.DataFrame([], index=slice_at_doi.index, columns=['color'])

    # Number the convoys that start at this time and tag their members' rows.
    convoys_at_doi = convoys.loc[convoys.start_time == doi].convoys
    for color_idx, convoy in enumerate(convoys_at_doi):
        member_rows = sample_timeFrame.loc[sample_timeFrame.mmsi.isin(convoy)].index
        label_color.loc[label_color.index.isin(member_rows), 'color'] = color_idx

    ax = slice_at_doi.to_crs(epsg=3857).plot(figsize=(10, 10), c=label_color.color.values)
    ctx.add_basemap(ax, zoom=11)
    plt.show()

# EXECUTE THIS TO CLOSE ALL PYPLOT FIGURES

In [82]:
# Close every open pyplot figure.
plt.close('all')

# FUNCTIONS

In [1]:
def sizeof_fmt(num, suffix='B'):
    """
    Render a quantity with binary (1024-based) unit prefixes, e.g. ``1536 -> '1.5KiB'``.

    Parameters
    ----------
    num : int or float
        The quantity to format (negative values keep their sign).
    suffix : str
        Unit symbol appended after the prefix (default ``'B'``).

    Returns
    -------
    str
        ``num`` scaled to the first prefix for which its magnitude is below 1024,
        with one decimal place; magnitudes beyond 'Zi' are reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    # Divide down by 1024 until the magnitude fits under the current prefix.
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefixes[position], suffix)
        num /= 1024.0
        position += 1
    # Ran out of prefixes: whatever is left is expressed in 'Yi'.
    return "%.1f%s%s" % (num, 'Yi', suffix)

In [2]:
def make_lines(gdf, df_out, i, geometry = 'geometry'):
    """
    Append the segment joining rows ``i`` and ``i + 1`` of ``gdf`` to ``df_out``.

    Parameters
    ----------
    gdf : DataFrame-like
        Frame whose ``geometry`` column holds objects exposing ``.x`` and ``.y``;
        rows are looked up by the index labels ``i`` and ``i + 1``.
    df_out : pandas.DataFrame
        Records compiled so far.
    i : int
        Index label of the segment's first point; also stored as the record's ``id``.
    geometry : str
        Name of the column holding the point geometries.

    Returns
    -------
    pandas.DataFrame
        ``df_out`` concatenated with a one-row frame (columns ``id`` and ``geometry``)
        holding the new ``LineString``.
    """
    # Fetch both end points, first row i and then row i + 1.
    first_pt, second_pt = (gdf.loc[label][geometry] for label in (i, i + 1))
    segment = LineString([(first_pt.x, first_pt.y), (second_pt.x, second_pt.y)])

    # Single-row frame describing the new segment.
    record = pd.DataFrame({'id': i, 'geometry': [segment]}, columns = ['id', 'geometry'])

    # Hand back the accumulated records with the new one appended.
    return pd.concat([df_out, record])

In [3]:
def mean_distance_to_nearest_port(gdf, ports):
    '''
    Return the mean, over every point of ``gdf.geom``, of the distance to its nearest port.

    For each point the distance to every geometry in ``ports.geom`` is computed and the
    smallest one is kept; the function returns the average of those minima. Can be used
    to determine if the ship is sailing or not.

    Parameters
    ----------
    gdf : GeoDataFrame-like
        Frame with a ``geom`` column of point geometries.
    ports : GeoDataFrame-like
        Frame with a ``geom`` column of port geometries supporting ``.distance(point)``.

    Returns
    -------
    float
        Mean nearest-port distance, in whatever unit ``ports.geom.distance`` yields
        (presumably planar CRS units -- confirm for lon/lat data). ``nan`` when
        ``gdf`` has no rows.
    '''
    n_points = len(gdf)
    if n_points == 0:
        # The mean of an empty set is undefined; report NaN rather than dividing by zero.
        return float('nan')

    total = 0
    for point in tqdm(gdf.geom):
        total += ports.geom.distance(point).min()

    return total/n_points