# FUNCTIONS

In [1]:
def index_of_cluster(item, cluster_list):
    position = [ind for ind, subl in enumerate(cluster_list) if item in subl]
    if len(position)>1:
        raise ValueError
    if not position:
        return [-1]
    else:
        return position


def connected_edges(data):
    G = nx.Graph()
    G.add_edges_from(data)
    return [list(cluster) for cluster in nx.connected_components(G)]


def pairs_in_radius(df, radius):
    distances = np.triu(distance_matrix(df[['lat', 'lon']].values, df[['lat', 'lon']].values), 0)
#     distances = sample_timeFrame[['lon', 'lat']].T.apply(lambda A: sample_timeFrame[['lon', 'lat']].T.apply(lambda B: haversine((A[0], A[1]), (B[0], B[1]))))
    distances = np.triu(distances, 0)
    distances[distances == 0] = np.inf
    return np.vstack(np.where(distances<=radius)).T


def get_flock_labels(timeframe,radius):
    timeframe.reset_index(drop=True , inplace=True)
    data = pairs_in_radius(timeframe, radius)
    clusters = connected_edges(data)
    timeframe['flock_label'] = timeframe.apply( lambda x: index_of_cluster(x.name, clusters)[0], axis=1)
    return timeframe


def flocks(df,radius):
    df['flock_label'] = np.nan
    df =  df.groupby('datetime', as_index=False).apply(get_flock_labels, radius)
    return df

In [2]:
def point_from_lat_lon(df_w_lat_lon):
    df_w_lat_lon['geom'] = np.nan
    df_w_lat_lon['geom'] = df_w_lat_lon[['lon', 'lat']].apply(lambda x: Point(x), axis=1)
    return gpd.GeoDataFrame(df_w_lat_lon, geometry='geom')

In [3]:
def get_correct_label(present, future):
    lst = future.loc[future.mmsi.isin(present.mmsi)].flock_label.value_counts().index
    if lst[0] != -1 or len(lst)==1:
        return lst[0]
    else:
        return lst[1]

In [4]:
def swap(x, pair):
    return pair[pair.index(x)-1] if x in pair else x

In [5]:
def window(iterable, size=2):
    i = iter(iterable)
    win = []
    for e in range(0, size):
        win.append(next(i))
    yield win

    for e in i:
        win = win[1:] + [e]
        yield win

# LIBRARIES

In [6]:
## Importing our Library
import os, sys
sys.path.append(os.path.join(os.path.expanduser('~'), 'Documents', 'Insert-Generic-Name-Here'))
from lonelyboy.geospatial import plots as gsplt
from lonelyboy.geospatial import preprocessing as gspp
from lonelyboy.timeseries import lbtimeseries as tspp
from lonelyboy.geospatial import group_patterns as gsgp
from lonelyboy.geospatial import moving_patterns as gsmp

## Importing all other Essential Libraries
import psycopg2
import numpy as np
import configparser
import pandas as pd
import geopandas as gpd
import contextily as ctx
from random import choice
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.cluster import DBSCAN, KMeans, MeanShift
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from shapely.geometry import Point, LineString, shape
from haversine import haversine
from datetime import datetime, timedelta

from multiprocessing import cpu_count, Pool
from functools import partial
import datetime
import pandas as pd                                                                                                                                                                                            
import numpy as np
import networkx as nx
from scipy.spatial import distance_matrix

import PyQt5
import matplotlib.pyplot as plt
from matplotlib import style;  style.use('ggplot')
get_ipython().magic('matplotlib qt')

# CODE

In [7]:
sample_timeFrame = pd.read_csv('data/4h_timeFrame_not_at_3_fing_am.csv')
sample_timeFrame = point_from_lat_lon(sample_timeFrame)
sample_timeFrame.crs = {'init': 'epsg:4326'}
sample_timeFrame.head()

Unnamed: 0,id,mmsi,status,turn,speed,course,heading,lon,lat,ts,geom,velocity,datetime
0,18848090.0,227408710.0,0.0,0.0,0.0,258.1,511.0,-4.490398,48.379186,1456818000.0,POINT (-4.4903984 48.3791865),0.013741,2016-03-01 07:36:00
1,17585720.0,228853000.0,7.0,-127.0,6.169685,200.775984,511.0,-3.891684,47.73267,1456818000.0,POINT (-3.891684385826772 47.73266998031496),0.153449,2016-03-01 07:36:00
2,18848080.0,227686540.0,0.0,0.0,6.4,199.3,511.0,-4.464395,48.32778,1456818000.0,POINT (-4.464395 48.32778),6.91179,2016-03-01 07:36:00
3,17517930.0,226216000.0,7.0,0.0,0.0,196.639167,302.466667,-4.175265,47.835712,1456818000.0,POINT (-4.175265398333333 47.83571188333333),0.001396,2016-03-01 07:36:00
4,17517900.0,227270000.0,5.0,-127.0,0.0,0.0,511.0,-4.478274,48.383153,1456818000.0,POINT (-4.478274466666667 48.38315333333333),0.003177,2016-03-01 07:36:00


In [8]:
%%time
flocks_cluster = flocks(sample_timeFrame, 0.05)
flocks_cluster_filtered = flocks_cluster.drop(columns=['id', 'status', 'turn', 'speed', 'course', 'heading', 'lon', 'lat', 'ts', 'velocity'], axis=1).reset_index(drop=True)
flocks_cluster_filtered.crs = {'init': 'epsg:4326'}

tflst = []
for _, df in flocks_cluster_filtered.groupby('datetime'):
    tflst.append(df)

CPU times: user 1.15 s, sys: 4.14 ms, total: 1.15 s
Wall time: 1.18 s


In [9]:
def fix2(df):
    grouped = df.groupby('datetime')
    for  ind, (ts, group) in enumerate(list(grouped)[:-1]):
        print (ind, end='\r')
        for label, present in group.groupby('flock_label'):
            future = list(grouped)[ind+1][1]
            new_label = get_correct_label(present, future)
            df.loc[df.datetime == future.iloc[0].datetime, 'flock_label'] = future['flock_label'].apply(swap, args=((new_label, label),))
    return df

In [10]:
ndf = fix2(flocks_cluster_filtered)
ndf.head(20)

239

Unnamed: 0,mmsi,geom,datetime,flock_label
0,227408710.0,POINT (-4.4903984 48.3791865),2016-03-01 07:36:00,0
1,228853000.0,POINT (-3.891684385826772 47.73266998031496),2016-03-01 07:36:00,-1
2,227686540.0,POINT (-4.464395 48.32778),2016-03-01 07:36:00,0
3,226216000.0,POINT (-4.175265398333333 47.83571188333333),2016-03-01 07:36:00,-1
4,227270000.0,POINT (-4.478274466666667 48.38315333333333),2016-03-01 07:36:00,0
5,249297000.0,POINT (-4.454358866666666 48.38522166666666),2016-03-01 07:36:00,0
6,227142200.0,POINT (-4.327391599999999 48.09936),2016-03-01 07:36:00,1
7,227222000.0,POINT (-4.477003 48.38212),2016-03-01 07:36:00,0
8,227300000.0,POINT (-4.6446233 48.10234000000001),2016-03-01 07:36:00,2
9,227632830.0,POINT (-4.4594316 48.32465999999999),2016-03-01 07:36:00,0


In [11]:
%%time
ndf.rename(columns={'flock_label':'cluster_label'}, inplace=True)
flocks = gsgp.group_patterns_mining(ndf, mode='flocks')

CPU times: user 2.62 s, sys: 48.5 ms, total: 2.67 s
Wall time: 2.53 s


In [12]:
flocks

Unnamed: 0,flocks,start,end
0,"(220417000.0, 226263000.0, 227002330.0, 227003...",2016-03-01 07:36:00,2016-03-01 07:54:00
1,"(227114630.0, 227142200.0, 227162950.0, 227941...",2016-03-01 07:36:00,2016-03-01 08:22:00
2,"(220417000.0, 226263000.0, 227002330.0, 227006...",2016-03-01 07:50:00,2016-03-01 08:01:00
3,"(227003050.0, 228762000.0, 256494000.0)",2016-03-01 07:50:00,2016-03-01 07:56:00
4,"(220417000.0, 226263000.0, 227002330.0, 227006...",2016-03-01 07:57:00,2016-03-01 08:10:00
5,"(220417000.0, 227002330.0, 227006750.0, 227008...",2016-03-01 08:06:00,2016-03-01 08:13:00
6,"(220417000.0, 227006750.0, 227008170.0, 227016...",2016-03-01 08:09:00,2016-03-01 08:17:00
7,"(220417000.0, 227006750.0, 227008170.0, 227016...",2016-03-01 08:13:00,2016-03-01 08:29:00
8,"(227114630.0, 227142200.0, 227162950.0, 227941...",2016-03-01 08:18:00,2016-03-01 11:37:00
9,"(227006750.0, 227008170.0, 227016100.0, 227088...",2016-03-01 08:25:00,2016-03-01 09:06:00


In [171]:
properties = configparser.ConfigParser()
properties.read(os.path.join('.','sql_server.ini'))
properties = properties['SERVER']

host    = properties['host']
db_name = properties['db_name']
uname   = properties['uname']
pw      = properties['pw']
port    = properties['port']

ports_sql = 'SELECT * FROM ports.ports_of_brittany'

con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port = port)
# traj = gpd.GeoDataFrame.from_postgis(traj_sql, con, geom_col='geom' )
ports = gpd.GeoDataFrame.from_postgis(ports_sql, con, geom_col='geom' )
con.close()

ports.geom = ports.geom.apply(lambda x: x[0])

In [172]:
ports.head()

Unnamed: 0,gid,gml_id,por_id,libelle_po,insee_comm,por_x,por_y,geom
0,1,port.1,1,Le Vivier-sur-Mer,35361,297025.0,2408370.0,POINT (-1.771798868659233 48.60274269672541)
1,2,port.10,10,Saint-Samson sur Rance,22327,279335.0,2396060.0,POINT (-2.001990119062326 48.48369993456267)
2,3,port.100,100,Douarnenez,29046,103135.0,2365330.0,POINT (-4.341204251638414 48.09709590770091)
3,4,port.101,101,Brézellec,29028,79105.4,2364190.0,POINT (-4.661115947908725 48.06804110561076)
4,5,port.102,102,Sein,29083,64562.5,2362180.0,POINT (-4.852944548180974 48.03825273921113)


In [212]:
pd.set_option('display.max_colwidth', 1)
flocks.loc[(flocks.start >= pd.Timestamp(1456820940, unit='s')) & (flocks.start <= pd.Timestamp(1456830940, unit='s'))]

Unnamed: 0,flocks,start,end
10,"(226263000.0, 228762000.0, 234056000.0)",2016-03-01 08:50:00,2016-03-01 09:08:00
11,"(227006750.0, 227008170.0, 227016100.0, 227088590.0, 227148000.0, 227222000.0, 227270000.0, 227298110.0, 227303430.0, 227315110.0, 227322670.0, 227322690.0, 227369960.0, 227408710.0, 227574020.0, 227578460.0, 227588970.0, 227589520.0, 227590030.0, 227592820.0, 227611930.0, 227612860.0, 227631450.0, 227632830.0, 227635210.0, 227639660.0, 227640710.0, 227641920.0, 227650230.0, 227654220.0, 227666970.0, 227686540.0, 227690560.0, 227705102.0, 227730220.0, 227806500.0, 228005700.0, 228064900.0, 228186700.0, 228796000.0, 244630187.0, 249297000.0, 412069000.0, 477115900.0, 636015106.0)",2016-03-01 09:02:00,2016-03-01 09:39:00
12,"(228762000.0, 234056000.0, 256494000.0)",2016-03-01 09:05:00,2016-03-01 09:33:00
13,"(228762000.0, 228849000.0, 234056000.0, 256494000.0)",2016-03-01 09:11:00,2016-03-01 09:18:00
14,"(220417000.0, 228762000.0, 234056000.0, 256494000.0)",2016-03-01 09:29:00,2016-03-01 09:34:00
15,"(220417000.0, 228762000.0, 234056000.0)",2016-03-01 09:30:00,2016-03-01 09:41:00
16,"(227006750.0, 227008170.0, 227016100.0, 227088590.0, 227148000.0, 227222000.0, 227270000.0, 227298110.0, 227303430.0, 227315110.0, 227322670.0, 227322690.0, 227369960.0, 227408710.0, 227574020.0, 227578460.0, 227588970.0, 227589520.0, 227590030.0, 227592820.0, 227611930.0, 227612860.0, 227631450.0, 227632830.0, 227635210.0, 227639660.0, 227640710.0, 227641920.0, 227650230.0, 227666970.0, 227686540.0, 227690560.0, 227705102.0, 227730220.0, 227806500.0, 228005700.0, 228064900.0, 228186700.0, 228796000.0, 244630187.0, 249297000.0, 412069000.0, 477115900.0, 636015106.0)",2016-03-01 09:35:00,2016-03-01 09:53:00
17,"(227006750.0, 227008170.0, 227016100.0, 227088590.0, 227148000.0, 227222000.0, 227270000.0, 227298110.0, 227303430.0, 227315110.0, 227322670.0, 227322690.0, 227369960.0, 227408710.0, 227574020.0, 227578460.0, 227588970.0, 227589520.0, 227590030.0, 227592820.0, 227611930.0, 227612860.0, 227631450.0, 227632830.0, 227635210.0, 227639660.0, 227640710.0, 227650230.0, 227666970.0, 227686540.0, 227690560.0, 227705102.0, 227730220.0, 227806500.0, 228005700.0, 228064900.0, 228186700.0, 228796000.0, 244630187.0, 249297000.0, 412069000.0, 477115900.0, 636015106.0)",2016-03-01 09:49:00,2016-03-01 10:06:00
18,"(227006750.0, 227008170.0, 227016100.0, 227088590.0, 227148000.0, 227222000.0, 227270000.0, 227298110.0, 227303430.0, 227322670.0, 227322690.0, 227369960.0, 227408710.0, 227574020.0, 227578460.0, 227588970.0, 227589520.0, 227590030.0, 227592820.0, 227611930.0, 227612860.0, 227631450.0, 227632830.0, 227635210.0, 227639660.0, 227640710.0, 227650230.0, 227666970.0, 227686540.0, 227690560.0, 227705102.0, 227730220.0, 227806500.0, 228005700.0, 228064900.0, 228186700.0, 228796000.0, 244630187.0, 249297000.0, 412069000.0, 477115900.0, 636015106.0)",2016-03-01 10:02:00,2016-03-01 10:10:00
19,"(227006750.0, 227008170.0, 227016100.0, 227088590.0, 227148000.0, 227222000.0, 227270000.0, 227298110.0, 227303430.0, 227322670.0, 227369960.0, 227408710.0, 227574020.0, 227578460.0, 227588970.0, 227589520.0, 227590030.0, 227592820.0, 227611930.0, 227612860.0, 227631450.0, 227632830.0, 227635210.0, 227639660.0, 227640710.0, 227650230.0, 227666970.0, 227686540.0, 227690560.0, 227705102.0, 227730220.0, 227806500.0, 228005700.0, 228064900.0, 228186700.0, 228796000.0, 244630187.0, 249297000.0, 412069000.0, 477115900.0, 636015106.0)",2016-03-01 10:06:00,2016-03-01 10:11:00


In [184]:
# pd.set_option('display.max_colwidth')
# pd.reset_option('display.max_colwidth')

print(flocks.loc[7].flocks)
print(flocks.loc[7].start)
print(flocks.loc[7].end)
ndf.loc[(ndf.mmsi.isin(flocks.loc[7].flocks)) & (ndf.datetime == str(flocks.loc[7].start))]

(227322670.0, 227322690.0, 227578460.0, 227590030.0, 227611930.0, 227631450.0, 227639660.0, 227640710.0, 227641920.0, 227650230.0, 227686540.0, 227690560.0)
2016-03-01 07:45:00
2016-03-01 08:29:00


Unnamed: 0,mmsi,geom,datetime,flock_label
666,227322690.0,POINT (-4.465199333333334 48.32548766666667),2016-03-01 07:45:00,3
671,227322670.0,POINT (-4.4636984 48.30918),2016-03-01 07:45:00,3
689,227690560.0,POINT (-4.466505000000001 48.325367),2016-03-01 07:45:00,3
691,227611930.0,POINT (-4.4666615 48.309265),2016-03-01 07:45:00,3
692,227641920.0,POINT (-4.4650183 48.317814),2016-03-01 07:45:00,3
694,227639660.0,POINT (-4.46712 48.30873579999999),2016-03-01 07:45:00,3
705,227650230.0,POINT (-4.467043400000001 48.310085),2016-03-01 07:45:00,3
718,227590030.0,POINT (-4.45355 48.312744),2016-03-01 07:45:00,3
720,227640710.0,POINT (-4.4577084 48.3181),2016-03-01 07:45:00,3
723,227686540.0,POINT (-4.467355 48.312546),2016-03-01 07:45:00,3


In [182]:
# pd.set_option('display.max_colwidth')
# pd.reset_option('display.max_colwidth')

print(flocks.loc[8].flocks, '\n\n')
for i in range(len(flocks)):
    tmp = ndf.loc[(ndf.mmsi.isin(flocks.loc[i].flocks)) & (ndf.datetime == str(flocks.loc[i].start))].geom
    print (f'Iter: {i}, #Ships: {len(tmp)}, MeanDist:{tmp.apply(gspp.distance_to_nearest_port, args=(ports,)).mean()}')

(226263000.0, 227002330.0, 227006750.0, 227008170.0, 227016100.0, 227088590.0, 227148000.0, 227222000.0, 227270000.0, 227298110.0, 227369960.0, 227408710.0, 227574020.0, 227592820.0, 227612860.0, 227635210.0, 227730220.0, 227806500.0, 228796000.0, 244630187.0, 412069000.0) 


Iter: 0, #Ships: 20, MeanDist:0.01381202186327182
Iter: 1, #Ships: 3, MeanDist:0.014077797155365388
Iter: 2, #Ships: 4, MeanDist:0.039788764551440245
Iter: 3, #Ships: 9, MeanDist:0.05137497264987581
Iter: 4, #Ships: 11, MeanDist:0.051605692351094024
Iter: 5, #Ships: 13, MeanDist:0.05159072844020249
Iter: 6, #Ships: 6, MeanDist:0.028284929591582145
Iter: 7, #Ships: 12, MeanDist:0.05184943258992442
Iter: 8, #Ships: 21, MeanDist:0.014141497326446806
Iter: 9, #Ships: 7, MeanDist:0.027513139772254926
Iter: 10, #Ships: 20, MeanDist:0.013852488081517896
Iter: 11, #Ships: 19, MeanDist:0.014006563087170387
Iter: 12, #Ships: 18, MeanDist:0.01415342809766721
Iter: 13, #Ships: 5, MeanDist:0.025639731654538127
Iter: 14, #Ships

In [20]:
ndf.crs = {'init': 'epsg:4326'}

#plots
for cnt, (ind, x) in enumerate(ndf.groupby('datetime')):
    if cnt>20: break
    try: 
        #gsplt.map_plot(x.loc[x.flock_label == 0])
        ax = x.to_crs(epsg=3857).plot(figsize=(20,20), column='flock_label', cmap="tab20")
        ctx.add_basemap(ax, zoom=10)
        plt.show()
    except:
        continue

# PLAYGROUND

In [None]:
def euclid(x_0, y_0, x_1, y_1):
    return ((x_0 - x_1)**2 + (y_0 - y_1)**2)**(1/2)
def calculate_haversine(latA, lonA, latB, lonB):
    return haversine((latA, lonA), (latB, lonB))


d = pd.DataFrame({'a' : [0, 1, 2], 
                  'b' : [1, 2, 3]}, columns=['a', 'b'])
d.T.apply(lambda x: d.T.apply(lambda y: haversine((x[0], x[1]), (y[0], y[1]))))
d.T.apply(lambda x: euclid(x[0], x[1], d['a'], d['b']))

In [None]:
cluster_history.loc[cluster_history.apply(len) <= 100].index

In [None]:
time_threshold = 5

# old_clhst = len(cluster_history)
# # Clear the Redundant Entries
# cluster_history = cluster_history.loc[cluster_history.apply(len) >= time_threshold]
# print (f'Entries Cleared: {old_clhst - len(cluster_history)}')

# cluster_history_window = cluster_history.apply(lambda x: pd.DataFrame(x).rolling(time_threshold))
cluster_history_window = cluster_history.apply(lambda x: window(x, time_threshold))

In [None]:
tmp = cluster_history_window.apply(next)

In [None]:
tmp

In [None]:
tmp2 = tmp.apply(tuple)
# tmp2
for _, Y in tmp2.groupby(tmp2):
#     print (list(Y.index))
    print (Y)    

In [None]:
cluster_history = ndf.sort_values('datetime').groupby('mmsi')['flock_label'].apply(list)
cluster_history

In [None]:
cluster_dates = ndf.sort_values('datetime').groupby('mmsi')['datetime'].apply(min)
cluster_dates

In [None]:
cluster_history.loc[cluster_history.datetime == cluster_history.datetime.min()]

In [None]:
cluster_history_window = ndf.groupby('mmsi').agg({'flock_label': lambda x: window(list(x), time_threshold), 'datetime': lambda x: pd.Timestamp(min(x))})
cluster_history_window

In [None]:
cluster_history_window.at[:, 'datetime'] = cluster_history_window['datetime'].apply(lambda x: x + pd.offsets.Minute(1))
cluster_history_window

In [None]:
cluster_history_window['datetime'] += pd.offsets.Minute(1)
# = cluster_history_window['datetime'].apply(lambda x: x )

In [None]:
cluster_history_window

In [None]:
cluster_history = ndf.groupby('mmsi').agg({'flock_label': lambda x: list(x), 'datetime': 'min'})
cluster_history

In [None]:
cluster_history.at[:, 'flock_label'] = cluster_history['flock_label'].apply(lambda x: window(x, 5))
cluster_history

In [None]:
def hasNext(x):
    try:
        return next(x)
    except StopIteration:
        return []

In [None]:
flocks = pd.DataFrame([], columns=['flocks', 'start', 'end'])
time_threshold = 5
min_samples = 3

cluster_history_window = ndf.groupby('mmsi').agg({'flock_label': lambda x: window(list(x), time_threshold), 'datetime': lambda x: pd.Timestamp(min(x))})

while (len(cluster_history_window) != 0):
    # Set the Start of Time
    startTime = cluster_history_window.datetime.min()
    endTime = startTime + pd.offsets.Minute(time_threshold)
    print (f'Datetime of Interest: {startTime}', end='\r')
    
    # Get the History Window according to the above Timestamps
    timeFrameClusters = cluster_history_window.loc[cluster_history_window.datetime == startTime]['flock_label'].apply(lambda x: hasNext(x)).apply(tuple)
    # Group by the History Window
    for label_hist_window, mmsis in timeFrameClusters.groupby(timeFrameClusters):
        if ((len(mmsis.index) >= min_samples) and (-1 not in label_hist_window)):
            foi = flocks.loc[flocks.flocks.apply(tuple) == tuple(mmsis.index)]
#             if (-1 in label_hist_window):
#                 continue
                # TODO - Refine Here the End Timestamp for Existing Flocks (Minor)
#             else:
            if (len(foi) != 0):
                flocks.at[foi.index[0], 'end'] = endTime
            else:
                newFlockRow = pd.DataFrame([{'flocks': tuple(mmsis.index),  'start': startTime, 'end': endTime}], columns=['flocks', 'start', 'end'])                     
                flocks = flocks.append(newFlockRow, ignore_index=True)
    
    # Prepare for Next Iteration
    #     * Clean the Redundant Records
    ioi = timeFrameClusters.loc[timeFrameClusters.apply(len) == 0].index
    cluster_history_window.drop(list(ioi), inplace=True)
    #     * Refresh the Start of Time Timestamp
    cluster_history_window['datetime'] += pd.offsets.Minute(1)
    
    # TODO - Handle End-Of-Time Gracefully (Minor)
    if (startTime > pd.Timestamp(ndf.datetime.max())):
        break