In [12]:
import pandas as pd
import numpy as np
from haversine import haversine
import networkx as nx
from tqdm import tqdm_notebook as tqdm

df = pd.read_csv('4h_timeFrame_not_at_3_fing_am.csv')

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
df.head()

Unnamed: 0,id,mmsi,status,turn,speed,course,heading,lon,lat,ts,geom,velocity,datetime
0,18848090.0,227408710.0,0.0,0.0,0.0,258.1,511.0,-4.490398,48.379186,1456818000.0,POINT (-4.4903984 48.3791865),0.013741,2016-03-01 07:36:00
1,17585720.0,228853000.0,7.0,-127.0,6.169685,200.775984,511.0,-3.891684,47.73267,1456818000.0,POINT (-3.891684385826772 47.73266998031496),0.153449,2016-03-01 07:36:00
2,18848080.0,227686540.0,0.0,0.0,6.4,199.3,511.0,-4.464395,48.32778,1456818000.0,POINT (-4.464395 48.32778),6.91179,2016-03-01 07:36:00
3,17517930.0,226216000.0,7.0,0.0,0.0,196.639167,302.466667,-4.175265,47.835712,1456818000.0,POINT (-4.175265398333333 47.83571188333333),0.001396,2016-03-01 07:36:00
4,17517900.0,227270000.0,5.0,-127.0,0.0,0.0,511.0,-4.478274,48.383153,1456818000.0,POINT (-4.478274466666667 48.38315333333333),0.003177,2016-03-01 07:36:00


In [4]:
def pairs_in_radius(df, diam=1000):
    res = []
    for ind_i, ind_j, val_i, val_j in nparray_combinations(df):
        dist = haversine(val_i, val_j)*1000
        if (dist<diam):
            res.append((ind_i,ind_j))   
    return res
            

In [5]:
def connected_edges(data):
    G = nx.Graph()
    G.add_edges_from(data)
    return [sorted(list(cluster)) for cluster in nx.find_cliques(G)]

In [6]:
def nparray_combinations(arr):
    for i in range(arr.shape[0]):
        for j in range(i+1, arr.shape[0]):
            yield i, j, arr[i,:], arr[j,:]

In [7]:
def eval_candidate(candidate, pairs):
    for pair in itertools.combinations(candidate,2):
        if pair not in pairs:
            return False
    return True

In [8]:
def translate(sets, sdf):
    return [sorted(tuple([sdf.iloc[point].mmsi for point in points])) for points in sets]

In [9]:
def circle_cluster(timeframe, min_cardinality = 2):
    pairs = pairs_in_radius(timeframe[['lon', 'lat']].values)
    full_sets = connected_edges(pairs)
    clusters = []
    for individual_sets in full_sets:
        # MAYBE FROM BIG TO SMALL INSTEAD OF SMALL TO BIG
        for j in range(min_cardinality,len(individual_sets)+1):
            for candidate in itertools.combinations(individual_sets,j):
                 if eval_candidate(candidate, pairs):
                        clusters.append(sorted(candidate))
    return clusters

In [10]:
def get_circular_clusters(timeframe):
    pairs = pairs_in_radius(timeframe[['lon', 'lat']].values)
    return connected_edges(pairs)
    

In [11]:
def find_existing_flocks(x, present, past, last_ts):
    # find the indices of past Dataframe where current cluster is subset of flock
    indcs = [set(x.clusters) < set(val) for val in past.loc[past.et==last_ts].clusters.values]
    # get the indices of the past dataframe where that occurs
    past.loc[(indcs)].index.tolist()

def replace_with_existing_flocks(x, to_keep, past):
    
    if to_keep.iloc[x.name]:
        if len(past.iloc[to_keep.iloc[x.name]])>1:
            raise Exception('len > 1, something is wrong')

        x.dur = past.iloc[to_keep.iloc[x.name]].dur.values[0] +1
        x.st = past.iloc[to_keep.iloc[x.name]].st.values[0]
    return x

def get_current_clusters(sdf, ts):
    present = pd.DataFrame([[tuple(val)] for (val) in translate(get_circular_clusters(sdf), sdf)], columns=['clusters'])
    present['st'] = present['et'] = ts
    present['dur'] = 1
    return present

def present_is_subset_of_past(present, past, last_ts):
    
    to_keep = present.apply(find_existing_flocks, args=(present,past,last_ts,) , axis=1)
#     print(to_keep)
    present = present.apply(replace_with_existing_flocks, args=(to_keep,past,), axis=1)
    
    return present


def past_is_subset_or_set_of_present(present, past, ts, last_ts):
#     get if tuple of tmp1 is subset or equal of a row of tmp2
    ## Present > Past
    to_keep = past.apply(lambda x: (True in [set(x.clusters) <= set(val) for val in present.clusters.values]) and (x.et == last_ts), axis=1)

#     df_to_keep = past.loc[to_keep]

    past.loc[to_keep,'et'] = ts
    past.loc[to_keep,'dur']= past.loc[to_keep].dur.apply(lambda x : x+1)
    return past

def get_flocks(new_clusters, clusters_to_keep):
    return pd.concat([new_clusters,clusters_to_keep]).reset_index(drop=True)
    

In [13]:
log = []
keep = pd.DataFrame()

for ind, (ts, sdf) in tqdm(enumerate(df.groupby('datetime')), total=df.datetime.nunique()):

    present = get_current_clusters(sdf, ts)
    if ind == 0:
        past = present
        last_ts = ts
        continue
        
    new_subsets = present_is_subset_of_past(present, past, last_ts)
    
    old_subsets_or_sets = past_is_subset_or_set_of_present(present, past, ts, last_ts)

#     past = get_flocks(new_subsets, old_subsets_or_sets)

#     past = past.loc[(past.et==ts) | (past.dur>10)] 
    print(sum(new_subsets['clusters'].duplicated()),sum(old_subsets_or_sets['clusters'].duplicated()),sum(past['clusters'].duplicated()))
    last_ts = ts
    if ind==1:
        break
# fn_keep = keep.drop_duplicates(['clusters'], keep='last')

HBox(children=(IntProgress(value=0, max=241), HTML(value='')))

0 0 0



In [14]:
new_subsets`

Unnamed: 0,clusters,st,et,dur
0,"(227632830.0, 227686540.0)",2016-03-01 07:36:00,2016-03-01 07:36:00,1
1,"(227322670.0, 227686540.0)",2016-03-01 07:36:00,2016-03-01 07:37:00,2
2,"(226263000.0, 227002330.0, 227006750.0, 227008...",2016-03-01 07:36:00,2016-03-01 07:37:00,2
3,"(227002330.0, 227006750.0, 227016100.0, 227222...",2016-03-01 07:36:00,2016-03-01 07:37:00,2
4,"(249297000.0, 477115900.0)",2016-03-01 07:36:00,2016-03-01 07:37:00,2
5,"(227142200.0, 227162950.0, 227941000.0)",2016-03-01 07:36:00,2016-03-01 07:37:00,2
6,"(227303430.0, 227632830.0)",2016-03-01 07:36:00,2016-03-01 07:37:00,2
7,"(227578460.0, 227611930.0, 227639660.0, 227650...",2016-03-01 07:36:00,2016-03-01 07:37:00,2
8,"(227322670.0, 227611930.0, 227639660.0, 227650...",2016-03-01 07:36:00,2016-03-01 07:37:00,2
9,"(227588970.0, 227590030.0)",2016-03-01 07:36:00,2016-03-01 07:37:00,2


In [81]:
new_subsets

Unnamed: 0,clusters,st,et,dur
0,"(226263000.0, 227002330.0, 227006750.0, 227008...",2016-03-01 07:37:00,2016-03-01 07:37:00,1
1,"(227002330.0, 227006750.0, 227016100.0, 227222...",2016-03-01 07:37:00,2016-03-01 07:37:00,1
2,"(227322670.0, 227578460.0, 227611930.0, 227639...",2016-03-01 07:37:00,2016-03-01 07:37:00,1
3,"(227578460.0, 227611930.0, 227631450.0, 227639...",2016-03-01 07:37:00,2016-03-01 07:37:00,1
4,"(227322670.0, 227639660.0, 227686540.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
5,"(227142200.0, 227162950.0, 227941000.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
6,"(227640710.0, 227654220.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
7,"(227315110.0, 227666970.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
8,"(227641920.0, 227686540.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
9,"(227588970.0, 227590030.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1


In [None]:
def tstfunc(x, past):
    if x:
        return past.iloc[x].st, past.iloc[x].dur

In [31]:
past.loc[past.dur==11]

Unnamed: 0,clusters,st,et,dur
224,"(227322670.0, 227686540.0)",2016-03-01 07:36:00,2016-03-01 07:46:00,11
225,"(226263000.0, 227002330.0, 227006750.0, 227008...",2016-03-01 07:36:00,2016-03-01 07:46:00,11
226,"(227002330.0, 227006750.0, 227016100.0, 227222...",2016-03-01 07:36:00,2016-03-01 07:46:00,11
227,"(249297000.0, 477115900.0)",2016-03-01 07:36:00,2016-03-01 07:46:00,11
228,"(227142200.0, 227162950.0, 227941000.0)",2016-03-01 07:36:00,2016-03-01 07:46:00,11
229,"(227303430.0, 227632830.0)",2016-03-01 07:36:00,2016-03-01 07:46:00,11
230,"(227578460.0, 227611930.0, 227639660.0, 227650...",2016-03-01 07:36:00,2016-03-01 07:46:00,11
231,"(227322670.0, 227611930.0, 227639660.0, 227650...",2016-03-01 07:36:00,2016-03-01 07:46:00,11
237,"(227298110.0, 227369960.0, 227408710.0, 227574...",2016-03-01 07:36:00,2016-03-01 07:46:00,11
239,"(226263000.0, 227002330.0, 227008170.0, 227088...",2016-03-01 07:36:00,2016-03-01 07:46:00,11


In [164]:
pd.Series(new_clusters).apply(tstfunc, args=(past,))

0                             None
1                             None
2                             None
3                             None
4                             None
5                             None
6                             None
7                             None
8                             None
9                             None
10                            None
11                            None
12                            None
13                            None
14                            None
15                            None
16                            None
17    ([2016-03-01 07:36:00], [1])
18                            None
19                            None
20    ([2016-03-01 07:36:00], [1])
21    ([2016-03-01 07:36:00], [1])
22                            None
23                            None
24                            None
dtype: object

In [213]:
present

Unnamed: 0,clusters,st,et,dur
0,"(226263000.0, 227002330.0, 227006750.0, 227008...",2016-03-01 07:37:00,2016-03-01 07:37:00,1
1,"(227002330.0, 227006750.0, 227016100.0, 227222...",2016-03-01 07:37:00,2016-03-01 07:37:00,1
2,"(227322670.0, 227578460.0, 227611930.0, 227639...",2016-03-01 07:37:00,2016-03-01 07:37:00,1
3,"(227578460.0, 227611930.0, 227631450.0, 227639...",2016-03-01 07:37:00,2016-03-01 07:37:00,1
4,"(227322670.0, 227639660.0, 227686540.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
5,"(227142200.0, 227162950.0, 227941000.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
6,"(227640710.0, 227654220.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
7,"(227315110.0, 227666970.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
8,"(227641920.0, 227686540.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1
9,"(227588970.0, 227590030.0)",2016-03-01 07:37:00,2016-03-01 07:37:00,1


In [227]:
present = present.apply(replace_with_existing_flocks, args=(new_clusters,past,),axis=1)

clusters    (227142200.0, 227162950.0, 227941000.0)
st                              2016-03-01 07:38:00
et                              2016-03-01 07:38:00
dur                                               1
Name: 0, dtype: object
clusters    (227142200.0, 227162950.0, 227941000.0)
st                              2016-03-01 07:38:00
et                              2016-03-01 07:38:00
dur                                               1
Name: 0, dtype: object
clusters    (227303430.0, 227632830.0)
st                 2016-03-01 07:38:00
et                 2016-03-01 07:38:00
dur                                  1
Name: 1, dtype: object
clusters    (227003050.0, 228064900.0)
st                 2016-03-01 07:38:00
et                 2016-03-01 07:38:00
dur                                  1
Name: 2, dtype: object
clusters    (227640710.0, 227654220.0)
st                 2016-03-01 07:38:00
et                 2016-03-01 07:38:00
dur                                  1
Name: 3, dtype: object
c

In [228]:
present

Unnamed: 0,clusters,st,et,dur
0,"(227142200.0, 227162950.0, 227941000.0)",2016-03-01 07:38:00,2016-03-01 07:38:00,1
1,"(227303430.0, 227632830.0)",2016-03-01 07:38:00,2016-03-01 07:38:00,1
2,"(227003050.0, 228064900.0)",2016-03-01 07:38:00,2016-03-01 07:38:00,1
3,"(227640710.0, 227654220.0)",2016-03-01 07:38:00,2016-03-01 07:38:00,1
4,"(227322670.0, 227639660.0, 227650230.0, 227686...",2016-03-01 07:38:00,2016-03-01 07:38:00,1
5,"(227322670.0, 227578460.0, 227611930.0, 227631...",2016-03-01 07:38:00,2016-03-01 07:38:00,1
6,"(249297000.0, 477115900.0)",2016-03-01 07:38:00,2016-03-01 07:38:00,1
7,"(227322670.0, 227589520.0)",2016-03-01 07:38:00,2016-03-01 07:38:00,1
8,"(228064900.0, 228186700.0)",2016-03-01 07:38:00,2016-03-01 07:38:00,1
9,"(227315110.0, 227666970.0)",2016-03-01 07:38:00,2016-03-01 07:38:00,1
