In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
from random import choice
pd.options.mode.chained_assignment = None 
import configparser
import pickle
import psycopg2
import psycopg2.extras
import contextily as ctx

In [2]:
import os, sys
sys.path.append(os.path.join(os.path.expanduser('~'), 'Documents/Insert-Generic-Name-Here'))
# sys.path

from lonelyboy.geospatial import plots as gsplt
from lonelyboy.geospatial import preprocessing as gspp
from lonelyboy.timeseries import lbtimeseries as tspp
# from lonelyboy.geospatial import group_patterns as gsgp


# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
import PyQt5
import matplotlib.pyplot as plt
from matplotlib import style;  style.use('ggplot')

style.use('ggplot')

# Select the interactive Qt backend. `InteractiveShell.magic()` is deprecated;
# `run_line_magic(name, line)` is the supported way to invoke a line magic from code.
get_ipython().run_line_magic('matplotlib', 'qt')
# get_ipython().run_line_magic('matplotlib', 'inline')

In [3]:
# Default figure geometry: fixed width, height derived from the 1.618 aspect ratio.
PLT_IMAGE_WIDTH = 3.748
PLT_IMAGE_HEIGHT = PLT_IMAGE_WIDTH / 1.618

# Global matplotlib defaults: LaTeX text rendering, 8pt sans-serif font,
# and the figure size defined above.
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'sans-serif',
    'font.size': 8,
    'figure.figsize': (PLT_IMAGE_WIDTH, PLT_IMAGE_HEIGHT),
})

In [4]:
def tuple_str_to_tuple(_str_):
    """Parse the ``str()`` form of a tuple of integers back into a tuple.

    Parameters
    ----------
    _str_ : str
        Text such as ``'(1, 2, 3)'``; the first and last characters (the
        brackets) are discarded and the rest is split on commas.

    Returns
    -------
    tuple of int
        The parsed integers. ``'()'`` gives ``()`` and the one-element form
        ``'(5,)'`` gives ``(5,)``.

    Raises
    ------
    ValueError
        If a non-empty token is not a valid integer literal.
    """
    inner = _str_.strip()[1:-1]
    # Skip empty tokens: they come from an empty tuple or from the trailing
    # comma of a one-element tuple, and int('') would raise.
    return tuple(int(token) for token in inner.split(',') if token.strip())

def tuple_str_to_list(_str_):
    """Parse the ``str()`` form of a tuple (or list) of integers into a list.

    Parameters
    ----------
    _str_ : str
        Text such as ``'(1, 2, 3)'``; the first and last characters (the
        brackets) are discarded and the rest is split on commas.

    Returns
    -------
    list of int
        The parsed integers. ``'()'`` gives ``[]`` and the one-element form
        ``'(5,)'`` gives ``[5]``.

    Raises
    ------
    ValueError
        If a non-empty token is not a valid integer literal.
    """
    inner = _str_.strip()[1:-1]
    # Skip empty tokens: they come from an empty tuple or from the trailing
    # comma of a one-element tuple, and int('') would raise.
    return [int(token) for token in inner.split(',') if token.strip()]

# Reading Data (mode: Flocks, card:5, dt:10, dist:2778)

In [15]:
# Load the mined flock patterns. Going by the preview below, the columns are
# clusters, st, et and dur, plus the unnamed index column written with the CSV.
df = pd.read_csv('./data/csv/flocks_card_5_dt_10_dist_2778.csv')
# Row count, followed by a preview of the first rows.
print(len(df))
df.head()

12911


Unnamed: 0,clusters,st,et,dur
0,"(227574020, 227592820, 227612860, 227635210, 2...",2015-10-01 06:21:00,2015-10-01 06:43:00,23
1,"(227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 07:41:00,2015-10-01 07:51:00,11
2,"(227574020, 227612860, 227635210, 227705102, 2...",2015-10-01 07:41:00,2015-10-01 07:53:00,13
3,"(227005550, 227574020, 227592820, 227612860, 2...",2015-10-01 07:28:00,2015-10-01 07:57:00,30
4,"(227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 08:08:00,2015-10-01 08:22:00,15


In [None]:
# Scratch check: indexing a NumPy array with a list of booleans keeps the
# elements whose flag is True.
a = np.array([227142200, 228160000, 228167900])
a[[True, False, True]]

In [None]:
# Reference arrays used below to test flock membership via np.isin.
# Presumably the MMSIs of fishing vessels and of cargo vessels (going by the file names) — confirm.
mmsis_fisheries = np.load('./data/pkl/fisheries_mmsis.npy')
mmsis_cargos = np.load('./data/pkl/cargos_mmsis.npy')

In [None]:
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
ports = pd.read_pickle('./data/pkl/ports_raw.pkl')
# Keep only the rows whose `libelle_po` is Brest or Douarnenez.
ports = ports.loc[ports.libelle_po.isin(['Brest', 'Douarnenez'])]

# Getting the flocks that have **ONLY** fisheries

In [None]:
# Boolean masks over `df`: True where every vessel of the flock appears in the given
# reference array. Only the `clusters` column is needed, so apply on that column
# instead of a row-wise DataFrame.apply(axis=1); this also yields an empty mask
# (rather than an empty DataFrame) when `df` has no rows.
flocks_of_fisheries_indices = df.clusters.apply(lambda c: np.isin(tuple_str_to_list(c), mmsis_fisheries).all())
flocks_of_cargos_indices = df.clusters.apply(lambda c: np.isin(tuple_str_to_list(c), mmsis_cargos).all())

In [None]:
# Take an explicit copy: the parsed column is then written to an independent frame
# instead of to a slice of `df` (the chained-assignment warning is disabled globally
# in this notebook, so such writes would otherwise go unnoticed).
flocks_of_fisheries = df[flocks_of_fisheries_indices].copy()
# Replace the stringified tuples with lists of integer MMSIs.
flocks_of_fisheries['clusters'] = flocks_of_fisheries['clusters'].apply(tuple_str_to_list)
# flocks_of_fisheries

In [None]:
len(flocks_of_fisheries) # Before Thorough Filtering

In [None]:
# Take an explicit copy: the parsed column is then written to an independent frame
# instead of to a slice of `df` (the chained-assignment warning is disabled globally
# in this notebook, so such writes would otherwise go unnoticed).
flocks_of_cargos = df[flocks_of_cargos_indices].copy()
# Replace the stringified tuples with lists of integer MMSIs.
flocks_of_cargos['clusters'] = flocks_of_cargos['clusters'].apply(tuple_str_to_list)
# flocks_of_cargos

In [None]:
len(flocks_of_cargos) # Before Thorough Filtering

In [None]:
flocks_of_cargos.reset_index(drop=True)

In [None]:
flocks_of_fisheries.reset_index(drop=True)

# Now comes the interesting part... finding a good flock to plot

In [75]:
# Database connection settings are read from the [SERVER] section of ./sql_server.ini.
properties = configparser.ConfigParser()
# NOTE(review): ConfigParser.read() silently skips files it cannot open, so a missing
# .ini only surfaces as a KeyError on 'SERVER' in the next line.
properties.read(os.path.join('.','sql_server.ini'))
properties = properties['SERVER']

host    = properties['host']
db_name = properties['db_name']
uname   = properties['uname']
pw      = properties['pw']
port    = properties['port']
# Query template filled with Python %-formatting as (tuple of MMSIs, start, end).
# NOTE(review): the values are interpolated into the SQL text without escaping, and a
# one-element tuple renders as "(x,)" — presumably invalid SQL; confirm before reuse.
query = "select * from ais_data.dynamic_ships_min_trip_card_3_segmented_12h_resampled_1min_v2 where mmsi in %s and datetime between '%s' and '%s';"

In [None]:
gp_idx = 16787 # best for fisheries so far
# gp_idx = 26
# NOTE(review): the CSV loaded in this notebook has 12911 rows, so 16787 is out of range
# for it — confirm which input file this index referred to.

# Select the fields by name. Position 0 of the frame is the CSV's unnamed index column,
# so the positional lookups (0, 1, 2) returned 'Unnamed: 0', clusters and st.
gp_cluster, gp_st, gp_et = flocks_of_fisheries[['clusters', 'st', 'et']].iloc[gp_idx]

# Open a connection for this query (none is created elsewhere at notebook level)
# and release it even if the query fails.
con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)
try:
    traj = pd.read_sql_query(query%(tuple(gp_cluster), gp_st, gp_et), con=con)
finally:
    con.close()
traj = gspp.gdf_from_df(traj)

df1 = traj.copy()
df2 = ports.copy()

# Trajectories, coloured per vessel, reprojected to EPSG:3857 for the basemap.
# The figure-size constants defined in this notebook are PLT_IMAGE_WIDTH / PLT_IMAGE_HEIGHT.
df1.crs = {'init': 'epsg:4326'}
ax = df1.to_crs(epsg=3857).plot(figsize=(PLT_IMAGE_WIDTH, PLT_IMAGE_HEIGHT), column='mmsi', cmap='tab20')

# Ports are drawn on the existing axes, so no figsize is passed here.
df2.crs = {'init': 'epsg:4326'}
df2.to_crs(epsg=3857).plot(color='black', ax=ax)

ctx.add_basemap(ax, attribution='')
ax.margins(0)
ax.tick_params(left=False, labelleft=False, bottom=False, labelbottom=False)
plt.title(f'Vessels: {gp_cluster},\nStart: {gp_st}, End:{gp_et}', size=8)

# **#1** Average Duration per Group Pattern

In [6]:
# Duration of every pattern in whole minutes (end minus start).
# - The timestamps are parsed with one vectorized pd.to_datetime call per column; the
#   former `.apply(pd.to_datetime, 's')` passed 's' as Series.apply's second positional
#   argument, not as a unit to to_datetime.
# - Floor-dividing the timedelta by one minute uses the full span; `Timedelta.seconds`
#   excludes the days component, so spans of a day or more would wrap around.
df['duration'] = (pd.to_datetime(df.et) - pd.to_datetime(df.st)) // pd.Timedelta(minutes=1)
df.head()

Unnamed: 0,clusters,st,et,dur,duration
0,"(227574020, 227592820, 227612860, 227635210, 2...",2015-10-01 06:21:00,2015-10-01 06:43:00,23,22
1,"(227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 07:41:00,2015-10-01 07:51:00,11,10
2,"(227574020, 227612860, 227635210, 227705102, 2...",2015-10-01 07:41:00,2015-10-01 07:53:00,13,12
3,"(227005550, 227574020, 227592820, 227612860, 2...",2015-10-01 07:28:00,2015-10-01 07:57:00,30,29
4,"(227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 08:08:00,2015-10-01 08:22:00,15,14


In [7]:
df['duration'].describe()

count    12911.000000
mean        39.217489
std         64.344426
min         10.000000
25%         13.000000
50%         19.000000
75%         38.000000
max       1432.000000
Name: duration, dtype: float64

In [9]:
# Bin the durations on a doubling scale between the observed minimum and maximum.
# include_lowest=True closes the first bin on the left: with the default right-closed
# intervals, patterns whose duration equals the minimum fall outside every bin and are
# silently left out of the counts.
# NOTE(review): the edges must be increasing, i.e. this assumes min < 16 and max > 1024.
out = pd.cut(df['duration'],
             [df['duration'].min(), 16, 32, 64, 128, 256, 512, 1024, df['duration'].max()],
             include_lowest=True)

ax = out.value_counts(sort=False).plot.bar(figsize=(PLT_IMAGE_WIDTH, PLT_IMAGE_HEIGHT), fontsize=8, width=0.75, cmap='tab20', rot=35)

# plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
plt.suptitle(r'\textbf{Average Duration per Group Pattern (Flocks)}', fontsize=8, y=1)
plt.xlabel(r'\textbf{Duration (minutes)}', fontsize=8)
plt.ylabel(r'\textbf{\#Patterns}', fontsize=8)

plt.savefig(os.path.join('..', 'AvgDurationPerGroupPattern_Flocks_NoSciNotation.pdf'), dpi=300, bbox_inches='tight')
# plt.savefig(os.path.join('..', 'AvgDurationPerGroupPattern_Flocks.pdf'), dpi=300, bbox_inches='tight')

# **#2** Distribution of Group Patterns Size (Flocks)

In [67]:
# Number of vessels in each pattern ...
cluster_size = df.clusters.apply(tuple_str_to_list).apply(len)
# ... and how many patterns exist per size. pandas >= 2.0 names the value_counts()
# result 'count' instead of reusing the column name, so pin the name 'clusters'
# that the plotting cells look up.
cluster_size = cluster_size.value_counts().rename('clusters')
cluster_size = cluster_size.to_frame()
# cluster_size

In [68]:
PLT_IMAGE_WIDTH_PC = 4.748
PLT_IMAGE_HEIGHT_PC = PLT_IMAGE_WIDTH_PC/1.618

# Read the counts positionally: the single column of `cluster_size` is called
# 'clusters' on pandas < 2.0 and 'count' on later versions.
df_piechart = pd.DataFrame({'#Clusters':cluster_size.iloc[:, 0].values},
                           index=pd.Series(cluster_size.index.values).apply(lambda x: f'{x} Vessels'))
# Share of each group size, in percent of all patterns.
percent = 100.*df_piechart['#Clusters']/df_piechart['#Clusters'].sum()

ax = df_piechart.plot.pie(y='#Clusters', 
                   figsize=(PLT_IMAGE_WIDTH_PC, PLT_IMAGE_HEIGHT_PC),
                   startangle=0,
                   autopct='', 
                   pctdistance=0.80,
                   cmap='tab20',
                   legend=False)
ax.axis('equal')

# Legend entries "<n> Vessels - <share>%". The percent sign is written as an escaped
# LaTeX `\%`; the previous '\%%' was an invalid Python escape followed by a bare '%',
# which starts a LaTeX comment when usetex is enabled.
labels = ['{0}\t-\t{1:1.3f}\\%'.format(i,j) for i,j in zip(df_piechart.index, percent)]

# The legend is anchored in figure coordinates, to the right of the pie.
plt.legend(ax.patches, labels, loc='center right', 
           bbox_to_anchor=(0.9,0.5), fontsize=8, bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0., bottom=0.1, right=0.75)


# Shrink every text on the axes and paint the ones containing a space white.
for text in ax.texts:
    text.set_fontsize(8)
    if ' ' in text.get_text():
        text.set_color('white')

plt.ylabel('')
plt.title(r'\textbf{Distribution of Group Patterns (Flocks) Size}', fontsize=8)
plt.savefig('../DistributionOfGroupPatternsSize_Flocks_V2.pdf', dpi=300, bbox_inches='tight')

In [None]:
# df_piechart = pd.DataFrame({'#Clusters':cluster_size['clusters'].values},
#                  index=pd.Series(cluster_size.index.values).apply(lambda x: f'{x} Vessels'))
# # print (df_piechart)

# ax = df_piechart.plot.pie(y='#Clusters', 
#                    figsize=(PLT_FIG_WIDTH, PLT_FIG_HEIGHT),
#                    startangle=0,
#                    autopct='%1.1f%%', 
#                    pctdistance=0.80,
#                    cmap='tab20',
#                    legend=True)
# ax.axis('equal')

# for text in ax.texts:
#     text.set_fontsize(8)
#     if ' ' in text.get_text():
#         text.set_color('grey')

# plt.ylabel('')
# plt.savefig('../DistributionOfGroupPatternsSize2.pdf', dpi=300, bbox_inches='tight')

In [70]:
# Patterns per group size as a bar chart. The counts are read positionally because the
# single column of `cluster_size` is named 'clusters' on pandas < 2.0 and 'count' later.
df_barchart = pd.DataFrame({r'\#Clusters':cluster_size.iloc[:, 0]},
                 index=cluster_size.index)

df_barchart.plot.bar(rot=0, legend=False, cmap='tab20c')
# plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
plt.xlabel(r'\textbf{\#Vessels}')
plt.ylabel(r'\textbf{\#Group Patterns}')
plt.suptitle(r'\textbf{Group Pattern (Flocks) Size Distribution}', y=1)

# plt.savefig('../DistributionOfGroupPatternsSize3.pdf', dpi=300, bbox_inches='tight')
plt.savefig('../DistributionOfGroupPatternsSize3_NoSciNotation.pdf', dpi=300, bbox_inches='tight')

# **#3** Avg Velocity per Group Pattern (Flocks)

In [17]:
# Turn the stringified tuples into lists of integer MMSIs. Values that are no longer
# strings are kept (as lists), so re-running this cell does not fail on parsed data.
df['clusters'] = df.clusters.apply(lambda c: tuple_str_to_list(c) if isinstance(c, str) else list(c))

In [19]:
df.head()

Unnamed: 0,clusters,st,et,dur
0,"[227574020, 227592820, 227612860, 227635210, 2...",2015-10-01 06:21:00,2015-10-01 06:43:00,23
1,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 07:41:00,2015-10-01 07:51:00,11
2,"[227574020, 227612860, 227635210, 227705102, 2...",2015-10-01 07:41:00,2015-10-01 07:53:00,13
3,"[227005550, 227574020, 227592820, 227612860, 2...",2015-10-01 07:28:00,2015-10-01 07:57:00,30
4,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 08:08:00,2015-10-01 08:22:00,15


In [36]:
from tqdm import tqdm_notebook
import multiprocessing

def get_gp_avg_velocity(x, conn):
    """Return the mean of the per-vessel mean velocities of one group pattern.

    Parameters
    ----------
    x : object with ``clusters``, ``st`` and ``et`` attributes
        The pattern; its values are interpolated into the module-level ``query``.
    conn : database connection
        Passed to ``pd.read_sql_query`` as ``con``.

    Returns
    -------
    The mean over vessels (``mmsi``) of each vessel's mean ``velocity``.
    """
    sql = query % (tuple(x.clusters), x.st, x.et)
    records = pd.read_sql_query(sql, con=conn)
    # Average within each vessel first, so vessels with more records do not weigh more.
    per_vessel = records.groupby('mmsi', group_keys=False).apply(lambda vessel: vessel.velocity.mean())
    return per_vessel.mean()

def get_gp_avg_velocity_parallel(y):
    """Fill the 'Avg. Speed' column of a DataFrame chunk, one query per row.

    Opens its own database connection (using the module-level ``db_name``,
    ``uname``, ``pw``, ``host`` and ``port``), calls ``get_gp_avg_velocity``
    for every row and stores the result under that row's index label.

    Parameters
    ----------
    y : pandas.DataFrame
        Chunk of patterns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object ``y``, with the 'Avg. Speed' column set.
    """
    print ('Connecting to Database...')
    con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)
    print ('Connected to Database!')
    try:
        for row in tqdm_notebook(y.itertuples(), total=len(y)):
            y.loc[row.Index, 'Avg. Speed'] = get_gp_avg_velocity(row, con)
    finally:
        # Release the connection even when a query fails part-way through the chunk.
        con.close()
    return y
        
def parallelize_dataframe(df_par, func, num_cores=None):
    """Apply ``func`` to row-wise chunks of ``df_par`` and concatenate the results.

    Parameters
    ----------
    df_par : pandas.DataFrame
        Frame to process. It is split into ``num_cores`` consecutive row chunks;
        each chunk keeps its original index labels.
    func : callable
        Takes a DataFrame chunk and returns a DataFrame. When more than one worker
        is used it is sent to worker processes, so it must be picklable.
    num_cores : int, optional
        Number of workers (and chunks). Defaults to all cores but one, so the
        machine stays responsive, and is never lower than 1.

    Returns
    -------
    pandas.DataFrame
        The chunk results concatenated in chunk order.
    """
    if num_cores is None:
        num_cores = multiprocessing.cpu_count() - 1  # leave one free to not freeze machine
    # A single-core machine would otherwise request a pool of zero workers.
    num_cores = max(1, int(num_cores))
    num_partitions = num_cores  # number of partitions to split dataframe

    # Split the *argument* (not a notebook-level frame) by row position.
    row_groups = np.array_split(np.arange(len(df_par)), num_partitions)
    df_split = [df_par.iloc[rows] for rows in row_groups]

    if num_cores == 1:
        # One worker gives no parallelism; run inline and skip the process overhead.
        return pd.concat([func(chunk) for chunk in df_split])

    pool = multiprocessing.Pool(num_cores)
    try:
        df_res = pd.concat(pool.map(func, df_split))
    finally:
        # Shut the workers down even if a chunk raised.
        pool.close()
        pool.join()
    return df_res

In [77]:
df2 = parallelize_dataframe(df, get_gp_avg_velocity_parallel)

Connecting to Database...
Connecting to Database...
Connecting to Database...
Connecting to Database...
Connecting to Database...
Connecting to Database...
Connecting to Database...
Connected to Database!
Connected to Database!
Connected to Database!
Connected to Database!
Connected to Database!
Connected to Database!
Connected to Database!


HBox(children=(IntProgress(value=0, max=1845), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1844), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1845), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1845), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1844), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1844), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1844), HTML(value='')))










In [79]:
df2['Avg. Speed'].describe()

count    12911.000000
mean         2.203140
std          2.053340
min          0.000896
25%          0.660857
50%          1.544365
75%          3.297005
max         13.363073
Name: Avg. Speed, dtype: float64

In [81]:
# Bin the average speeds. include_lowest=True closes the first bin on the left, so a
# pattern whose average speed is exactly 0 is counted instead of falling outside
# the right-closed interval (0, 2].
# NOTE(review): the edges must be increasing, i.e. this assumes the maximum exceeds 8.
out = pd.cut(df2['Avg. Speed'], [0, 2, 4, 8, df2['Avg. Speed'].max()], include_lowest=True)

ax = out.value_counts(sort=False).plot.bar(figsize=(PLT_IMAGE_WIDTH, PLT_IMAGE_HEIGHT), fontsize=8, width=0.75, cmap='tab20', rot=35)

plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
plt.suptitle(r'\textbf{Average Group Pattern (Flocks) Speed}', fontsize=8, y=1)
plt.xlabel(r'\textbf{Speed (knots)}', fontsize=8)
plt.ylabel(r'\textbf{\#Patterns}', fontsize=8)

plt.savefig(os.path.join('..', 'AvgGroupPatternVelocity_Flocks.pdf'), dpi=300, bbox_inches='tight')
# plt.savefig(os.path.join('..', 'AvgGroupPatternVelocity_Flocks_NoSciNotation.pdf'), dpi=300, bbox_inches='tight')

# Saving Results

In [78]:
df2.to_csv('./data/csv/stats/flocks_card_5_dt_10_dist_2778_WITH_AVERAGE_SPEED.csv')

# **#3** Total Distance Travelled per Group Pattern (Flocks)

In [37]:
def get_gp_travelled_distance(x, conn):
    """Sum the haversine distances between consecutive centre points of one group pattern.

    The pattern's records are fetched with the module-level ``query`` (filled with
    ``x.clusters``, ``x.st`` and ``x.et``). The mean ``lon``/``lat`` of the records
    is taken per ``datetime`` value, ``gspp.haversine`` is evaluated on every centre
    point and its successor, and the results are added up with ``np.nansum``.

    NOTE(review): the unit of the result is whatever ``gspp.haversine`` returns — confirm.
    """
    gp = pd.read_sql_query(query%(tuple(x.clusters), x.st, x.et), con=conn)
    # One centre point (mean lon, mean lat) per distinct timestamp.
    center_point = gp.groupby('datetime', group_keys=False, as_index=False).apply(lambda df: pd.Series({'mean_lon': df.lon.mean(), 'mean_lat': df.lat.mean()}))
    
    current_loc = center_point[['mean_lon', 'mean_lat']]
    # shift(-1) pairs every centre point with the next one; the last row is paired with NaNs.
    next_loc = center_point[['mean_lon', 'mean_lat']].shift(-1)
    # nansum ignores NaN terms — presumably what the final (point, NaN) pair yields; verify against gspp.haversine.
    return np.nansum([gspp.haversine(tuple(curr_loc), tuple(nxt_loc))  for curr_loc, nxt_loc in zip(current_loc.values.tolist(), next_loc.values.tolist())])
#     center_point.loc[:, 'distance'] = center_point[['mean_lon', 'mean_lat']].rolling(2, axis=1).apply(lambda x : gspp.haversine(tuple(x[0,:].flatten().tolist()), tuple(x[1,:].flatten().tolist()))*0.539956803).cumsum()
                                                                  

def get_gp_travelled_distance_parallel(y):
    """Fill the 'distance' column of a DataFrame chunk, one query per row.

    Opens its own database connection (using the module-level ``db_name``,
    ``uname``, ``pw``, ``host`` and ``port``), calls ``get_gp_travelled_distance``
    for every row and stores the result under that row's index label.

    Parameters
    ----------
    y : pandas.DataFrame
        Chunk of patterns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object ``y``, with the 'distance' column set.
    """
    print ('Connecting to Database...')
    con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)
    print ('Connected to Database!')
    try:
        for row in tqdm_notebook(y.itertuples(), total=len(y)):
            y.loc[row.Index, 'distance'] = get_gp_travelled_distance(row, con)
    finally:
        # Release the connection even when a query fails part-way through the chunk.
        con.close()
    return y

In [87]:
df2 = parallelize_dataframe(df, get_gp_travelled_distance_parallel)   

Connecting to Database...
Connecting to Database...
Connecting to Database...
Connecting to Database...
Connecting to Database...
Connecting to Database...
Connecting to Database...
Connected to Database!
Connected to Database!
Connected to Database!
Connected to Database!
Connected to Database!
Connected to Database!
Connected to Database!


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))










# TESTING **#4** Trip Contribution per Group Pattern (Flocks)

In [38]:
def classify_trips(df):
    """Classify a trip by the ``label`` values of its first and last rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Records of one trip, with a ``label`` column.

    Returns
    -------
    pandas.Series
        A one-element Series indexed by ``'class'`` holding:
        1 for (-1, -1), 2 for (-1, 0), 3 for (0, -1), 4 for (0, 0),
        and 0 for any other (first, last) combination.
    """
    first_label = df['label'].iloc[0]
    last_label = df['label'].iloc[-1]

    # (first label, last label, class code), checked in this order.
    endpoint_classes = (
        (-1, -1, 1),
        (-1, 0, 2),
        (0, -1, 3),
        (0, 0, 4),
    )

    trip_class = 0
    for start, end, code in endpoint_classes:
        if first_label == start and last_label == end:
            trip_class = code
            break

    return pd.Series([trip_class], index=['class'])

In [98]:
from tqdm import tqdm_notebook as tqdm
csv_dir = 'data/csv/nari_dynamic_min_trip_card_3_no_resampling_correcred_bug_V2'
# csv_dir = 'test_data/nari_dynamic_min_trip_card_3_no_resampling_correcred_bug_V2'
# Classify every (mmsi, trip_id) group of every file in the directory.
df_trips = []
# os.listdir() returns entries in arbitrary order; sorting keeps the order of the
# collected frames (and of anything concatenated from them) reproducible.
for file in tqdm(sorted(os.listdir(csv_dir))):
    df_dynamic_trips = pd.read_csv(os.path.join(csv_dir, file))
    df_trip = df_dynamic_trips.groupby(['mmsi', 'trip_id'], group_keys=False).apply(classify_trips).reset_index()
    df_trips.append(df_trip)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

In [101]:
# Number of distinct vessels among the classified trips. Built from `df_trips` directly:
# CLASSIFIED_TRIPS is only assigned in a later cell, so referring to it here fails
# when the notebook is run top to bottom.
len(pd.concat(df_trips).mmsi.unique())

3279

In [99]:
CLASSIFIED_TRIPS = pd.concat(df_trips)

In [100]:
CLASSIFIED_TRIPS.head(52)

Unnamed: 0,mmsi,trip_id,class
0,205204000,0,3
1,205204000,1,2
2,205204000,2,4
3,205204000,3,4
4,205204000,4,3
5,205204000,5,2
6,205204000,6,3
7,205204000,7,2
8,205204000,8,4
9,205204000,9,4


# **TESTING**: Getting the fisheries, then filtering by the cardinality threshold

In [None]:
min_cardinality = 3

flocks_of_fisheries_tst = df.copy()
# Keep, for every pattern, only the members found in `mmsis_fisheries`.
# `clusters` is parsed once per row, and only if it is still a string: an earlier
# cell replaces the column with parsed lists, which tuple_str_to_list cannot handle.
flocks_of_fisheries_tst['clusters'] = (
    flocks_of_fisheries_tst.clusters
    .apply(lambda c: np.asarray(tuple_str_to_list(c) if isinstance(c, str) else c))
    .apply(lambda members: members[np.isin(members, mmsis_fisheries)])
)
# NOTE(review): '>' drops patterns with exactly `min_cardinality` members — confirm
# whether the threshold is meant to be inclusive ('>=').
flocks_of_fisheries_tst = flocks_of_fisheries_tst.loc[flocks_of_fisheries_tst.clusters.apply(len) > min_cardinality]

In [None]:
len(flocks_of_fisheries_tst)

# **TESTING**: Statistics #3

In [None]:
from tqdm import tqdm_notebook

def get_gp_avg_velocity_single(x):
    """Return the mean of the per-vessel mean velocities of one group pattern.

    Unlike a shared-connection variant, this opens and closes its own database
    connection on every call (using the module-level ``db_name``, ``uname``,
    ``pw``, ``host`` and ``port``).

    Parameters
    ----------
    x : object with ``clusters``, ``st`` and ``et`` attributes
        The pattern; its values are interpolated into the module-level ``query``.

    Returns
    -------
    The mean over vessels (``mmsi``) of each vessel's mean ``velocity``.
    """
    con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)
    try:
        gp = pd.read_sql_query(query%(tuple(x.clusters), x.st, x.et), con=con)
    finally:
        # Close the connection even if the query raises.
        con.close()
    return gp.groupby('mmsi', group_keys=False).apply(lambda vessel: vessel.velocity.mean()).mean()

# Trial run on the first ten patterns only.
# - The progress total now matches the ten rows iterated (it was len(df)).
# - Results are written under each row's own index label (row.Index); the enumerate
#   counter only matches the labels when the index is exactly 0..n-1.
for row in tqdm_notebook(df.iloc[0:10].itertuples(), total=len(df.iloc[0:10])):
    df.loc[row.Index, 'Avg. Speed'] = get_gp_avg_velocity_single(row)