In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
from random import choice
pd.options.mode.chained_assignment = None 
import configparser
import pickle
import psycopg2
import psycopg2.extras
import contextily as ctx
import multiprocessing as mp

In [2]:
import os, sys

# Make the local checkout of the project importable (expected under ~/Documents).
# Guarded so that re-running the cell does not keep appending the same entry.
LONELYBOY_PATH = os.path.join(os.path.expanduser('~'), 'Documents/Insert-Generic-Name-Here')
if LONELYBOY_PATH not in sys.path:
    sys.path.append(LONELYBOY_PATH)
# sys.path

from lonelyboy.geospatial import plots as gsplt
from lonelyboy.geospatial import preprocessing as gspp
from lonelyboy.timeseries import lbtimeseries as tspp
# from lonelyboy.geospatial import group_patterns as gsgp


# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
# NOTE(review): presumably imported so that a missing Qt binding fails here
# rather than when the 'qt' backend is selected below -- confirm.
import PyQt5
import matplotlib.pyplot as plt
from matplotlib import style

# Apply the plotting style once (the original applied it twice in a row).
style.use('ggplot')

# `InteractiveShell.magic()` is deprecated; `run_line_magic` is the supported
# way to invoke a line magic programmatically.
get_ipython().run_line_magic('matplotlib', 'qt')
# get_ipython().run_line_magic('matplotlib', 'inline')

In [3]:
# Default figure geometry: the height is the width divided by 1.618
# (approximately the golden ratio).
PLT_IMAGE_WIDTH = 3.748
PLT_IMAGE_HEIGHT = PLT_IMAGE_WIDTH/1.618

# Global matplotlib defaults: LaTeX text rendering, small sans-serif font and
# the figure size defined above.
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'sans-serif',
    'font.size': 8,
    'figure.figsize': (PLT_IMAGE_WIDTH, PLT_IMAGE_HEIGHT),
})

# Reading the Vessels' MMSIs

In [4]:
# Position of the entry to take from the pickled MMSI collection
# (presumably one entry per vessel cluster -- confirm).
CLUSTER_ID = 0

# NOTE: unpickling can execute arbitrary code; only load locally produced, trusted files.
mmsi_list = pd.read_pickle(os.path.join('.', 'data/pkl/mmsi_list.pckl'))[CLUSTER_ID]
# Number of MMSIs fetched per batch. Clamped to at least 1: on a single-core
# machine `cpu_count() - 1` is 0, which is not a valid `range` step.
mmsi_list_window_size = max(1, mp.cpu_count()-1)

In [5]:
# Read the database connection settings from the local ini file.
sql_config_path = os.path.join('.','sql_server.ini')
properties = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed; fail here with a clear message instead of an opaque
# KeyError on the section lookup below.
if not properties.read(sql_config_path):
    raise FileNotFoundError(f'Database settings file not found or unreadable: {sql_config_path}')
properties = properties['SERVER']

host    = properties['host']
db_name = properties['db_name']
uname   = properties['uname']
pw      = properties['pw']
port    = properties['port']

# The `%s` placeholder is filled with a tuple of MMSIs by the batch fetch loop.
query = "select * from ais_data.dynamic_ships_min_trip_card_3_segmented_12h_v2 where mmsi in %s;"
con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)

In [17]:
# Closes the connection opened by `psycopg2.connect` in the settings cell.
# Run this only after every query that uses `con` (the batch fetch loop) is done.
con.close() # close the jupyter connection

In [6]:
# Colormap used for plot colours; entries are picked by index, e.g. cmap(0).
cmap = plt.cm.tab20

In [6]:
from scipy.interpolate import interp1d

def resample_geospatial(df, features=['lat', 'lon'], rule='60S', method='linear', crs={'init': 'epsg:4326'}, drop_lon_lat=False):
    """Resample the records of ``df`` onto a regular time grid by interpolation.

    The epoch-second column ``ts`` is converted to a ``datetime`` column, every
    column in ``features`` is interpolated over time with
    ``scipy.interpolate.interp1d`` and evaluated on a grid of frequency
    ``rule``. The grid runs from the first to the last observation, both with
    their seconds set to 0; the grid's start point itself is excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ts`` and every column in ``features``. It is not modified.
    features : list-like of str
        Columns to interpolate. ``'lon'`` and ``'lat'`` must be among them,
        since they are used to build the point geometry.
    rule : str
        pandas frequency string of the output grid.
    method : str
        Forwarded to ``interp1d`` as ``kind``.
    crs : dict
        Coordinate reference system passed to ``geopandas.GeoDataFrame``.
    drop_lon_lat : bool
        If True, the ``lat`` and ``lon`` columns are dropped from the result.

    Returns
    -------
    geopandas.GeoDataFrame
        The interpolated ``features`` plus ``datetime`` and ``geom`` columns.
        When ``df`` has fewer than two rows, an empty slice of ``df`` (including
        the ``datetime`` column) is returned instead.
    """
    # Work on a new frame so the caller's DataFrame does not silently gain a column.
    df = df.assign(datetime=pd.to_datetime(df['ts'], unit='s'))

    # scipy interpolate needs at least 2 records
    if len(df) <= 1:
        return df.iloc[0:0]

    # Express both the sample times and the query grid as integer nanoseconds,
    # so the interpolator is evaluated in the same unit it was built with
    # (instead of relying on an implicit datetime -> float cast).
    sample_times = df['datetime'].values.astype('datetime64[ns]').astype(np.int64)
    interpolate = interp1d(sample_times, df[features].values, kind=method, axis=0)

    grid_start = df['datetime'].min().replace(second=0)
    grid_end = df['datetime'].max().replace(second=0)
    # Right-closed grid (start excluded, end included). Filtering the start out
    # by hand avoids `date_range(closed=...)`, which newer pandas removed in
    # favour of `inclusive=...`, and therefore works on both.
    grid = pd.date_range(start=grid_start, end=grid_end, freq=rule)
    grid = grid[grid > grid_start]
    grid_times = grid.values.astype('datetime64[ns]').astype(np.int64)

    df_RESAMPLED = pd.DataFrame(interpolate(grid_times), columns=features)
    df_RESAMPLED['datetime'] = grid

    if len(df_RESAMPLED) == 0:
        # No grid point falls inside the observed interval: keep the schema only.
        df_RESAMPLED.insert(len(df_RESAMPLED.columns), 'geom', '')
    else:
        # Label-based access; positional `x[0]` on a labelled Series is deprecated.
        df_RESAMPLED['geom'] = [
            Point(lon, lat)
            for lon, lat in zip(df_RESAMPLED['lon'], df_RESAMPLED['lat'])
        ]

    #drop lat and lon if u like
    if drop_lon_lat:
        df_RESAMPLED = df_RESAMPLED.drop(['lat', 'lon'], axis=1)

    return gpd.GeoDataFrame(df_RESAMPLED, crs=crs, geometry='geom')

In [7]:
from tqdm import tqdm_notebook
from functools import partial

def __parallelize_resampling(x, features=['lat', 'lon'], rule='60S', method='linear', crs={'init': 'epsg:4326'}, drop_lon_lat=False):
    """Resample each ``trip_id`` group of ``x`` and return the combined result.

    Progress messages are printed using the first MMSI found in ``x``. Every
    keyword argument is forwarded unchanged to ``gspp.resample_geospatial``.

    NOTE(review): this calls the library implementation
    ``gspp.resample_geospatial``, not the notebook-level function of the same
    name -- confirm which one is intended.
    """
    vessel_id = x.mmsi.unique()[0]
    print (f'Resampling for mmsi:{vessel_id}')

    resample_kwargs = dict(features=features, rule=rule, method=method, crs=crs, drop_lon_lat=drop_lon_lat)
    trips = x.groupby(['trip_id'], group_keys=False, as_index=False)
    y = trips.apply(gspp.resample_geospatial, **resample_kwargs)

    print (f'Resampling for mmsi:{vessel_id} Complete')
    return y
        
def parallelize_resampling(df, features=['lat', 'lon'], rule='60S', method='linear', crs={'init': 'epsg:4326'}, drop_lon_lat=False):
    """Resample ``df`` in parallel, one worker task per distinct ``mmsi``.

    ``df`` is split by ``mmsi``; each part is handed to
    ``__parallelize_resampling`` in a process pool and the partial results are
    concatenated in the order the MMSIs first appear.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to resample; must contain an ``mmsi`` column.
    features, rule, method, crs, drop_lon_lat
        Forwarded unchanged to ``__parallelize_resampling``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-vessel results. For an empty ``df`` an empty
        slice of ``df`` is returned without starting any worker.
    """
    # Nothing to do; without this guard `pd.concat([])` would raise.
    if len(df) == 0:
        return df.iloc[0:0]

    # Leave one core free to not freeze the machine, but never ask for zero
    # workers (`mp.Pool(0)` raises on a single-core machine).
    num_cores = max(1, mp.cpu_count()-1)
    df_split = [df.loc[df.mmsi==mmsi] for mmsi in df.mmsi.unique()]

    func = partial(__parallelize_resampling, features=features, rule=rule, method=method, crs=crs, drop_lon_lat=drop_lon_lat)

    # The context manager shuts the workers down even when a task raises, so a
    # failed run does not leave orphaned processes behind.
    with mp.Pool(num_cores) as pool:
        res = pd.concat(pool.map(func, df_split))
    print (f'Resampling Complete!')
    return res

In [None]:
# The `con.close()` cell sits above this one, so in top-to-bottom execution the
# connection is already closed here; reopen it if needed.
if con.closed:
    con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)

for i in range(0, len(mmsi_list), mmsi_list_window_size):
    print ('Fetching Data....')
    mmsis = mmsi_list[i:i+mmsi_list_window_size]
    # Let the driver bind the MMSI tuple to the query's `%s` placeholder instead
    # of formatting it into the SQL text: a one-element batch would otherwise
    # render as `(123,)`, which is not valid SQL. numpy scalars are unwrapped
    # first because psycopg2 does not adapt them by default.
    mmsi_params = tuple(m.item() if hasattr(m, 'item') else m for m in mmsis)
    dfTmp = pd.read_sql_query(query, con=con, params=(mmsi_params,))

    # NOTE(review): dfTmp and df2 are overwritten on every pass, so only the
    # last processed batch is available afterwards -- confirm this is intended.
    df2 = parallelize_resampling(dfTmp, features=dfTmp.columns, rule=f'{1*60}S')
    # NOTE(review): looks like a debugging stop; it only triggers when the
    # batch size divides 14, which depends on the machine's core count.
    if (i==14):
        break

In [9]:
# Preview only the first rows instead of rendering the whole frame.
dfTmp.head(10)

Unnamed: 0,id,mmsi,turn,speed,velocity,course,heading,lon,lat,ts,traj_id,trip_id,port_label
0,7131782,227630560,0.0,5.8,5.414451,8.5,511,-4.767583,48.057550,1445168014,0,0,0
1,7131784,227630560,0.0,5.7,5.703285,6.3,511,-4.766298,48.064980,1445168314,0,0,0
2,7131785,227630560,0.0,5.7,6.820141,7.8,511,-4.763453,48.080635,1445168915,0,0,0
3,7131786,227630560,0.0,7.0,6.154162,7.0,511,-4.759860,48.099255,1445169514,0,0,0
4,7131787,227630560,0.0,3.5,3.559788,16.2,511,-4.740038,48.155860,1445171614,0,0,0
5,7131789,227630560,0.0,4.1,4.426059,5.8,511,-4.738837,48.160686,1445171915,0,0,0
6,7131790,227630560,0.0,4.1,4.053049,7.8,511,-4.738730,48.161293,1445171945,0,0,0
7,7131791,227630560,0.0,4.0,3.851216,0.2,511,-4.738472,48.163517,1445172064,0,0,0
8,7131792,227630560,0.0,3.4,3.156495,351.1,511,-4.739541,48.170967,1445172485,0,0,0
9,7131794,227630560,0.0,2.8,3.011629,350.2,511,-4.739618,48.171370,1445172513,0,0,0


# A possible fix for the continuous (non-integer) label values in the resampled data: round them back to whole numbers (TODO: discuss it with ytheod)

In [14]:
# Distinct port labels once the resampled values are rounded to whole numbers.
np.unique(np.round(df2['port_label'].to_numpy()))

array([-1.,  0.])

In [17]:
# Preview only the first rows of the resampled frame instead of rendering all of it.
df2.head(10)

Unnamed: 0,id,mmsi,turn,speed,velocity,course,heading,lon,lat,ts,traj_id,trip_id,port_label,datetime,geom
0,1.436030e+06,226084000.0,-127.0,9.178571,12.228184,231.264286,511.0,-4.820873,48.343144,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:41:00,POINT (-4.820873214285714 48.34314435714286)
1,1.436031e+06,226084000.0,-127.0,9.157143,12.253590,232.828571,511.0,-4.823921,48.341662,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:42:00,POINT (-4.823921428571428 48.34166171428571)
2,1.436031e+06,226084000.0,-127.0,9.135714,12.278995,234.392857,511.0,-4.826970,48.340179,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:43:00,POINT (-4.826969642857143 48.34017907142857)
3,1.436032e+06,226084000.0,-127.0,9.114286,12.304401,235.957143,511.0,-4.830018,48.338696,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:44:00,POINT (-4.830017857142857 48.33869642857143)
4,1.436033e+06,226084000.0,-127.0,9.050000,12.192342,237.700000,511.0,-4.833096,48.337254,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:45:00,POINT (-4.8330965 48.33725399999999)
5,1.436037e+06,226084000.0,-127.0,8.804878,12.087893,235.180488,511.0,-4.836182,48.335888,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:46:00,POINT (-4.836182024390244 48.33588780487805)
6,1.436039e+06,226084000.0,-127.0,8.800000,11.748552,231.765217,511.0,-4.839215,48.334446,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:47:00,POINT (-4.83921463768116 48.33444643478261)
7,1.436048e+06,226084000.0,-127.0,8.895238,12.531323,235.390476,511.0,-4.842078,48.332955,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:48:00,POINT (-4.842078014285714 48.33295495238095)
8,1.436048e+06,226084000.0,-127.0,8.800562,12.574911,239.478652,511.0,-4.845425,48.331985,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:49:00,POINT (-4.845424867977528 48.33198533707865)
9,1.436049e+06,226084000.0,-127.0,8.699438,12.585806,243.321348,511.0,-4.848779,48.331025,1.443833e+09,0.0,0.0,0.0,2015-10-03 00:50:00,POINT (-4.848778632022472 48.33102466292135)


In [None]:
# Plot the resampled points of one vessel / trip in the first palette colour.
gsplt.map_plot(
    df2.loc[df2.mmsi.eq(228037600) & df2.trip_id.eq(7)],
    color=[cmap(0)],
)

In [None]:
# Same vessel / trip as the cell above, taken from the frame fetched from the database.
# `df` is not defined anywhere in this notebook, so the original line raised a
# NameError on a fresh kernel. NOTE(review): `dfTmp` is assumed to be the
# intended source -- confirm.
tmp = dfTmp.loc[(dfTmp.mmsi==228037600) & (dfTmp.trip_id == 7)]
gsplt.map_plot(gspp.gdf_from_df(tmp), color=[cmap(0)])