In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
from random import choice
pd.options.mode.chained_assignment = None 
import configparser
import pickle
import psycopg2
import psycopg2.extras
import contextily as ctx

In [2]:
import os, sys
sys.path.append(os.path.join(os.path.expanduser('~'), 'Documents/Coding/Python'))
# sys.path

from lonelyboy.geospatial import plots as gsplt
from lonelyboy.geospatial import preprocessing as gspp
from lonelyboy.timeseries import lbtimeseries as tspp
# from lonelyboy.geospatial import group_patterns as gsgp


# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
import PyQt5
import matplotlib.pyplot as plt
from matplotlib import style

# Apply the ggplot theme once; the original cell called style.use('ggplot') twice.
style.use('ggplot')

# Select the Qt backend for figures. run_line_magic is the supported
# replacement for the deprecated InteractiveShell.magic entry point.
get_ipython().run_line_magic('matplotlib', 'qt')
# get_ipython().run_line_magic('matplotlib', 'inline')

In [3]:
# Default figure size: fixed width, height derived from a 1.618 width/height ratio.
PLT_IMAGE_WIDTH = 3.748
PLT_IMAGE_HEIGHT = PLT_IMAGE_WIDTH / 1.618

# Text rendering, font and default figure size, written as direct rcParams entries.
plt.rcParams['text.usetex'] = True
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.size'] = 8
plt.rcParams['figure.figsize'] = (PLT_IMAGE_WIDTH, PLT_IMAGE_HEIGHT)

In [4]:
def tuple_str_to_tuple(_str_):
    """Parse the text form of an integer sequence, e.g. ``'(1, 2, 3)'``, into a tuple.

    Surrounding whitespace is ignored, the first and last remaining characters
    (the brackets) are dropped, and the rest is split on commas. Blank items
    are skipped, so ``'()'`` and the one-element form ``'(7,)'`` parse instead
    of raising ``ValueError``.

    Parameters
    ----------
    _str_ : str
        Bracketed, comma-separated integers.

    Returns
    -------
    tuple of int

    Raises
    ------
    ValueError
        If a non-blank item is not a valid integer literal.
    """
    inner = _str_.strip()[1:-1]
    return tuple(int(item) for item in inner.split(',') if item.strip())

def tuple_str_to_list(_str_):
    """Parse the text form of an integer sequence, e.g. ``'(1, 2, 3)'``, into a list.

    Surrounding whitespace is ignored, the first and last remaining characters
    (the brackets) are dropped, and the rest is split on commas. Blank items
    are skipped, so ``'()'`` and the one-element form ``'(7,)'`` parse instead
    of raising ``ValueError``.

    Parameters
    ----------
    _str_ : str
        Bracketed, comma-separated integers.

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If a non-blank item is not a valid integer literal.
    """
    inner = _str_.strip()[1:-1]
    return [int(item) for item in inner.split(',') if item.strip()]

# Reading Data (mode: Flocks, card:5, dt:10, dist:2778)

In [5]:
# Load the flocks table, report its row count, then preview the first rows.
df = pd.read_csv('./data/csv/flocks_card_5_dt_10_dist_2778.csv')
print(df.shape[0])
df.head()

12911


Unnamed: 0,clusters,st,et,dur
0,"(227574020, 227592820, 227612860, 227635210, 2...",2015-10-01 06:21:00,2015-10-01 06:43:00,23
1,"(227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 07:41:00,2015-10-01 07:51:00,11
2,"(227574020, 227612860, 227635210, 227705102, 2...",2015-10-01 07:41:00,2015-10-01 07:53:00,13
3,"(227005550, 227574020, 227592820, 227612860, 2...",2015-10-01 07:28:00,2015-10-01 07:57:00,30
4,"(227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 08:08:00,2015-10-01 08:22:00,15


In [6]:
# Turn each textual cluster into a list of MMSIs. Values that are no longer
# strings are passed through as lists, so re-running this cell is harmless;
# the original re-parsed its own output and failed on a second run.
df['clusters'] = df['clusters'].apply(
    lambda members: tuple_str_to_list(members) if isinstance(members, str) else list(members)
)
df.head()

Unnamed: 0,clusters,st,et,dur
0,"[227574020, 227592820, 227612860, 227635210, 2...",2015-10-01 06:21:00,2015-10-01 06:43:00,23
1,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 07:41:00,2015-10-01 07:51:00,11
2,"[227574020, 227612860, 227635210, 227705102, 2...",2015-10-01 07:41:00,2015-10-01 07:53:00,13
3,"[227005550, 227574020, 227592820, 227612860, 2...",2015-10-01 07:28:00,2015-10-01 07:57:00,30
4,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 08:08:00,2015-10-01 08:22:00,15


# **#4** Trip Contribution per Group Pattern (Flocks)

In [7]:
%%time
properties = configparser.ConfigParser()
properties.read(os.path.join('.','sql_server.ini'))
properties = properties['SERVER']

host    = properties['host']
db_name = properties['db_name']
uname   = properties['uname']
pw      = properties['pw']
port    = properties['port']

con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)
query = 'SELECT * FROM ais_data.dynamic_ships_min_trip_card_3_segmented_12h_resampled_1min_v2 WHERE mmsi IN %s AND datetime BETWEEN \'%s\' AND \'%s\';'

Wall time: 165 ms


In [8]:
def classify_trips(df):
    """Assign a class code to a trip from the ``label`` of its first and last rows.

    Codes, keyed by (first label, last label):
    (-1, -1) -> 1, (-1, 0) -> 2, (0, -1) -> 3, (0, 0) -> 4.
    Any other combination yields 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one trip; must contain a ``label`` column and at least one row.

    Returns
    -------
    pandas.Series
        Single element indexed by ``'class'``.
    """
    first = df['label'].iloc[0]
    last = df['label'].iloc[-1]

    # Checked in the same order as the original if/elif chain, using == so the
    # comparison semantics are unchanged.
    outcomes = (((-1, -1), 1), ((-1, 0), 2), ((0, -1), 3), ((0, 0), 4))
    trip_class = next(
        (code for (start, end), code in outcomes if first == start and last == end),
        0,
    )
    return pd.Series([trip_class], index=['class'])

In [9]:
from tqdm import tqdm

csv_dir = 'data/csv/nari_dynamic_min_trip_card_3_no_resampling_correcred_bug_V2'
# csv_dir = 'test_data/nari_dynamic_min_trip_card_3_no_resampling_correcred_bug_V2'

# os.listdir() returns names in arbitrary order and also lists hidden entries
# and sub-directories; keep regular, non-hidden files in sorted order so the
# run is reproducible and read_csv only sees data files.
csv_files = sorted(
    name for name in os.listdir(csv_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(csv_dir, name))
)

# One frame per input file, each with columns mmsi, trip_id, class.
df_trips = []
for file in tqdm(csv_files):
    df_dynamic_trips = pd.read_csv(os.path.join(csv_dir, file))
    df_trip = df_dynamic_trips.groupby(['mmsi', 'trip_id'], group_keys=False).apply(classify_trips).reset_index()
    df_trips.append(df_trip)

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:30<00:00,  5.34s/it]


In [10]:
# Stack the per-file results. ignore_index=True gives the combined table a
# unique 0..n-1 index; without it every file's 0-based labels repeat.
CLASSIFIED_TRIPS = pd.concat(df_trips, ignore_index=True)

In [11]:
# Bounded preview instead of displaying the whole frame.
df.head(10)

Unnamed: 0,clusters,st,et,dur
0,"[227574020, 227592820, 227612860, 227635210, 2...",2015-10-01 06:21:00,2015-10-01 06:43:00,23
1,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 07:41:00,2015-10-01 07:51:00,11
2,"[227574020, 227612860, 227635210, 227705102, 2...",2015-10-01 07:41:00,2015-10-01 07:53:00,13
3,"[227005550, 227574020, 227592820, 227612860, 2...",2015-10-01 07:28:00,2015-10-01 07:57:00,30
4,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 08:08:00,2015-10-01 08:22:00,15
5,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 08:43:00,2015-10-01 08:53:00,11
6,"[227574020, 227612860, 227635210, 227705102, 2...",2015-10-01 08:08:00,2015-10-01 08:53:00,46
7,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 07:28:00,2015-10-01 09:11:00,104
8,"[227005550, 227519920, 227574020, 227612860, 2...",2015-10-01 09:11:00,2015-10-01 09:24:00,14
9,"[227005550, 227574020, 227612860, 227635210, 2...",2015-10-01 10:15:00,2015-10-01 10:26:00,12


In [12]:
# First five classified trips (mmsi, trip_id, class).
CLASSIFIED_TRIPS.head(n=5)

Unnamed: 0,mmsi,trip_id,class
0,205688000,0,4
1,207138000,0,1
2,207138000,1,2
3,207138000,2,4
4,209366000,0,4


In [15]:
import multiprocessing
from tqdm import tqdm_notebook


def get_gp_trip_contributions_parallel(y):
    """Count, for every group pattern in ``y``, how many member trips fall in each trip class.

    For each row of ``y`` the module-level ``query`` is run for the row's
    ``clusters`` between ``st`` and ``et``. The distinct (mmsi, trip_id) pairs
    returned are matched against ``CLASSIFIED_TRIPS`` and every matching class
    in 1..4 is tallied into the columns ``C1``..``C4``. Other class values
    (e.g. 0) are ignored; the original used the class as a raw column
    position, so class 0 would have overwritten ``GP``.

    Parameters
    ----------
    y : pandas.DataFrame
        Rows with ``clusters``, ``st`` and ``et`` columns.

    Returns
    -------
    list of pandas.DataFrame
        One single-row frame (columns GP, C1, C2, C3, C4) per row of ``y``.

    Notes
    -----
    Reads the globals ``db_name``, ``uname``, ``pw``, ``host``, ``port``,
    ``query`` and ``CLASSIFIED_TRIPS``, and opens its own database connection,
    which is closed even when a query fails.
    """
    print ('Connecting to Database...')
    con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)
    print ('Connected to Database!')

    class_columns = {1: 'C1', 2: 'C2', 3: 'C3', 4: 'C4'}
    # Loop-invariant lookup table: one merge per row replaces a full scan of
    # CLASSIFIED_TRIPS for every single trip.
    classified = CLASSIFIED_TRIPS[['mmsi', 'trip_id', 'class']]

    df_stat4 = []
    try:
        for row in tqdm_notebook(y.itertuples(), total=len(y)):
            df_stat4_row = pd.DataFrame([{'GP':row.clusters, 'C1':0, 'C2':0, 'C3':0, 'C4':0}], columns=['GP', 'C1', 'C2', 'C3', 'C4'])

            # NOTE(review): the SQL text is built with %-formatting; a
            # one-element cluster renders as "(x,)", which is not valid SQL.
            row_dynamic = pd.read_sql_query(query%(tuple(row.clusters), row.st, row.et), con=con)

            if not row_dynamic.empty:
                member_trips = row_dynamic[['mmsi', 'trip_id']].drop_duplicates()
                member_classes = member_trips.merge(classified, on=['mmsi', 'trip_id'], how='inner')['class']

                # Series.items() replaces iteritems(), which pandas 2.0 removed.
                for trip_class, n_trips in member_classes.value_counts().items():
                    column = class_columns.get(trip_class)
                    if column is not None:
                        df_stat4_row.loc[0, column] = n_trips

            df_stat4.append(df_stat4_row)
    finally:
        con.close()
    return df_stat4


def parallelize_dataframe(df_par, func):
    """Apply ``func`` to row-wise partitions of ``df_par`` and concatenate the results.

    Parameters
    ----------
    df_par : pandas.DataFrame
        Frame to split into contiguous row partitions.
    func : callable
        Called once per partition. May return a DataFrame/Series or a
        list/tuple of them. It must be picklable whenever more than one
        partition is used, because it is then sent to worker processes.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of everything ``func`` returned, in partition order;
        an empty DataFrame if ``func`` returned nothing at all.
    """
    # Leave one core free so the machine stays responsive, but never drop to
    # zero workers on a single-core host.
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    # No more partitions than rows, so no worker starts up for an empty slice.
    num_partitions = max(1, min(num_cores, len(df_par)))

    # Split row positions rather than the frame itself; np.array_split on a
    # DataFrame is deprecated in recent pandas.
    row_groups = np.array_split(np.arange(len(df_par)), num_partitions)
    df_split = [df_par.iloc[rows] for rows in row_groups]

    if num_partitions == 1:
        # A single partition gains nothing from a pool; run it in-process.
        results = [func(df_split[0])]
    else:
        # The context manager releases the workers even if func raises.
        with multiprocessing.Pool(num_partitions) as pool:
            results = pool.map(func, df_split)

    # Accept both "one frame per partition" and "list of frames per partition".
    frames = []
    for result in results:
        if isinstance(result, (list, tuple)):
            frames.extend(result)
        else:
            frames.append(result)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# The worker defined above is get_gp_trip_contributions_parallel; the original
# call referenced the undefined name get_gp_trip_contributions.
parallelize_dataframe(df.iloc[0:10], get_gp_trip_contributions_parallel)

# **TESTING**: Getting the fisheries, then filtering by the cardinality threshold

In [None]:
min_cardinality = 3

# NOTE(review): mmsis_fisheries is not defined in the visible cells; it must
# exist before this cell runs.
flocks_of_fisheries_tst = df.copy()

# df.clusters already holds lists after the parsing cell, so only parse values
# that are still text; the original always re-parsed and failed on lists.
flocks_of_fisheries_tst['clusters'] = flocks_of_fisheries_tst['clusters'].apply(
    lambda members: np.asarray(
        tuple_str_to_list(members) if isinstance(members, str) else members
    )
)
# Keep only the members that appear in mmsis_fisheries.
flocks_of_fisheries_tst['clusters'] = flocks_of_fisheries_tst['clusters'].apply(
    lambda members: members[np.isin(members, mmsis_fisheries)]
)

# NOTE(review): the comparison is strict, so flocks with exactly
# min_cardinality members are dropped - confirm that is intended.
flocks_of_fisheries_tst = flocks_of_fisheries_tst.loc[flocks_of_fisheries_tst.clusters.apply(len) > min_cardinality]

In [None]:
# Number of flocks left after the fisheries/cardinality filter.
flocks_of_fisheries_tst.shape[0]

# **TESTING**: Statistics #3

In [None]:
from tqdm import tqdm_notebook

def get_gp_avg_velocity_single(x):
    """Return the mean, over vessels, of each vessel's mean ``velocity`` for one group pattern.

    Runs the module-level ``query`` for ``x.clusters`` between ``x.st`` and
    ``x.et`` on a fresh connection, averages ``velocity`` per ``mmsi``, then
    averages those per-vessel means.

    Parameters
    ----------
    x : object
        Row-like object with ``clusters``, ``st`` and ``et`` attributes
        (e.g. a row produced by ``DataFrame.itertuples``).

    Returns
    -------
    float
        Mean of the per-mmsi mean velocities; NaN when the query returns no rows.
    """
    con = psycopg2.connect(database=db_name, user=uname, password=pw, host=host, port=port)
    try:
        gp = pd.read_sql_query(query%(tuple(x.clusters), x.st, x.et), con=con)
    finally:
        # Close the connection even when the query raises.
        con.close()
    return gp.groupby('mmsi')['velocity'].mean().mean()

# Only the first ten group patterns are processed, so the progress total is
# capped to match. Results are written by the row's own index label
# (row.Index) rather than by enumerate position.
for row in tqdm_notebook(df.iloc[0:10].itertuples(), total=min(10, len(df))):
    df.loc[row.Index, 'Avg. Speed'] = get_gp_avg_velocity_single(row)

# **TESTING**: Statistics #4

In [None]:
df_stat4 = []

# NOTE(review): count_trips is not defined in the visible cells; it must exist
# before this cell runs.
for row in tqdm(df.itertuples()):
    df_stat4_row = pd.DataFrame([{'GP':row.clusters, 'C1':0, 'C2':0, 'C3':0, 'C4':0}], columns=['GP', 'C1', 'C2', 'C3', 'C4'])

    row_dynamic = pd.read_sql_query(query%(tuple(row.clusters), row.st, row.et), con=con)
    row_dynamic_trips = row_dynamic.groupby(['mmsi']).apply(count_trips).to_frame()

    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    # The counted value is used as the column position of the counter to fill.
    for trip_contr in row_dynamic_trips[0].value_counts().items():
        df_stat4_row.iloc[0, trip_contr[0]] = trip_contr[1]

    # Keep the computed row; the original built it and then discarded it.
    df_stat4.append(df_stat4_row)

    # Testing shortcut: stop after the first group pattern.
    if row.Index == 0:
        break