In [None]:
!pip install geopandas==0.12.1;
!pip install rtree;
!pip install pyarrow==10.0.0 s3fs folium

In [None]:
!pip install pywaffle

In [1]:
import os
os.getcwd()

'/home/onyxia/work/hackathon-un-2022/notebooks'

In [2]:
os.chdir("..")
import utils.functions as fc

In [3]:
from importlib import reload
fc = reload(fc)

In [4]:
ship_data_enriched = fc.create_ship_data_enriched()

In [6]:
path_parquet = "AIS/ais_azov_black_20190401_20190408_full_traces_before.parquet"

ais_df = fc.read_ais_parquet(path_parquet=path_parquet)

In [7]:
ais_df.shape

(6943856, 6)

In [8]:
import pandas as pd

pd.options.display.max_columns = 30
ais_df.head()

Unnamed: 0,mmsi,latitude,longitude,eeid,dt_insert_utc,destination
0,477850300,-12.806792,6.93,,2019-03-17 07:18:47,DILISJG
1,271045759,-33.905423,31.418842,,2019-03-17 08:42:39,
2,111111111,24.722957,118.976562,,2019-03-17 07:15:25,16
3,111111111,24.711802,119.009352,,2019-03-17 07:27:53,16
4,111111111,24.416592,118.05019,,2019-03-17 02:57:34,


In [9]:
ais_df.columns

Index(['mmsi', 'latitude', 'longitude', 'eeid', 'dt_insert_utc',
       'destination'],
      dtype='object')

In [10]:
import pandas as pd
import rtree
import geopandas as gpd
from shapely.geometry import Point, mapping, Polygon
import re
import folium
import numpy as np
from datetime import datetime
from pyarrow import fs
import pyarrow as pa
import pyarrow.parquet as pq

## Processing port data

In [None]:
df = pd.read_csv('https://msi.nga.mil/api/publications/download?type=view&key=16920959/SFH00000/UpdatedPub150.csv')

df = df[['World Port Index Number', 'Region Name', 'Main Port Name', 'World Water Body', 'Country Code', 'Latitude', 'Longitude']].rename(
    columns={
        'World Port Index Number': 'id',
        'Region Name': 'region_name',
        'Main Port Name': 'port_name',
        'World Water Body': 'water_body',
        'Country Code': 'country',
        'Latitude': 'lat',
        'Longitude':'long'
    }
)

In [None]:
df['coordinates'] = [Point(xy) for xy in zip(df.long, df.lat)] 

In [None]:
# Note cap_style: round = 1, flat = 2, square = 3
df['port_coverage'] = gpd.GeoSeries(df['coordinates']).buffer(0.04, cap_style = 3)

Creating buffer.

In [None]:
df['port_buffer'] = gpd.GeoSeries(df['coordinates']).buffer(0.08, cap_style = 3)

In [None]:
df1 = gpd.GeoDataFrame({'geometry': df['port_coverage'], 'df1': df['id']})
df2 = gpd.GeoDataFrame({'geometry': df['port_buffer'], 'df2': df['id']})
res_difference = df2.overlay(df1, how='difference')
df['port_buffer'] = res_difference['geometry']

Filtering ports from the Azov and Black seas: now we're not doing this anymore

In [None]:
# df_black_azov = df[df['water_body'].str.contains('Black', case=False)]
df_black_azov = df

In [None]:
df_black_azov.shape

There are 49 ports listed for the Azov and the Black seas. This list might not complete though ? See for example Marioupol in the Azov sea which does have a port ?

In [None]:
df_black_azov.port_buffer.iloc[0]

In [None]:
df_black_azov.port_coverage.iloc[0]

Plotting port coverages and buffers

In [None]:
m = folium.Map(location=[43.433333, 39.933333])

for i in range(0, len(df_black_azov)):
    # Plot port coordinates
    folium.Marker([df_black_azov.iloc[i]['lat'], df_black_azov.iloc[i]['long']],
                  popup=df_black_azov.iloc[i]['port_name']).add_to(m)

    # Plot port coverage
    coverage = gpd.GeoSeries(df_black_azov.iloc[i]['port_coverage']).simplify(tolerance=0.001)
    geo_j = coverage.to_json()
    geo_j = folium.GeoJson(data=geo_j,
                           style_function=lambda x: {'fillColor': 'orange'})
    geo_j.add_to(m)
    
    # Plot port buffer
    buffer = gpd.GeoSeries(df_black_azov.iloc[i]['port_buffer']).simplify(tolerance=0.001)
    buffer_geo_j = buffer.to_json()
    buffer_geo_j = folium.GeoJson(data=buffer_geo_j,
                                  style_function=lambda x: {'fillColor': 'purple'})
    buffer_geo_j.add_to(m)

In [None]:
m

Cleaning ports

In [None]:
df_black_azov

## Preparing data for the `get_ais` function

In [None]:
df_black_azov.shape

In [None]:
df_black_azov[~df_black_azov.id.duplicated()].shape

In [None]:
df_black_azov[['id', 'port_coverage']].shape

In [None]:
from geopandas import gpd 

ports = gpd.GeoDataFrame(df_black_azov[['id', 'port_coverage']]).rename(
    columns={'port_coverage': 'polygon'}
)
ports['buffer'] = 0
port_buffers = gpd.GeoDataFrame(df_black_azov[['id', 'port_buffer']]).rename(
    columns={'port_buffer': 'polygon'}
)
port_buffers['buffer'] = 1
polys = gpd.GeoDataFrame(pd.concat([ports, port_buffers], ignore_index=True),
                         geometry='polygon')

In [None]:
polys[polys.id.duplicated()]

In [None]:
def filter_ports(df):
    """
    In original df there must be a latitude and longitude
    
    Returns df.
    """
    df['coordinates'] = [Point(xy) for xy in zip(df.longitude, df.latitude)] 
    points = gpd.GeoDataFrame(df, geometry='coordinates')
    points_in_polys = gpd.sjoin(points, polys, op='within')
    return points_in_polys

In [None]:
filtered_df = filter_ports(ais_df)

In [None]:
filtered_df.shape

In [None]:
filtered_df['polygon'] = [str(a) + '_' + str(b) for a, b in zip(filtered_df['id'], filtered_df['buffer'])]

In [None]:
filtered_df.head()

## Assign route

In [None]:
filtered_df = filtered_df.sort_values(by=['mmsi', 'dt_pos_utc', 'dt_static_utc'])
filtered_df

In [None]:
filtered_df['d_dt_pos'] = filtered_df['dt_pos_utc'] - filtered_df['dt_pos_utc'].shift(-1)
filtered_df['d_dt_static'] = filtered_df['dt_static_utc'] - filtered_df['dt_static_utc'].shift(-1)

In [None]:
polygon_mapping = {
    polygon: index for polygon, index in zip(filtered_df.polygon.unique(), np.arange(filtered_df.polygon.nunique()))
}

In [None]:
filtered_df['polygon'] = filtered_df['polygon'].map(polygon_mapping)
filtered_df.head()

In [None]:
from datetime import timedelta
THRESHOLD = timedelta(
    days=0,
    seconds=0,
    microseconds=0,
    milliseconds=0,
    minutes=0,
    hours=12,
    weeks=0
)
THRESHOLD

In [None]:
def assign_routes_to_group(x):
    polygon_diff = x['polygon'].diff().ne(0)
    time_diff = x['d_dt_pos'] < -1 * THRESHOLD
    buffer = x['buffer']
    return (polygon_diff | (time_diff & buffer)).cumsum()

In [None]:
filtered_df['route'] =  filtered_df.groupby('mmsi').apply(
    assign_routes_to_group
).reset_index(level=0, drop=True)

Checking when two consecutive obs. with equal pos datetime

In [None]:
pd.options.display.max_columns = None

In [None]:
filtered_df[filtered_df['d_dt_pos'] == timedelta(
    days=0,
    seconds=0,
    microseconds=0,
    milliseconds=0,
    minutes=0,
    hours=0,
    weeks=0
)].sort_values(by=['mmsi', 'dt_pos_utc', 'dt_static_utc'])

Happens quite rarely : polygons intersecting ?

## RESET : enchaînement des destinations ? 

In [11]:
ais_df.shape

(6943856, 6)

In [12]:
ais_df = ais_df.sort_values(by=['mmsi', 'dt_insert_utc'])

In [13]:
destination_mapping = {
    destination: index for destination, index in zip(ais_df.destination.unique(), np.arange(ais_df.destination.nunique()))
}

In [14]:
ais_df['destination_index'] = ais_df['destination'].map(destination_mapping)
ais_df['destination_index'] = ais_df['destination_index'].fillna(-1)
ais_df['destination_index'] = ais_df['destination_index'].astype(int)


In [15]:
def assign_routes_to_group(x):
    return x['destination_index'].diff().ne(0).cumsum()

In [16]:
ais_df['route'] =  ais_df.groupby('mmsi').apply(
    assign_routes_to_group
).reset_index(level=0, drop=True)

In [17]:
routes_df = ais_df.groupby(['mmsi', 'route', 'destination']).agg(
    {
        'dt_insert_utc': ['first', 'last'], 
        'latitude': ['first', 'last'], 
        'longitude': ['first', 'last'],
        'route': 'count'
    }
)

In [18]:
routes_df.columns = routes_df.columns.map('_'.join).str.strip('_')

In [19]:
routes_df.reset_index()

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count
0,2060,2,VIARREGIO,2019-03-20 06:19:28,2019-03-20 06:19:28,41.725052,41.725052,41.728248,41.728248,1
1,2060,4,SEATTLE,2019-03-23 10:53:19,2019-03-23 10:53:19,41.725050,41.725050,41.728232,41.728232,1
2,2060,6,VN CMP,2019-03-27 09:58:39,2019-03-27 09:58:39,41.725050,41.725050,41.728243,41.728243,1
3,2060,8,ANDERNACH,2019-03-30 12:28:28,2019-03-30 12:28:28,41.725052,41.725052,41.728248,41.728248,1
4,2078,2,USMSY>DOHAI,2019-03-20 10:37:45,2019-03-20 10:37:45,17.783928,17.783928,-70.517782,-70.517782,1
...,...,...,...,...,...,...,...,...,...,...
96872,926416334,27,NOVI-SAD,2019-03-27 15:35:10,2019-03-27 15:35:10,45.251007,45.251007,19.906517,19.906517,1
96873,926416334,28,CONSTANTA,2019-03-27 15:42:26,2019-03-28 18:29:12,45.250868,44.688962,19.906155,22.404160,206
96874,926416334,29,BOTANY BAY,2019-03-28 18:31:44,2019-03-28 18:31:44,44.693363,44.693363,22.407612,22.407612,1
96875,926416334,30,CONSTANTA,2019-03-28 18:41:12,2019-04-04 17:19:24,44.707773,44.112117,22.424920,28.643443,1158


In [20]:
routes_df.sort_values(['mmsi', 'route'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count
mmsi,route,destination,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2060,2,VIARREGIO,2019-03-20 06:19:28,2019-03-20 06:19:28,41.725052,41.725052,41.728248,41.728248,1
2060,4,SEATTLE,2019-03-23 10:53:19,2019-03-23 10:53:19,41.725050,41.725050,41.728232,41.728232,1
2060,6,VN CMP,2019-03-27 09:58:39,2019-03-27 09:58:39,41.725050,41.725050,41.728243,41.728243,1
2060,8,ANDERNACH,2019-03-30 12:28:28,2019-03-30 12:28:28,41.725052,41.725052,41.728248,41.728248,1
2078,2,USMSY>DOHAI,2019-03-20 10:37:45,2019-03-20 10:37:45,17.783928,17.783928,-70.517782,-70.517782,1
...,...,...,...,...,...,...,...,...,...
926416334,27,NOVI-SAD,2019-03-27 15:35:10,2019-03-27 15:35:10,45.251007,45.251007,19.906517,19.906517,1
926416334,28,CONSTANTA,2019-03-27 15:42:26,2019-03-28 18:29:12,45.250868,44.688962,19.906155,22.404160,206
926416334,29,BOTANY BAY,2019-03-28 18:31:44,2019-03-28 18:31:44,44.693363,44.693363,22.407612,22.407612,1
926416334,30,CONSTANTA,2019-03-28 18:41:12,2019-04-04 17:19:24,44.707773,44.112117,22.424920,28.643443,1158


In [21]:
routes_df[routes_df['route_count'] == 2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count
mmsi,route,destination,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4781030,3,NOVI SAD,2019-03-24 09:35:45,2019-03-24 09:38:46,44.029993,44.031337,26.374983,26.388475,2
111111111,3,16,2019-03-17 00:02:27,2019-03-17 00:02:52,4.903215,-5.979007,-1.092560,105.851497,2
111111111,5,16,2019-03-17 00:07:41,2019-03-17 00:07:45,36.992958,-3.997083,122.864695,115.724860,2
111111111,21,16,2019-03-17 01:27:11,2019-03-17 01:27:33,4.895008,-3.997285,-1.142972,115.724672,2
111111111,23,16,2019-03-17 01:30:13,2019-03-17 01:30:42,-3.997297,-5.978998,115.724702,105.851505,2
...,...,...,...,...,...,...,...,...,...
764163140,41,MIDIA<<,2019-03-19 07:27:39,2019-03-19 07:38:59,44.100485,44.099885,28.543573,28.515673,2
764163140,42,RUSE>>,2019-03-19 08:42:58,2019-03-19 08:54:59,44.166830,44.184350,28.412252,28.395948,2
764163140,44,RUSE>>,2019-03-19 09:19:59,2019-03-19 09:30:18,44.225503,44.239713,28.374888,28.359003,2
764163140,46,RUSE>>,2019-03-19 09:50:59,2019-03-19 10:00:59,44.249912,44.249230,28.314042,28.289203,2


In [22]:
np.unique(routes_df[routes_df['route_count'] < 20]["route_count"].values, return_counts=True)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19]),
 array([44028, 11846,  5769,  3747,  2531,  1929,  1435,  1281,  1070,
          908,   805,   653,   569,   534,   449,   450,   388,   345,
          320]))

In [23]:
routes_df['route_time'] = routes_df['dt_insert_utc_last'] - routes_df['dt_insert_utc_first']

In [24]:
!pip install geopy
import geopy.distance



In [25]:
routes_df['distance'] = [geopy.distance.geodesic([latitude_first, longitude_first], [latitude_last, longitude_last]).km
                         for (latitude_first, longitude_first, latitude_last, longitude_last) in
                         zip(routes_df['latitude_first'], routes_df['longitude_first'], routes_df['latitude_last'], routes_df['longitude_last'])]

In [26]:
routes_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance
mmsi,route,destination,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2060,2,VIARREGIO,2019-03-20 06:19:28,2019-03-20 06:19:28,41.725052,41.725052,41.728248,41.728248,1,0 days 00:00:00,0.000000
2060,4,SEATTLE,2019-03-23 10:53:19,2019-03-23 10:53:19,41.725050,41.725050,41.728232,41.728232,1,0 days 00:00:00,0.000000
2060,6,VN CMP,2019-03-27 09:58:39,2019-03-27 09:58:39,41.725050,41.725050,41.728243,41.728243,1,0 days 00:00:00,0.000000
2060,8,ANDERNACH,2019-03-30 12:28:28,2019-03-30 12:28:28,41.725052,41.725052,41.728248,41.728248,1,0 days 00:00:00,0.000000
2078,2,USMSY>DOHAI,2019-03-20 10:37:45,2019-03-20 10:37:45,17.783928,17.783928,-70.517782,-70.517782,1,0 days 00:00:00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
926416334,27,NOVI-SAD,2019-03-27 15:35:10,2019-03-27 15:35:10,45.251007,45.251007,19.906517,19.906517,1,0 days 00:00:00,0.000000
926416334,28,CONSTANTA,2019-03-27 15:42:26,2019-03-28 18:29:12,45.250868,44.688962,19.906155,22.404160,206,1 days 02:46:46,206.709616
926416334,29,BOTANY BAY,2019-03-28 18:31:44,2019-03-28 18:31:44,44.693363,44.693363,22.407612,22.407612,1,0 days 00:00:00,0.000000
926416334,30,CONSTANTA,2019-03-28 18:41:12,2019-04-04 17:19:24,44.707773,44.112117,22.424920,28.643443,1158,6 days 22:38:12,499.593331


In [27]:
routes_df['real_route'] = routes_df['distance'] > 0.5

In [28]:
routes_df.groupby('real_route')['route_count'].count()

real_route
False    68177
True     28700
Name: route_count, dtype: int64

In [29]:
routes_df.groupby('real_route')['route_count'].sum()

real_route
False    1723660
True     3688891
Name: route_count, dtype: int64

In [30]:
real_routes = routes_df[routes_df['distance'] > 50].reset_index()

In [31]:
real_routes['consecutive'] = real_routes['route'] - real_routes['route'].shift(1)
real_routes[real_routes['consecutive'] == 0]

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance,real_route,consecutive
193,210104000,1,NIKOLAEV,2019-03-17 00:06:48,2019-03-22 10:41:31,46.347902,46.946862,31.042153,31.99409,883,5 days 10:34:43,98.710781,True,0.0
320,214181402,1,NAPOLI,2019-03-17 11:22:49,2019-03-21 15:19:24,45.43049,40.767205,36.699197,27.595315,283,4 days 03:56:35,903.355681,True,0.0
742,229811000,12,TRAMB,2019-03-27 06:52:08,2019-03-27 06:56:37,45.44726,8.913888,28.274265,-79.521182,2,0 days 00:04:29,10665.498604,True,0.0
1012,244020098,1,GE PTI POTI,2019-03-17 00:08:37,2019-03-19 05:06:03,51.277483,48.2935,4.333567,-6.2386,972,2 days 04:57:26,829.554304,True,0.0
1468,253372000,1,VARNA,2019-03-17 00:10:24,2019-03-22 21:37:33,36.456035,40.214438,-1.490067,26.454953,191,5 days 21:27:09,2468.067786,True,0.0
1636,256183000,3,RU TAG,2019-03-18 15:38:23,2019-03-18 19:50:02,41.943105,42.147405,32.813602,33.502595,3,0 days 04:11:39,61.390877,True,0.0
1998,264162458,1,CONSTANTA,2019-03-17 00:06:29,2019-03-18 21:53:49,44.161667,44.161673,-2.7743,28.65581,491,1 days 21:47:20,2498.523323,True,0.0
2081,264163264,3,CONSTANTA,2019-03-18 11:53:25,2019-03-22 12:34:19,45.269718,44.001803,19.860607,22.938145,727,4 days 00:40:54,281.890623,True,0.0
2450,271001038,1,TRGEM,2019-03-17 00:02:32,2019-03-18 01:33:43,41.265167,40.438,29.076667,29.122333,151,1 days 01:31:11,91.938396,True,0.0
2713,271040007,1,CONSTANTA,2019-03-17 00:07:26,2019-03-17 06:55:26,40.249472,40.804457,26.507158,28.241383,32,0 days 06:48:00,159.343544,True,0.0


In [32]:
routes_df = routes_df.reset_index()

In [33]:
routes_df[routes_df['mmsi'] == 264162493].sort_values(by=['route'])

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance,real_route
21566,264162493,1,SULINA,2019-03-21 04:18:11,2019-03-21 09:27:22,45.19665,45.172753,28.790003,29.477312,18,0 days 05:09:11,54.082775,True
21567,264162493,2,TULCEA,2019-03-22 18:52:59,2019-03-22 19:02:57,45.173902,45.18205,29.003275,28.989367,2,0 days 00:09:58,1.419577,True
21568,264162493,3,SULINA,2019-03-22 19:15:08,2019-03-22 19:15:08,45.191992,45.191992,28.972425,28.972425,1,0 days 00:00:00,0.0,False
21569,264162493,4,TULCEA,2019-03-22 19:26:08,2019-03-22 19:47:38,45.193892,45.190358,28.952832,28.914788,3,0 days 00:21:30,3.015234,True
21570,264162493,5,SULINA,2019-03-22 19:58:09,2019-03-22 19:58:09,45.186913,45.186913,28.896172,28.896172,1,0 days 00:00:00,0.0,False
21571,264162493,6,TULCEA,2019-03-22 20:08:18,2019-03-22 21:12:28,45.192417,45.196733,28.878757,28.789972,7,0 days 01:04:10,6.993156,True
21572,264162493,7,BRAILA>GROPENI,2019-03-25 04:22:43,2019-03-25 17:23:22,45.197127,45.433413,28.789993,28.188215,69,0 days 13:00:39,54.002182,True
21573,264162493,8,TULCEA,2019-03-26 03:42:32,2019-03-26 06:01:23,45.4334,45.415115,28.188257,28.039555,20,0 days 02:18:51,11.813904,True
21574,264162493,9,BRAILA>GROPENI,2019-03-26 06:09:22,2019-03-26 06:09:22,45.40682,45.40682,28.03021,28.03021,1,0 days 00:00:00,0.0,False
21575,264162493,10,TULCEA,2019-03-26 06:13:42,2019-03-26 09:11:34,45.401603,45.189667,28.027473,27.945155,25,0 days 02:57:52,24.423152,True


In [34]:
routes_df = routes_df[routes_df['distance'] != 0]
routes_df

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance,real_route
13,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-18 08:59:04,43.981573,44.032652,26.121168,26.395267,11,0 days 02:02:19,22.702591,True
15,2609076,3,CONSTANTA,2019-03-18 09:19:25,2019-03-18 10:53:45,44.040135,44.081035,26.452663,26.711707,12,0 days 01:34:20,21.247413,True
17,2609076,5,CONSTANTA,2019-03-18 11:15:04,2019-03-22 23:45:55,44.078045,44.120152,26.767333,28.644393,431,4 days 12:30:51,150.370561,True
18,2609076,6,CORABIA,2019-03-23 00:01:54,2019-03-23 00:59:01,44.120182,44.120142,28.644377,28.644448,5,0 days 00:57:07,0.007257,False
20,2609076,8,CORABIA,2019-03-23 01:26:02,2019-03-25 14:54:12,44.120112,43.985517,28.644448,26.159403,161,2 days 13:28:10,199.692188,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96869,926416334,24,CONSTANTA,2019-03-27 13:59:10,2019-03-27 14:11:10,45.258735,45.263042,19.891665,19.879500,4,0 days 00:12:00,1.068057,True
96871,926416334,26,CONSTANTA,2019-03-27 14:23:10,2019-03-27 15:32:24,45.264937,45.251797,19.864587,19.907097,14,0 days 01:09:14,3.642256,True
96873,926416334,28,CONSTANTA,2019-03-27 15:42:26,2019-03-28 18:29:12,45.250868,44.688962,19.906155,22.404160,206,1 days 02:46:46,206.709616,True
96875,926416334,30,CONSTANTA,2019-03-28 18:41:12,2019-04-04 17:19:24,44.707773,44.112117,22.424920,28.643443,1158,6 days 22:38:12,499.593331,True


In [35]:
routes_df['destination_lag'] = routes_df['destination'].shift(1)

In [36]:
routes_df['new_route'] =  routes_df.groupby('mmsi').apply(
    lambda x: (x['destination'] != x['destination_lag']).cumsum()
).reset_index(level=0, drop=True)

In [37]:
new_routes_df = routes_df.groupby(['mmsi', 'new_route', 'destination']).agg(
    {
        'dt_insert_utc_first': 'first',
        'dt_insert_utc_last': 'last',
        'latitude_first': 'first',
        'latitude_last': 'last',
        'longitude_first': 'first',
        'longitude_last': 'last',
    }
).reset_index()
new_routes_df

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707
...,...,...,...,...,...,...,...,...,...
24122,764163140,19,MIDIA>>,2019-03-27 05:21:20,2019-04-01 16:20:46,44.006090,44.324732,26.220532,28.627715
24123,863227588,1,OPL KAVKAZ,2019-04-01 05:36:27,2019-04-01 08:57:48,45.373048,45.012038,36.662702,36.546737
24124,926416334,1,NOVI-SAD,2019-03-17 13:43:48,2019-03-26 23:55:10,48.979617,45.223505,12.039143,19.667515
24125,926416334,2,CONSTANTA,2019-03-27 00:05:14,2019-04-04 17:19:24,45.230195,44.112117,19.697080,28.643443


In [38]:
new_routes_df['origin'] = new_routes_df.groupby('mmsi')['destination'].shift(1)

In [39]:
new_routes_df

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,"
...,...,...,...,...,...,...,...,...,...,...
24122,764163140,19,MIDIA>>,2019-03-27 05:21:20,2019-04-01 16:20:46,44.006090,44.324732,26.220532,28.627715,RUSE>>
24123,863227588,1,OPL KAVKAZ,2019-04-01 05:36:27,2019-04-01 08:57:48,45.373048,45.012038,36.662702,36.546737,
24124,926416334,1,NOVI-SAD,2019-03-17 13:43:48,2019-03-26 23:55:10,48.979617,45.223505,12.039143,19.667515,
24125,926416334,2,CONSTANTA,2019-03-27 00:05:14,2019-04-04 17:19:24,45.230195,44.112117,19.697080,28.643443,NOVI-SAD


In [69]:
new_routes_df['reference_week_first_dt'] = (new_routes_df['dt_insert_utc_first'] > '2019-04-01') & (new_routes_df['dt_insert_utc_first'] <= '2019-04-07')
new_routes_df['reference_week_last_dt'] = (new_routes_df['dt_insert_utc_last'] > '2019-04-01') & (new_routes_df['dt_insert_utc_last'] <= '2019-04-07')

In [70]:
new_routes_df['next_destination'] = new_routes_df.groupby('mmsi')['destination'].shift(-1)
new_routes_df

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week,next_destination,reference_week_first_dt,reference_week_last_dt
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,,False,CORABIA,False,False
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA,False,CONSTANTA,False,False
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA,False,"HARSOVA ,",False,True
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA,True,CONSTANTA,True,True
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,",True,ZIMNICEA,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24122,764163140,19,MIDIA>>,2019-03-27 05:21:20,2019-04-01 16:20:46,44.006090,44.324732,26.220532,28.627715,RUSE>>,False,,False,True
24123,863227588,1,OPL KAVKAZ,2019-04-01 05:36:27,2019-04-01 08:57:48,45.373048,45.012038,36.662702,36.546737,,True,,True,True
24124,926416334,1,NOVI-SAD,2019-03-17 13:43:48,2019-03-26 23:55:10,48.979617,45.223505,12.039143,19.667515,,False,CONSTANTA,False,False
24125,926416334,2,CONSTANTA,2019-03-27 00:05:14,2019-04-04 17:19:24,45.230195,44.112117,19.697080,28.643443,NOVI-SAD,False,CORABIA,False,True


In [71]:
ship_data_enriched = fc.create_ship_data_enriched()

In [72]:
new_routes_df_enriched = new_routes_df.merge(
    ship_data_enriched,
    left_on='mmsi',
    right_on="MaritimeMobileServiceIdentityMMSINumber",
    how='left'
)
new_routes_df_enriched

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week,next_destination,reference_week_first_dt,reference_week_last_dt,LRIMOShipNo,...,TotalKilowattsofMainEngines,TotalPowerOfAllEngines,TotalPowerOfAuxiliaryEngines,ShiptypeLevel5_y,Level4Code,ShipTypeLevel4,Level3Code,ShipTypeLevel3,Level2Code,ShipTypeLevel2,ShipTypeLevel1Code,ShiptypeLevel1,HullType,SubGroup,SubType
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,,False,CORABIA,False,False,,...,,,,,,,,,,,,,,,
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA,False,CONSTANTA,False,False,,...,,,,,,,,,,,,,,,
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA,False,"HARSOVA ,",False,True,,...,,,,,,,,,,,,,,,
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA,True,CONSTANTA,True,True,,...,,,,,,,,,,,,,,,
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,",True,ZIMNICEA,True,False,,...,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24122,764163140,19,MIDIA>>,2019-03-27 05:21:20,2019-04-01 16:20:46,44.006090,44.324732,26.220532,28.627715,RUSE>>,False,,False,True,,...,,,,,,,,,,,,,,,
24123,863227588,1,OPL KAVKAZ,2019-04-01 05:36:27,2019-04-01 08:57:48,45.373048,45.012038,36.662702,36.546737,,True,,True,True,,...,,,,,,,,,,,,,,,
24124,926416334,1,NOVI-SAD,2019-03-17 13:43:48,2019-03-26 23:55:10,48.979617,45.223505,12.039143,19.667515,,False,CONSTANTA,False,False,,...,,,,,,,,,,,,,,,
24125,926416334,2,CONSTANTA,2019-03-27 00:05:14,2019-04-04 17:19:24,45.230195,44.112117,19.697080,28.643443,NOVI-SAD,False,CORABIA,False,True,,...,,,,,,,,,,,,,,,


In [73]:
new_routes_df_enriched.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24127 entries, 0 to 24126
Columns: 133 entries, mmsi to SubType
dtypes: bool(3), datetime64[ns](2), float64(64), int64(2), object(62)
memory usage: 24.2+ MB


What should we impute ?

In [74]:
new_routes_df_enriched.columns

Index(['mmsi', 'new_route', 'destination', 'dt_insert_utc_first',
       'dt_insert_utc_last', 'latitude_first', 'latitude_last',
       'longitude_first', 'longitude_last', 'origin',
       ...
       'ShipTypeLevel4', 'Level3Code', 'ShipTypeLevel3', 'Level2Code',
       'ShipTypeLevel2', 'ShipTypeLevel1Code', 'ShiptypeLevel1', 'HullType',
       'SubGroup', 'SubType'],
      dtype='object', length=133)

In [75]:
new_routes_df_enriched['ShipTypeLevel3'] = new_routes_df_enriched['ShipTypeLevel3'].fillna(new_routes_df_enriched.ShipTypeLevel3.mode().iloc[0])

In [76]:
new_routes_df_enriched['GrossTonnage'] = new_routes_df_enriched['GrossTonnage'].fillna(new_routes_df_enriched.GrossTonnage.mean())
new_routes_df_enriched['NetTonnage'] = new_routes_df_enriched['NetTonnage'].fillna(new_routes_df_enriched.NetTonnage.mean())

Departure counts

In [77]:
new_routes_df_enriched.head()

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week,next_destination,reference_week_first_dt,reference_week_last_dt,LRIMOShipNo,...,TotalKilowattsofMainEngines,TotalPowerOfAllEngines,TotalPowerOfAuxiliaryEngines,ShiptypeLevel5_y,Level4Code,ShipTypeLevel4,Level3Code,ShipTypeLevel3,Level2Code,ShipTypeLevel2,ShipTypeLevel1Code,ShiptypeLevel1,HullType,SubGroup,SubType
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,,False,CORABIA,False,False,,...,,,,,,,,General Cargo,,,,,,,
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA,False,CONSTANTA,False,False,,...,,,,,,,,General Cargo,,,,,,,
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA,False,"HARSOVA ,",False,True,,...,,,,,,,,General Cargo,,,,,,,
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA,True,CONSTANTA,True,True,,...,,,,,,,,General Cargo,,,,,,,
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,",True,ZIMNICEA,True,False,,...,,,,,,,,General Cargo,,,,,,,


In [82]:
departure_counts = pd.DataFrame(
    new_routes_df_enriched[new_routes_df_enriched.reference_week_first_dt].groupby(['origin', 'destination', 'ShipTypeLevel3']).agg(
        {
            'mmsi': 'count',
            'GrossTonnage': 'sum',
            'NetTonnage': 'sum'
        }
    )
).reset_index().rename(
    columns={'mmsi': 'count',
             'origin': 'port'}
)

In [83]:
departure_counts

Unnamed: 0,port,destination,ShipTypeLevel3,count,GrossTonnage,NetTonnage
0,!AB C E EL,AC BJ E EO,General Cargo,1,14453.298196,7691.216159
1,"""BDR$6.)!Y^5;FNC4(G",SAMSUN,General Cargo,1,2598.000000,1163.000000
2,-,-C?M,Fish Catching,1,721.000000,216.000000
3,-,-[C?<)RD\K$OQ8),Fish Catching,1,721.000000,216.000000
4,-,KRYM-KAVKAZ,General Cargo,4,57813.192782,30764.864637
...,...,...,...,...,...,...
3523,ZPR,KHERSON,General Cargo,1,1659.000000,498.000000
3524,ZPR,SAMSUN,General Cargo,1,2466.000000,988.000000
3525,_,C08!TTAA>5,General Cargo,1,14453.298196,7691.216159
3526,_,ROSTOV,General Cargo,1,14453.298196,7691.216159


Arrivals

In [84]:
new_routes_df_enriched.head()

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week,next_destination,reference_week_first_dt,reference_week_last_dt,LRIMOShipNo,...,TotalKilowattsofMainEngines,TotalPowerOfAllEngines,TotalPowerOfAuxiliaryEngines,ShiptypeLevel5_y,Level4Code,ShipTypeLevel4,Level3Code,ShipTypeLevel3,Level2Code,ShipTypeLevel2,ShipTypeLevel1Code,ShiptypeLevel1,HullType,SubGroup,SubType
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,,False,CORABIA,False,False,,...,,,,,,,,General Cargo,,,,,,,
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA,False,CONSTANTA,False,False,,...,,,,,,,,General Cargo,,,,,,,
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA,False,"HARSOVA ,",False,True,,...,,,,,,,,General Cargo,,,,,,,
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA,True,CONSTANTA,True,True,,...,,,,,,,,General Cargo,,,,,,,
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,",True,ZIMNICEA,True,False,,...,,,,,,,,General Cargo,,,,,,,


In [85]:
arrival_counts = pd.DataFrame(
    new_routes_df_enriched[new_routes_df_enriched.reference_week_last_dt].groupby(['destination', 'origin', 'ShipTypeLevel3']).agg(
        {
            'mmsi': 'count',
            'GrossTonnage': 'sum',
            'NetTonnage': 'sum'
        }
    )
).reset_index().rename(
    columns={'mmsi': 'count',
             'destination': 'port'}
)

Export data

In [99]:
import s3fs
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://minio.lab.sspcloud.fr'}
)

In [101]:
path = "AIS/departure_counts_april_19.csv"
bucket = "projet-hackathon-un-2022"

departure_counts.to_csv(
    fs.open(f'{bucket}/{path}',
            mode='w')
)

In [102]:
path = "AIS/arrival_counts_april_19.csv"

arrival_counts.to_csv(
    fs.open(f'{bucket}/{path}',
            mode='w')
)

## 2022

In [103]:
path_parquet = "AIS/ais_azov_black_20220401_20220408_full_traces_before.parquet"

ais_df = fc.read_ais_parquet(path_parquet=path_parquet)

In [104]:
ais_df.shape

(4642595, 6)

## RESET : enchaînement des destinations ? 

In [105]:
ais_df.shape

(4642595, 6)

In [106]:
ais_df = ais_df.sort_values(by=['mmsi', 'dt_insert_utc'])

In [107]:
destination_mapping = {
    destination: index for destination, index in zip(ais_df.destination.unique(), np.arange(ais_df.destination.nunique()))
}

In [108]:
ais_df['destination_index'] = ais_df['destination'].map(destination_mapping)
ais_df['destination_index'] = ais_df['destination_index'].fillna(-1)
ais_df['destination_index'] = ais_df['destination_index'].astype(int)

In [109]:
def assign_routes_to_group(x):
    return x['destination_index'].diff().ne(0).cumsum()

In [110]:
ais_df['route'] =  ais_df.groupby('mmsi').apply(
    assign_routes_to_group
).reset_index(level=0, drop=True)

In [111]:
routes_df = ais_df.groupby(['mmsi', 'route', 'destination']).agg(
    {
        'dt_insert_utc': ['first', 'last'], 
        'latitude': ['first', 'last'], 
        'longitude': ['first', 'last'],
        'route': 'count'
    }
)

In [112]:
routes_df.columns = routes_df.columns.map('_'.join).str.strip('_')

In [113]:
routes_df.reset_index()

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count
0,203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,4
1,203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,820
2,203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.971210,28.842605,27.901198,380
3,203999385,1,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,9
4,203999385,2,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,389
...,...,...,...,...,...,...,...,...,...,...
14994,677096200,7,CONSTANTABSEASTAR,2022-03-28 14:49:23,2022-03-28 17:55:15,39.906448,40.001940,25.783222,26.103860,14
14995,677096200,8,CONSTANTA,2022-03-28 18:08:43,2022-04-08 23:48:58,40.005000,44.101322,26.136667,28.665107,1533
14996,750332000,1,TR MER,2022-03-17 18:41:04,2022-03-19 19:14:59,35.984062,36.750380,32.112638,27.730957,40
14997,750332000,2,RO CND,2022-03-21 09:11:26,2022-03-31 13:03:06,36.640650,44.102088,27.684393,28.665137,1029


In [114]:
routes_df.sort_values(['mmsi', 'route'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count
mmsi,route,destination,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,4
203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,820
203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.971210,28.842605,27.901198,380
203999385,1,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,9
203999385,2,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,389
...,...,...,...,...,...,...,...,...,...
677096200,7,CONSTANTABSEASTAR,2022-03-28 14:49:23,2022-03-28 17:55:15,39.906448,40.001940,25.783222,26.103860,14
677096200,8,CONSTANTA,2022-03-28 18:08:43,2022-04-08 23:48:58,40.005000,44.101322,26.136667,28.665107,1533
750332000,1,TR MER,2022-03-17 18:41:04,2022-03-19 19:14:59,35.984062,36.750380,32.112638,27.730957,40
750332000,2,RO CND,2022-03-21 09:11:26,2022-03-31 13:03:06,36.640650,44.102088,27.684393,28.665137,1029


In [115]:
routes_df[routes_df['route_count'] == 2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count
mmsi,route,destination,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
207072433,1,CHICIU,2022-04-01 04:58:14,2022-04-01 05:16:13,45.280943,45.280938,27.984675,27.984680,2
207261414,2,GIURGIU,2022-04-05 12:23:43,2022-04-05 12:33:42,44.101082,44.101062,28.602585,28.577375,2
211191580,2,STRAUBING,2022-04-04 18:25:55,2022-04-08 11:36:03,43.863925,44.432337,25.956177,22.484270,2
211519830,4,IZMAIBJP,2022-03-17 08:43:46,2022-03-17 08:54:01,45.446493,45.446493,28.104132,28.104123,2
211519830,32,IZMAIBJP,2022-03-20 12:11:02,2022-03-20 12:21:41,45.446493,45.446505,28.104117,28.104125,2
...,...,...,...,...,...,...,...,...,...
677057000,127,TR SSX,2022-04-05 06:02:33,2022-04-05 06:06:30,25.302872,25.302905,55.334120,55.334082,2
677057000,144,RU TUA,2022-04-06 23:36:40,2022-04-06 23:39:42,25.303440,43.966158,55.334512,38.994285,2
677057000,145,SHARJAH,2022-04-06 23:49:38,2022-04-06 23:52:15,43.976575,25.303353,39.004480,55.334535,2
677057000,153,SHARJAH,2022-04-07 18:42:54,2022-04-07 18:46:59,25.303373,43.570837,55.334492,38.570412,2


In [116]:
np.unique(routes_df[routes_df['route_count'] < 20]["route_count"].values, return_counts=True)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19]),
 array([2392, 1755,  882,  599,  471,  285,  260,  186,  171,  149,  121,
         123,  111,  108,   82,   91,   76,   80,   80]))

In [117]:
routes_df['route_time'] = routes_df['dt_insert_utc_last'] - routes_df['dt_insert_utc_first']

In [118]:
routes_df['distance'] = [geopy.distance.geodesic([latitude_first, longitude_first], [latitude_last, longitude_last]).km
                         for (latitude_first, longitude_first, latitude_last, longitude_last) in
                         zip(routes_df['latitude_first'], routes_df['longitude_first'], routes_df['latitude_last'], routes_df['longitude_last'])]

In [119]:
routes_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance
mmsi,route,destination,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,4,10 days 20:26:44,291.628183
203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,820,4 days 11:04:13,72.368884
203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.971210,28.842605,27.901198,380,1 days 19:50:24,83.551897
203999385,1,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,9,7 days 20:48:32,553.696268
203999385,2,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,389,1 days 23:28:48,71.938351
...,...,...,...,...,...,...,...,...,...,...,...
677096200,7,CONSTANTABSEASTAR,2022-03-28 14:49:23,2022-03-28 17:55:15,39.906448,40.001940,25.783222,26.103860,14,0 days 03:05:52,29.378800
677096200,8,CONSTANTA,2022-03-28 18:08:43,2022-04-08 23:48:58,40.005000,44.101322,26.136667,28.665107,1533,11 days 05:40:15,500.763708
750332000,1,TR MER,2022-03-17 18:41:04,2022-03-19 19:14:59,35.984062,36.750380,32.112638,27.730957,40,2 days 00:33:55,402.277709
750332000,2,RO CND,2022-03-21 09:11:26,2022-03-31 13:03:06,36.640650,44.102088,27.684393,28.665137,1029,10 days 03:51:40,832.690948


In [120]:
routes_df['real_route'] = routes_df['distance'] > 0.5

In [121]:
routes_df.groupby('real_route')['route_count'].count()

real_route
False    6732
True     8267
Name: route_count, dtype: int64

In [122]:
routes_df.groupby('real_route')['route_count'].sum()

real_route
False     990897
True     2673264
Name: route_count, dtype: int64

In [123]:
real_routes = routes_df[routes_df['distance'] > 50].reset_index()

In [124]:
real_routes['consecutive'] = real_routes['route'] - real_routes['route'].shift(1)
real_routes[real_routes['consecutive'] == 0]

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance,real_route,consecutive
16,207072362,1,RUSE,2022-03-17 01:23:12,2022-03-30 00:33:12,43.646667,44.341667,25.308333,28.018333,41,12 days 23:10:00,230.675151,True,0.0
21,207261205,1,CONSTANTA,2022-03-31 05:57:48,2022-04-08 23:54:50,43.858410,44.115085,25.956097,28.643325,784,8 days 17:57:02,217.448482,True,0.0
22,207261209,1,RUSE,2022-03-19 15:39:45,2022-03-20 15:22:55,43.832772,44.132113,25.925093,27.277185,3,0 days 23:43:10,113.460957,True,0.0
25,207277000,1,BGBOJ,2022-03-17 00:06:54,2022-03-18 23:55:04,42.342872,42.485278,35.028608,27.454992,240,1 days 23:48:10,623.389002,True,0.0
35,211191580,1,LOM,2022-03-19 10:03:24,2022-04-01 19:29:46,48.306890,44.067578,14.264445,26.659105,55,13 days 09:26:22,1064.912499,True,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4519,636021246,2,ITGIT,2022-03-18 19:45:21,2022-03-21 16:57:28,45.456635,38.465290,12.257657,15.911782,238,2 days 21:12:07,833.299968,True,0.0
4617,667001466,1,EG DAM,2022-03-17 00:06:57,2022-03-17 06:44:12,40.022123,38.947463,26.106905,25.731308,41,0 days 06:37:15,123.611726,True,0.0
4642,667001710,2,ORDER,2022-03-27 18:44:18,2022-03-31 14:59:51,34.513873,41.427860,34.080007,29.174417,342,3 days 20:15:33,879.805556,True,0.0
4827,677021400,1,KOPER,2022-03-17 00:05:04,2022-03-24 14:48:09,36.876667,45.558473,2.966667,13.744502,1505,7 days 14:43:05,1319.204795,True,0.0


In [125]:
routes_df = routes_df.reset_index()

In [127]:
routes_df = routes_df[routes_df['distance'] != 0]
routes_df

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance,real_route
0,203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,4,10 days 20:26:44,291.628183,True
1,203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,820,4 days 11:04:13,72.368884,True
2,203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.971210,28.842605,27.901198,380,1 days 19:50:24,83.551897,True
3,203999385,1,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,9,7 days 20:48:32,553.696268,True
4,203999385,2,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,389,1 days 23:28:48,71.938351,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,677096200,7,CONSTANTABSEASTAR,2022-03-28 14:49:23,2022-03-28 17:55:15,39.906448,40.001940,25.783222,26.103860,14,0 days 03:05:52,29.378800,True
14995,677096200,8,CONSTANTA,2022-03-28 18:08:43,2022-04-08 23:48:58,40.005000,44.101322,26.136667,28.665107,1533,11 days 05:40:15,500.763708,True
14996,750332000,1,TR MER,2022-03-17 18:41:04,2022-03-19 19:14:59,35.984062,36.750380,32.112638,27.730957,40,2 days 00:33:55,402.277709,True
14997,750332000,2,RO CND,2022-03-21 09:11:26,2022-03-31 13:03:06,36.640650,44.102088,27.684393,28.665137,1029,10 days 03:51:40,832.690948,True


In [128]:
routes_df['destination_lag'] = routes_df['destination'].shift(1)

In [129]:
routes_df['new_route'] =  routes_df.groupby('mmsi').apply(
    lambda x: (x['destination'] != x['destination_lag']).cumsum()
).reset_index(level=0, drop=True)

In [130]:
new_routes_df = routes_df.groupby(['mmsi', 'new_route', 'destination']).agg(
    {
        'dt_insert_utc_first': 'first',
        'dt_insert_utc_last': 'last',
        'latitude_first': 'first',
        'latitude_last': 'last',
        'longitude_first': 'first',
        'longitude_last': 'last',
    }
).reset_index()
new_routes_df

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last
0,203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658
1,203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597
2,203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.971210,28.842605,27.901198
3,203999385,0,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915
4,203999385,1,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393
...,...,...,...,...,...,...,...,...,...
9747,677096200,7,CONSTANTABSEASTAR,2022-03-28 14:49:23,2022-03-28 17:55:15,39.906448,40.001940,25.783222,26.103860
9748,677096200,8,CONSTANTA,2022-03-28 18:08:43,2022-04-08 23:48:58,40.005000,44.101322,26.136667,28.665107
9749,750332000,1,TR MER,2022-03-17 18:41:04,2022-03-19 19:14:59,35.984062,36.750380,32.112638,27.730957
9750,750332000,2,RO CND,2022-03-21 09:11:26,2022-03-31 13:03:06,36.640650,44.102088,27.684393,28.665137


In [131]:
new_routes_df['origin'] = new_routes_df.groupby('mmsi')['destination'].shift(1)

In [132]:
new_routes_df

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin
0,203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,
1,203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,SMEDEREVO
2,203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.971210,28.842605,27.901198,IZMAIL
3,203999385,0,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,
4,203999385,1,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,SMEDEREVO
...,...,...,...,...,...,...,...,...,...,...
9747,677096200,7,CONSTANTABSEASTAR,2022-03-28 14:49:23,2022-03-28 17:55:15,39.906448,40.001940,25.783222,26.103860,CONSTANTA
9748,677096200,8,CONSTANTA,2022-03-28 18:08:43,2022-04-08 23:48:58,40.005000,44.101322,26.136667,28.665107,CONSTANTABSEASTAR
9749,750332000,1,TR MER,2022-03-17 18:41:04,2022-03-19 19:14:59,35.984062,36.750380,32.112638,27.730957,
9750,750332000,2,RO CND,2022-03-21 09:11:26,2022-03-31 13:03:06,36.640650,44.102088,27.684393,28.665137,TR MER


In [134]:
new_routes_df['reference_week_first_dt'] = (new_routes_df['dt_insert_utc_first'] > '2022-04-01') & (new_routes_df['dt_insert_utc_first'] <= '2022-04-07')
new_routes_df['reference_week_last_dt'] = (new_routes_df['dt_insert_utc_last'] > '2022-04-01') & (new_routes_df['dt_insert_utc_last'] <= '2022-04-07')

In [135]:
new_routes_df['next_destination'] = new_routes_df.groupby('mmsi')['destination'].shift(-1)
new_routes_df

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week_first_dt,reference_week_last_dt,next_destination
0,203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,,False,False,IZMAIL
1,203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,SMEDEREVO,False,True,SMEDEREVO
2,203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.971210,28.842605,27.901198,IZMAIL,True,True,
3,203999385,0,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,,False,True,IZMAIL
4,203999385,1,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,SMEDEREVO,True,True,SMEDEREVO
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9747,677096200,7,CONSTANTABSEASTAR,2022-03-28 14:49:23,2022-03-28 17:55:15,39.906448,40.001940,25.783222,26.103860,CONSTANTA,False,False,CONSTANTA
9748,677096200,8,CONSTANTA,2022-03-28 18:08:43,2022-04-08 23:48:58,40.005000,44.101322,26.136667,28.665107,CONSTANTABSEASTAR,False,False,
9749,750332000,1,TR MER,2022-03-17 18:41:04,2022-03-19 19:14:59,35.984062,36.750380,32.112638,27.730957,,False,False,RO CND
9750,750332000,2,RO CND,2022-03-21 09:11:26,2022-03-31 13:03:06,36.640650,44.102088,27.684393,28.665137,TR MER,False,False,TR NEM


In [136]:
ship_data_enriched = fc.create_ship_data_enriched()

In [137]:
new_routes_df_enriched = new_routes_df.merge(
    ship_data_enriched,
    left_on='mmsi',
    right_on="MaritimeMobileServiceIdentityMMSINumber",
    how='left'
)
new_routes_df_enriched

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week_first_dt,reference_week_last_dt,next_destination,LRIMOShipNo,StatCode5,...,TotalKilowattsofMainEngines,TotalPowerOfAllEngines,TotalPowerOfAuxiliaryEngines,ShiptypeLevel5_y,Level4Code,ShipTypeLevel4,Level3Code,ShipTypeLevel3,Level2Code,ShipTypeLevel2,ShipTypeLevel1Code,ShiptypeLevel1,HullType,SubGroup,SubType
0,203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,,False,False,IZMAIL,,,...,,,,,,,,,,,,,,,
1,203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,SMEDEREVO,False,True,SMEDEREVO,,,...,,,,,,,,,,,,,,,
2,203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.971210,28.842605,27.901198,IZMAIL,True,True,,,,...,,,,,,,,,,,,,,,
3,203999385,0,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,,False,True,IZMAIL,,,...,,,,,,,,,,,,,,,
4,203999385,1,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,SMEDEREVO,True,True,SMEDEREVO,,,...,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9747,677096200,7,CONSTANTABSEASTAR,2022-03-28 14:49:23,2022-03-28 17:55:15,39.906448,40.001940,25.783222,26.103860,CONSTANTA,False,False,CONSTANTA,9016155.0,A31A2GX,...,2942.0,3442.0,500.0,General Cargo Ship,A31A,General Cargo Ship,A31,General Cargo,A3,Dry Cargo/Passenger,A,Cargo Carrying,Ship Shape Including Multi-Hulls,General Cargo,General Cargo
9748,677096200,8,CONSTANTA,2022-03-28 18:08:43,2022-04-08 23:48:58,40.005000,44.101322,26.136667,28.665107,CONSTANTABSEASTAR,False,False,,9016155.0,A31A2GX,...,2942.0,3442.0,500.0,General Cargo Ship,A31A,General Cargo Ship,A31,General Cargo,A3,Dry Cargo/Passenger,A,Cargo Carrying,Ship Shape Including Multi-Hulls,General Cargo,General Cargo
9749,750332000,1,TR MER,2022-03-17 18:41:04,2022-03-19 19:14:59,35.984062,36.750380,32.112638,27.730957,,False,False,RO CND,,,...,,,,,,,,,,,,,,,
9750,750332000,2,RO CND,2022-03-21 09:11:26,2022-03-31 13:03:06,36.640650,44.102088,27.684393,28.665137,TR MER,False,False,TR NEM,,,...,,,,,,,,,,,,,,,


In [138]:
new_routes_df_enriched.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9752 entries, 0 to 9751
Columns: 132 entries, mmsi to SubType
dtypes: bool(2), datetime64[ns](2), float64(64), int64(2), object(62)
memory usage: 9.8+ MB


What should we impute ?

In [139]:
new_routes_df_enriched.columns

Index(['mmsi', 'new_route', 'destination', 'dt_insert_utc_first',
       'dt_insert_utc_last', 'latitude_first', 'latitude_last',
       'longitude_first', 'longitude_last', 'origin',
       ...
       'ShipTypeLevel4', 'Level3Code', 'ShipTypeLevel3', 'Level2Code',
       'ShipTypeLevel2', 'ShipTypeLevel1Code', 'ShiptypeLevel1', 'HullType',
       'SubGroup', 'SubType'],
      dtype='object', length=132)

In [140]:
new_routes_df_enriched['ShipTypeLevel3'] = new_routes_df_enriched['ShipTypeLevel3'].fillna(new_routes_df_enriched.ShipTypeLevel3.mode().iloc[0])

In [141]:
new_routes_df_enriched['GrossTonnage'] = new_routes_df_enriched['GrossTonnage'].fillna(new_routes_df_enriched.GrossTonnage.mean())
new_routes_df_enriched['NetTonnage'] = new_routes_df_enriched['NetTonnage'].fillna(new_routes_df_enriched.NetTonnage.mean())

Departure counts

In [142]:
new_routes_df_enriched.head()

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week_first_dt,reference_week_last_dt,next_destination,LRIMOShipNo,StatCode5,...,TotalKilowattsofMainEngines,TotalPowerOfAllEngines,TotalPowerOfAuxiliaryEngines,ShiptypeLevel5_y,Level4Code,ShipTypeLevel4,Level3Code,ShipTypeLevel3,Level2Code,ShipTypeLevel2,ShipTypeLevel1Code,ShiptypeLevel1,HullType,SubGroup,SubType
0,203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,,False,False,IZMAIL,,,...,,,,,,,,General Cargo,,,,,,,
1,203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,SMEDEREVO,False,True,SMEDEREVO,,,...,,,,,,,,General Cargo,,,,,,,
2,203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.97121,28.842605,27.901198,IZMAIL,True,True,,,,...,,,,,,,,General Cargo,,,,,,,
3,203999385,0,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,,False,True,IZMAIL,,,...,,,,,,,,General Cargo,,,,,,,
4,203999385,1,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,SMEDEREVO,True,True,SMEDEREVO,,,...,,,,,,,,General Cargo,,,,,,,


In [143]:
departure_counts = pd.DataFrame(
    new_routes_df_enriched[new_routes_df_enriched.reference_week_first_dt].groupby(['origin', 'destination', 'ShipTypeLevel3']).agg(
        {
            'mmsi': 'count',
            'GrossTonnage': 'sum',
            'NetTonnage': 'sum'
        }
    )
).reset_index().rename(
    columns={'mmsi': 'count',
             'origin': 'port'}
)

In [144]:
departure_counts

Unnamed: 0,port,destination,ShipTypeLevel3,count,GrossTonnage,NetTonnage
0,2,GALATI,General Cargo,1,18306.48476,10025.860808
1,416 ANCHOR AREA,NOVOROSSIYSK,General Cargo,1,12974.00000,5334.000000
2,ACCORDING PLAN,ASTRAKHAN,Towing / Pushing,1,0.00000,0.000000
3,AE FJR,RUNVS,Oil,1,60208.00000,33762.000000
4,ALEXANDRI,FOR ORDERS,Bulk Dry,1,21072.00000,11954.000000
...,...,...,...,...,...,...
1626,YARIMCA,GALATI,Liquefied Gas,1,3096.00000,928.000000
1627,YARIMCA/TURKEY,DILISKELESI/TURKEY,General Cargo,1,2970.00000,1895.000000
1628,YEYSK (RUSSIA),TEKIRDAG TURKEY,General Cargo,1,2679.00000,1271.000000
1629,ZONGULDAK,CONSTANTA,General Cargo,1,2980.00000,906.000000


Arrivals

In [145]:
new_routes_df_enriched.head()

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week_first_dt,reference_week_last_dt,next_destination,LRIMOShipNo,StatCode5,...,TotalKilowattsofMainEngines,TotalPowerOfAllEngines,TotalPowerOfAuxiliaryEngines,ShiptypeLevel5_y,Level4Code,ShipTypeLevel4,Level3Code,ShipTypeLevel3,Level2Code,ShipTypeLevel2,ShipTypeLevel1Code,ShiptypeLevel1,HullType,SubGroup,SubType
0,203999383,1,SMEDEREVO,2022-03-18 20:08:10,2022-03-29 16:34:54,43.710167,45.159132,24.894198,27.948658,,False,False,IZMAIL,,,...,,,,,,,,General Cargo,,,,,,,
1,203999383,2,IZMAIL,2022-03-29 16:46:47,2022-04-03 03:51:00,45.178135,45.319658,27.942785,28.842597,SMEDEREVO,False,True,SMEDEREVO,,,...,,,,,,,,General Cargo,,,,,,,
2,203999383,3,SMEDEREVO,2022-04-03 03:58:40,2022-04-04 23:49:04,45.319622,44.97121,28.842605,27.901198,IZMAIL,True,True,,,,...,,,,,,,,General Cargo,,,,,,,
3,203999385,0,SMEDEREVO,2022-03-26 21:22:37,2022-04-03 18:11:09,44.695977,45.163797,20.961785,27.946915,,False,True,IZMAIL,,,...,,,,,,,,General Cargo,,,,,,,
4,203999385,1,IZMAIL,2022-04-03 18:21:49,2022-04-05 17:50:37,45.180163,45.322473,27.943413,28.837393,SMEDEREVO,True,True,SMEDEREVO,,,...,,,,,,,,General Cargo,,,,,,,


In [146]:
arrival_counts = pd.DataFrame(
    new_routes_df_enriched[new_routes_df_enriched.reference_week_last_dt].groupby(['destination', 'origin', 'ShipTypeLevel3']).agg(
        {
            'mmsi': 'count',
            'GrossTonnage': 'sum',
            'NetTonnage': 'sum'
        }
    )
).reset_index().rename(
    columns={'mmsi': 'count',
             'destination': 'port'}
)

Export data

In [147]:
import s3fs
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://minio.lab.sspcloud.fr'}
)

In [148]:
path = "AIS/departure_counts_april_22.csv"
bucket = "projet-hackathon-un-2022"

departure_counts.to_csv(
    fs.open(f'{bucket}/{path}',
            mode='w')
)

In [149]:
path = "AIS/arrival_counts_april_22.csv"

arrival_counts.to_csv(
    fs.open(f'{bucket}/{path}',
            mode='w')
)