# United Nations - Big Data Hackathon

In [30]:
#allow multiple outputs in one jupyter cell
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import geopy.distance

# to apply aggregation functions on spark df
# import pyspark.sql.functions as F

In [14]:
# this cell contains the code to access GitLab repo
# need it to install ais package from GitLab repo
import sys
import subprocess

GITLAB_USER = "read_aistt"  # read only access
GITLAB_TOKEN = "MMQ6ky1rnLsuKxjyZuvB"

# clone the repo and install the ais packag
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/trade-task-team-phase-1/ais.git"

std_out = subprocess.run([sys.executable, "-m", "pip", "install", git_package], capture_output=True, text=True).stdout
print(std_out)

Collecting git+https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-xlapdur_
  Resolved https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to commit 6463bfc66a0d153d4b5c7b5b6f4d148252f7f291
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: ais
  Building wheel for ais (setup.py): started
  Building wheel for ais (setup.py): finished with status 'done'
  Created wheel for ais: filename=ais-2.7.6-py3-none-any.whl size=9235 sha256=1d1d63af498f678c98b8eb4bd22098a97bb10e596f04f3eb94b719351e932c44
  Stored in directory: /tmp/pip-ephem-wheel-cache-iv2wg747/wheels/0e/d0/88/e6935ee881646b5fa5c06aa268b47186f3e4333cf490940e67
Successfully built ais
Installing collected packages: ais
Successfully installed ais-2.7.6



In [29]:
!pip install geopy



## Data extraction from AIS/IHS

**This section consists in PySpark code, to be run on the UNBD platform.**

### Extract AIS data for selected areas and timeranges

In [None]:
# import get_ais() from ais package
from ais import functions as af

In [5]:
def get_full_traces_ais(area, start_date, end_date, to_pandas=True):
    ## Set coordinates of the selected polygons in geojson format
    # https://boundingbox.klokantech.com/
    if area == "azov":
        bb = [[32.4143284746,45.0048840974],[40.0827855058,45.0048840974],[40.0827855058,47.9395951189],[32.4143284746,47.9395951189],[32.4143284746,45.0048840974]]
    elif area == "azov_black":
        bb = [[43.3308500839,39.9913666442],[26.1506878922,41.33737686],[27.1872912828,48.4341912681],[44.3674534746,47.2431326615],[43.3308500839,39.9913666442]]
    elif area == "suez":
        bb = [[33.979966859,27.3216575046],[31.4640977184,27.3314178101],[31.4860703747,30.787762868],[33.9689805309,30.7972004213],[33.979966859,27.3216575046]]

    polygon = {
            "type": "Polygon",
            "coordinates": [bb]
        }

    polygon_hex_df = af.polygon_to_hex_df([("Polygon", polygon)])
    
    ## Filter boats that were at least 1 time in our polygon

    start_date = datetime.fromisoformat(start_date)
    end_date = datetime.fromisoformat(end_date)
    columns = ["mmsi", "latitude", "longitude", "eeid", "dt_insert_utc", "destination"]
    df_ais_polygon = af.get_ais(spark,
                                start_date, 
                                end_date = end_date,
                                columns = columns,
                                polygon_hex_df = polygon_hex_df
                               )
    
    ## Get full traces of boats to get areas of origin

    # Get list of boats in our polygon
    unique_mmsi_polygon = df_ais_polygon.select(F.col("mmsi")).distinct().toPandas()["mmsi"].tolist()

    # Buffers to ensure getting proper origin
    start_date_buffer = start_date + timedelta(days=-15)

    # Get full traces of all boats that were at least once in our polygon
    df_full_traces = af.get_ais(spark,
    start_date_buffer,
    end_date = end_date,
    columns = columns,
    mmsi_list = unique_mmsi_polygon
    )
    
    if to_pandas:
        df_full_traces = df_full_traces.toPandas()
    
    return df_full_traces

In [None]:
df_suez_reference = get_full_traces_ais(area="suez", start_date="2019-04-01", end_date="2019-04-08")
df_suez_choke = get_full_traces_ais(area="suez", start_date="2021-03-21", end_date="2021-04-01")

In [None]:
df_black_azov_reference = get_full_traces_ais(area="azov_black", start_date="2019-04-01", end_date="2019-04-08")
df_black_azov_choke = get_full_traces_ais(area="azov_black", start_date="2022-04-01", end_date="2022-04-08")

### Enrich AIS data with information on ships from IHS data

In [None]:
def enrich_ais(df_ais, to_pandas=True):
    """Enrich AIS data using IHS data."""
    # Load and merge IHS data
    basepath = "s3a://ungp-ais-data-historical-backup/register/"
    df_ship_data_ihs = spark.read.load(basepath+ "ShipData.CSV", format="csv", sep=",", 
                                       inferSchema="true", header="true")
    df_ship_codes_ihs = spark.read.load(basepath + "tblShipTypeCodes.CSV", format="csv", sep=",",
                                        inferSchema="true", header="true")
    if to_pandas:
        df_ship_data_ihs = df_ship_data_ihs.toPandas()
        df_ship_codes_ihs = df_ship_codes_ihs.toPandas()
    df_ship_data_enriched = df_ship_data_ihs.merge(df_ship_codes_ihs, on="StatCode5")
    
    # Enrich AIS data
    df_ais_enriched = df_ais.merge(df_ship_data_enriched, left_on="mmsi", right_on="MaritimeMobileServiceIdentityMMSINumber")
    
    return df_ais_enriched

In [6]:
# Enrich AIS data

df_suez_reference = enrich_ais(df_ais=df_suez_reference)
df_suez_choke = enrich_ais(df_ais=df_suez_choke)

df_black_azov_reference = enrich_ais(df_ais=df_black_azov_reference)
df_black_azov_choke = enrich_ais(df_ais=df_black_azov_choke)

In [None]:
spark.stop()

## Compute traffic data between ports of the selected area

In [41]:
ais_df = df_black_azov_reference
ais_df.shape

(6943856, 6)

In [43]:
start_date="2019-04-01"
end_date="2019-04-08"

### Compute routes using the `destination` variable

In [19]:
ais_df = ais_df.sort_values(by=['mmsi', 'dt_insert_utc'])

In [20]:
destination_mapping = {
    destination: index for destination, index in zip(ais_df.destination.unique(), np.arange(ais_df.destination.nunique()))
}

ais_df['destination_index'] = ais_df['destination'].map(destination_mapping)
ais_df['destination_index'] = ais_df['destination_index'].fillna(-1)
ais_df['destination_index'] = ais_df['destination_index'].astype(int)

In [21]:
def assign_routes_to_group(x):
    return x['destination_index'].diff().ne(0).cumsum()

ais_df['route'] =  ais_df.groupby('mmsi').apply(
    assign_routes_to_group
).reset_index(level=0, drop=True)

In [22]:
routes_df = ais_df.groupby(['mmsi', 'route', 'destination']).agg(
    {
        'dt_insert_utc': ['first', 'last'], 
        'latitude': ['first', 'last'], 
        'longitude': ['first', 'last'],
        'route': 'count'
    }
)

routes_df.columns = routes_df.columns.map('_'.join).str.strip('_')
routes_df = routes_df.reset_index()
routes_df = routes_df.sort_values(by=['mmsi', 'route'])

In [28]:
routes_df.head(5)

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count
0,2060,2,VIARREGIO,2019-03-20 06:19:28,2019-03-20 06:19:28,41.725052,41.725052,41.728248,41.728248,1
1,2060,4,SEATTLE,2019-03-23 10:53:19,2019-03-23 10:53:19,41.72505,41.72505,41.728232,41.728232,1
2,2060,6,VN CMP,2019-03-27 09:58:39,2019-03-27 09:58:39,41.72505,41.72505,41.728243,41.728243,1
3,2060,8,ANDERNACH,2019-03-30 12:28:28,2019-03-30 12:28:28,41.725052,41.725052,41.728248,41.728248,1
4,2078,2,USMSY>DOHAI,2019-03-20 10:37:45,2019-03-20 10:37:45,17.783928,17.783928,-70.517782,-70.517782,1


### Remove unrealistic routes

In [31]:
routes_df['route_time'] = routes_df['dt_insert_utc_last'] - routes_df['dt_insert_utc_first']

routes_df['distance'] = [geopy.distance.geodesic([latitude_first, longitude_first], [latitude_last, longitude_last]).km
                         for (latitude_first, longitude_first, latitude_last, longitude_last) in
                         zip(routes_df['latitude_first'], routes_df['longitude_first'], routes_df['latitude_last'], routes_df['longitude_last'])]

In [36]:
# Lots of invalid routes !
routes_df.head()

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance
13,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-18 08:59:04,43.981573,44.032652,26.121168,26.395267,11,0 days 02:02:19,22.702591
15,2609076,3,CONSTANTA,2019-03-18 09:19:25,2019-03-18 10:53:45,44.040135,44.081035,26.452663,26.711707,12,0 days 01:34:20,21.247413
17,2609076,5,CONSTANTA,2019-03-18 11:15:04,2019-03-22 23:45:55,44.078045,44.120152,26.767333,28.644393,431,4 days 12:30:51,150.370561
18,2609076,6,CORABIA,2019-03-23 00:01:54,2019-03-23 00:59:01,44.120182,44.120142,28.644377,28.644448,5,0 days 00:57:07,0.007257
20,2609076,8,CORABIA,2019-03-23 01:26:02,2019-03-25 14:54:12,44.120112,43.985517,28.644448,26.159403,161,2 days 13:28:10,199.692188


In [35]:
routes_df = routes_df[routes_df['distance'] != 0]
routes_df.head()

Unnamed: 0,mmsi,route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,route_count,route_time,distance
13,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-18 08:59:04,43.981573,44.032652,26.121168,26.395267,11,0 days 02:02:19,22.702591
15,2609076,3,CONSTANTA,2019-03-18 09:19:25,2019-03-18 10:53:45,44.040135,44.081035,26.452663,26.711707,12,0 days 01:34:20,21.247413
17,2609076,5,CONSTANTA,2019-03-18 11:15:04,2019-03-22 23:45:55,44.078045,44.120152,26.767333,28.644393,431,4 days 12:30:51,150.370561
18,2609076,6,CORABIA,2019-03-23 00:01:54,2019-03-23 00:59:01,44.120182,44.120142,28.644377,28.644448,5,0 days 00:57:07,0.007257
20,2609076,8,CORABIA,2019-03-23 01:26:02,2019-03-25 14:54:12,44.120112,43.985517,28.644448,26.159403,161,2 days 13:28:10,199.692188


### Build new routes after removal

In [37]:
routes_df['destination_lag'] = routes_df['destination'].shift(1)

routes_df['new_route'] =  routes_df.groupby('mmsi').apply(
    lambda x: (x['destination'] != x['destination_lag']).cumsum()
).reset_index(level=0, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  routes_df['destination_lag'] = routes_df['destination'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  routes_df['new_route'] =  routes_df.groupby('mmsi').apply(


In [39]:
new_routes_df = routes_df.groupby(['mmsi', 'new_route', 'destination']).agg(
    {
        'dt_insert_utc_first': 'first',
        'dt_insert_utc_last': 'last',
        'latitude_first': 'first',
        'latitude_last': 'last',
        'longitude_first': 'first',
        'longitude_last': 'last',
    }
).reset_index()

new_routes_df['origin'] = new_routes_df.groupby('mmsi')['destination'].shift(1)

new_routes_df.head()

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,"


### Subset data to proper daterange (i.e. remove previous traces)

In [44]:
new_routes_df['reference_week_first_dt'] = (new_routes_df['dt_insert_utc_first'] > start_date) & (new_routes_df['dt_insert_utc_first'] <= end_date)
new_routes_df['reference_week_last_dt'] = (new_routes_df['dt_insert_utc_last'] > start_date) & (new_routes_df['dt_insert_utc_last'] <= end_date)

In [45]:
new_routes_df['next_destination'] = new_routes_df.groupby('mmsi')['destination'].shift(-1)
new_routes_df.head()

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,reference_week_first_dt,reference_week_last_dt,next_destination
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,,False,False,CORABIA
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA,False,False,CONSTANTA
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA,False,True,"HARSOVA ,"
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA,True,True,CONSTANTA
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,",True,False,ZIMNICEA


### Enrich with IHS data

In [46]:
# ship_data_enriched = fc.create_ship_data_enriched()

In [47]:
new_routes_df_enriched = new_routes_df.merge(
    ship_data_enriched,
    left_on='mmsi',
    right_on="MaritimeMobileServiceIdentityMMSINumber",
    how='left'
)
new_routes_df_enriched

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,...,ShipTypeLevel4,Level3Code,ShipTypeLevel3,Level2Code,ShipTypeLevel2,ShipTypeLevel1Code,ShiptypeLevel1,HullType,SubGroup,SubType
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,,...,,,,,,,,,,
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA,...,,,,,,,,,,
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA,...,,,,,,,,,,
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA,...,,,,,,,,,,
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24122,764163140,19,MIDIA>>,2019-03-27 05:21:20,2019-04-01 16:20:46,44.006090,44.324732,26.220532,28.627715,RUSE>>,...,,,,,,,,,,
24123,863227588,1,OPL KAVKAZ,2019-04-01 05:36:27,2019-04-01 08:57:48,45.373048,45.012038,36.662702,36.546737,,...,,,,,,,,,,
24124,926416334,1,NOVI-SAD,2019-03-17 13:43:48,2019-03-26 23:55:10,48.979617,45.223505,12.039143,19.667515,,...,,,,,,,,,,
24125,926416334,2,CONSTANTA,2019-03-27 00:05:14,2019-04-04 17:19:24,45.230195,44.112117,19.697080,28.643443,NOVI-SAD,...,,,,,,,,,,


### Impute missing values

In [48]:
new_routes_df_enriched['ShipTypeLevel3'] = new_routes_df_enriched['ShipTypeLevel3'].fillna(new_routes_df_enriched.ShipTypeLevel3.mode().iloc[0])
new_routes_df_enriched['GrossTonnage'] = new_routes_df_enriched['GrossTonnage'].fillna(new_routes_df_enriched.GrossTonnage.mean())
new_routes_df_enriched['NetTonnage'] = new_routes_df_enriched['NetTonnage'].fillna(new_routes_df_enriched.NetTonnage.mean())

In [49]:
new_routes_df_enriched.head()

Unnamed: 0,mmsi,new_route,destination,dt_insert_utc_first,dt_insert_utc_last,latitude_first,latitude_last,longitude_first,longitude_last,origin,...,ShipTypeLevel4,Level3Code,ShipTypeLevel3,Level2Code,ShipTypeLevel2,ShipTypeLevel1Code,ShiptypeLevel1,HullType,SubGroup,SubType
0,2609076,1,CONSTANTA,2019-03-18 06:56:45,2019-03-22 23:45:55,43.981573,44.120152,26.121168,28.644393,,...,,,General Cargo,,,,,,,
1,2609076,2,CORABIA,2019-03-23 00:01:54,2019-03-25 14:54:12,44.120182,43.985517,28.644377,26.159403,CONSTANTA,...,,,General Cargo,,,,,,,
2,2609076,3,CONSTANTA,2019-03-29 09:51:04,2019-04-02 10:07:06,44.098645,44.119512,26.807175,28.645132,CORABIA,...,,,General Cargo,,,,,,,
3,2609076,4,"HARSOVA ,",2019-04-02 10:17:05,2019-04-05 08:39:44,44.117685,44.261387,28.646928,28.187223,CONSTANTA,...,,,General Cargo,,,,,,,
4,2609076,5,CONSTANTA,2019-04-05 08:50:45,2019-04-08 07:58:31,44.259117,44.120847,28.209845,28.644707,"HARSOVA ,",...,,,General Cargo,,,,,,,


### Count departures and arrivals to ports in our polygon

In [50]:
departure_counts = pd.DataFrame(
    new_routes_df_enriched[new_routes_df_enriched.reference_week_first_dt].groupby(['origin', 'destination', 'ShipTypeLevel3']).agg(
        {
            'mmsi': 'count',
            'GrossTonnage': 'sum',
            'NetTonnage': 'sum'
        }
    )
).reset_index().rename(
    columns={'mmsi': 'count',
             'origin': 'port'}
)

In [51]:
arrival_counts = pd.DataFrame(
    new_routes_df_enriched[new_routes_df_enriched.reference_week_last_dt].groupby(['destination', 'origin', 'ShipTypeLevel3']).agg(
        {
            'mmsi': 'count',
            'GrossTonnage': 'sum',
            'NetTonnage': 'sum'
        }
    )
).reset_index().rename(
    columns={'mmsi': 'count',
             'destination': 'port'}
)