# United Nations - Big Data Hackathon

In [None]:
# Allow multiple outputs in one Jupyter cell: with "all", the value of every
# top-level expression in a cell is displayed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
from datetime import datetime, timedelta
# to apply aggregation functions on spark df
import pyspark.sql.functions as F

In [None]:
# This cell installs the `ais` package straight from its GitLab repository.
#
# Credentials are NOT stored in the notebook: they are read from the
# GITLAB_USER / GITLAB_TOKEN environment variables, and the token is prompted
# for when the variable is not set.
# NOTE(review): a read-only token used to be committed in this cell; it should
# be revoked/rotated on the GitLab side.
import getpass
import os
import subprocess
import sys

GITLAB_USER = os.environ.get("GITLAB_USER", "read_aistt")  # read only access
GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN") or getpass.getpass("GitLab access token: ")
if not GITLAB_TOKEN:
    raise RuntimeError("No GitLab token provided: set GITLAB_TOKEN or enter it at the prompt.")

# pip clones the repo over HTTPS and installs the ais package
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/trade-task-team-phase-1/ais.git"

pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", git_package],
    capture_output=True,
    text=True,
)
std_out = pip_result.stdout
print(std_out)

if pip_result.returncode != 0:
    # Surface the failure instead of silently discarding stderr. The token is
    # masked in case pip echoes the install URL. We do not raise here so that a
    # kernel where `ais` is already installed can keep going.
    print(
        f"pip install of the ais package failed (exit code {pip_result.returncode}):",
        file=sys.stderr,
    )
    print(pip_result.stderr.replace(GITLAB_TOKEN, "***"), file=sys.stderr)

## Data extraction from AIS/IHS

**This section consists of PySpark code, to be run on the UNBD platform.**

### Extract AIS data for selected areas and time ranges

In [None]:
# import get_ais() from ais package
from ais import functions as af

In [5]:
# Closed polygon rings in GeoJSON coordinate format, one per supported area
# (drawn with https://boundingbox.klokantech.com/).
_AREA_BOUNDING_BOXES = {
    "azov": [[32.4143284746,45.0048840974],[40.0827855058,45.0048840974],[40.0827855058,47.9395951189],[32.4143284746,47.9395951189],[32.4143284746,45.0048840974]],
    "azov_black": [[43.3308500839,39.9913666442],[26.1506878922,41.33737686],[27.1872912828,48.4341912681],[44.3674534746,47.2431326615],[43.3308500839,39.9913666442]],
    "suez": [[33.979966859,27.3216575046],[31.4640977184,27.3314178101],[31.4860703747,30.787762868],[33.9689805309,30.7972004213],[33.979966859,27.3216575046]],
}


def get_full_traces_ais(area, start_date, end_date, to_pandas=True, buffer_days=15):
    """Return the AIS traces of every ship seen in an area during a time window.

    The function works in two passes: it first queries AIS messages inside the
    area's polygon between ``start_date`` and ``end_date`` to collect the
    distinct ``mmsi`` values, then queries again for those ``mmsi`` values
    without the polygon filter, starting ``buffer_days`` earlier, so that the
    positions preceding the entry into the area are included.

    Relies on the notebook-level names ``spark``, ``af`` (``ais.functions``)
    and ``F`` (``pyspark.sql.functions``).

    Args:
        area: One of ``"azov"``, ``"azov_black"`` or ``"suez"``.
        start_date: Window start as an ISO-format string
            (parsed with ``datetime.fromisoformat``).
        end_date: Window end as an ISO-format string.
        to_pandas: When true (default) the result is converted with
            ``toPandas()``; otherwise the object returned by ``af.get_ais``
            is returned as is.
        buffer_days: Number of days before ``start_date`` from which the full
            traces are fetched (default 15).

    Returns:
        The full traces restricted to the columns ``mmsi``, ``latitude``,
        ``longitude``, ``eeid``, ``dt_insert_utc`` and ``destination``.

    Raises:
        ValueError: If ``area`` is not one of the supported areas.
    """
    try:
        bb = _AREA_BOUNDING_BOXES[area]
    except KeyError:
        # Previously an unknown area fell through the if/elif chain and failed
        # later with an UnboundLocalError on `bb`.
        raise ValueError(
            f"Unknown area {area!r}; expected one of {sorted(_AREA_BOUNDING_BOXES)}"
        ) from None

    polygon = {
            "type": "Polygon",
            "coordinates": [bb]
        }

    polygon_hex_df = af.polygon_to_hex_df([("Polygon", polygon)])

    ## Filter boats that were at least 1 time in our polygon

    start_date = datetime.fromisoformat(start_date)
    end_date = datetime.fromisoformat(end_date)
    columns = ["mmsi", "latitude", "longitude", "eeid", "dt_insert_utc", "destination"]
    df_ais_polygon = af.get_ais(spark,
                                start_date,
                                end_date = end_date,
                                columns = columns,
                                polygon_hex_df = polygon_hex_df
                               )

    ## Get full traces of boats to get areas of origin

    # Get list of boats in our polygon (collected on the driver)
    # NOTE(review): if no ship was seen in the polygon this list is empty; how
    # af.get_ais treats an empty mmsi_list is not visible here - confirm.
    unique_mmsi_polygon = df_ais_polygon.select(F.col("mmsi")).distinct().toPandas()["mmsi"].tolist()

    # Look back before the window start to ensure getting proper origin
    start_date_buffer = start_date - timedelta(days=buffer_days)

    # Get full traces of all boats that were at least once in our polygon
    df_full_traces = af.get_ais(spark,
                                start_date_buffer,
                                end_date = end_date,
                                columns = columns,
                                mmsi_list = unique_mmsi_polygon
                               )

    if to_pandas:
        df_full_traces = df_full_traces.toPandas()

    return df_full_traces

In [None]:
# Suez area: a reference window (first week of April 2019) and the "choke"
# window (21 March - 1 April 2021), both returned as pandas DataFrames.
df_suez_reference = get_full_traces_ais(
    area="suez",
    start_date="2019-04-01",
    end_date="2019-04-08",
)
df_suez_choke = get_full_traces_ais(
    area="suez",
    start_date="2021-03-21",
    end_date="2021-04-01",
)

In [None]:
# Black Sea / Sea of Azov area: the same reference week of April 2019 and the
# "choke" window one week into April 2022, both as pandas DataFrames.
df_black_azov_reference = get_full_traces_ais(
    area="azov_black",
    start_date="2019-04-01",
    end_date="2019-04-08",
)
df_black_azov_choke = get_full_traces_ais(
    area="azov_black",
    start_date="2022-04-01",
    end_date="2022-04-08",
)

### Enrich AIS data with information on ships from IHS data

In [None]:
def enrich_ais(df_ais, to_pandas=True):
    """Enrich AIS data using IHS data.

    Loads the IHS ship register (``ShipData.CSV``) and the ship type code
    table (``tblShipTypeCodes.CSV``) from S3 through the notebook-level
    ``spark`` session, joins them on ``StatCode5``, and then joins the result
    to ``df_ais`` on ``mmsi`` == ``MaritimeMobileServiceIdentityMMSINumber``.
    Both joins are inner joins, so AIS rows without a matching IHS record are
    dropped.

    Args:
        df_ais: AIS messages with an ``mmsi`` column. Must be a pandas
            DataFrame when ``to_pandas`` is true and a Spark DataFrame when it
            is false.
        to_pandas: When true (default) the IHS tables are converted with
            ``toPandas()`` and merged with pandas. When false everything stays
            in Spark and ``DataFrame.join`` is used (the previous code called
            ``.merge`` on Spark DataFrames here, which does not exist).

    Returns:
        The enriched AIS data, of the same kind (pandas or Spark) as selected
        by ``to_pandas``.
    """
    # Load IHS data
    basepath = "s3a://ungp-ais-data-historical-backup/register/"
    csv_options = dict(format="csv", sep=",", inferSchema="true", header="true")
    df_ship_data_ihs = spark.read.load(basepath + "ShipData.CSV", **csv_options)
    df_ship_codes_ihs = spark.read.load(basepath + "tblShipTypeCodes.CSV", **csv_options)

    if not to_pandas:
        # Spark path: same inner joins, expressed with DataFrame.join.
        # NOTE(review): unlike pandas, Spark does not suffix duplicate column
        # names; confirm the AIS and IHS tables share no other column name.
        df_ship_data_enriched = df_ship_data_ihs.join(df_ship_codes_ihs, on="StatCode5", how="inner")
        return df_ais.join(
            df_ship_data_enriched,
            on=df_ais["mmsi"] == df_ship_data_enriched["MaritimeMobileServiceIdentityMMSINumber"],
            how="inner",
        )

    # pandas path: merge the register with its type codes, then enrich AIS data
    df_ship_data_enriched = df_ship_data_ihs.toPandas().merge(df_ship_codes_ihs.toPandas(), on="StatCode5")
    df_ais_enriched = df_ais.merge(df_ship_data_enriched, left_on="mmsi", right_on="MaritimeMobileServiceIdentityMMSINumber")

    return df_ais_enriched

In [6]:
# Attach the IHS ship-register attributes to each extracted trace set.
# NOTE: every name is rebound to its enriched version, so this cell is meant to
# run once after the extraction cells; re-running it would feed already
# enriched frames back into enrich_ais.

# Suez
df_suez_reference = enrich_ais(df_suez_reference)
df_suez_choke = enrich_ais(df_suez_choke)

# Black Sea / Sea of Azov
df_black_azov_reference = enrich_ais(df_black_azov_reference)
df_black_azov_choke = enrich_ais(df_black_azov_choke)

In [None]:
# Shut down the Spark session once the extraction/enrichment cells above have run.
# NOTE(review): any later cell that still needs `spark` must run before this
# one - confirm nothing below depends on it.
spark.stop()

## Data extraction from AIS/IHS