# Choose a Kernel
- Please wait 4-5 minutes for the kernel to initialize properly
- Keep checking kernel status at the bottom (changes from Initializing to Idle state)
- Rename notebook and start coding ...

# Reading AIS Data
- write your own code to access AIS data (this may be demonstrated in the workshop), or
- Import AIS package from GitLab (recommended) 
    - get_ais()
    - access GitLab using a username and token (shown below)

In [1]:
!pip install pyarrow==10.0.0

Collecting pyarrow==10.0.0
  Downloading pyarrow-10.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[K     |████████████████████████████████| 35.3 MB 48.3 MB/s eta 0:00:01
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 3.0.0
    Uninstalling pyarrow-3.0.0:
      Successfully uninstalled pyarrow-3.0.0
Successfully installed pyarrow-10.0.0
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# Allow multiple outputs in one Jupyter cell: with "all", every top-level
# expression statement in a cell is displayed, not just the last one.
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
from datetime import datetime
# to apply aggregation functions on spark df
import pyspark.sql.functions as F
from pyarrow import fs
import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
# Install the `ais` package from the project's GitLab repository.
# pip needs a username and an access token to clone the repository.
import getpass
import os
import subprocess
import sys

GITLAB_USER = "read_aistt"  # read only access

# The token is deliberately NOT hardcoded: a literal token ends up in version
# control and in every shared copy of the notebook. It is read from the
# GITLAB_TOKEN environment variable, falling back to a hidden prompt.
# NOTE(review): the token that was previously committed in this cell should be rotated.
GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN") or getpass.getpass("GITLAB_TOKEN")
if not GITLAB_TOKEN:
    raise ValueError("A GitLab access token is required to install the ais package")

# clone the repo and install the ais package
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/trade-task-team-phase-1/ais.git"

# check=True is avoided on purpose: CalledProcessError would embed the full
# command line (including the token) in the traceback stored in the notebook.
pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", git_package],
    capture_output=True,
    text=True,
)

# Redact the token defensively before anything is written to the cell output.
std_out = pip_result.stdout.replace(GITLAB_TOKEN, "****")
print(std_out)

if pip_result.returncode != 0:
    # Surface pip's error output; previously stderr was discarded and a failed
    # install only showed up later as an ImportError.
    print(pip_result.stderr.replace(GITLAB_TOKEN, "****"), file=sys.stderr)
    raise RuntimeError(
        f"pip install of the ais package failed (exit code {pip_result.returncode})"
    )

Collecting git+https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-ztr7ozg1
Building wheels for collected packages: ais
  Building wheel for ais (setup.py): started
  Building wheel for ais (setup.py): finished with status 'done'
  Created wheel for ais: filename=ais-2.7.6-py3-none-any.whl size=9267 sha256=0a4a0e032c4d56337140c3f045a389c4cc4d960e45c30b2cf56e766b9de4bfa6
  Stored in directory: /tmp/pip-ephem-wheel-cache-mtrkq0dj/wheels/49/e0/a2/25d96a62cf626776ab2fd57fcbd822c2b8118049a84b16953d
Successfully built ais
Installing collected packages: ais
Successfully installed ais-2.7.6



In [4]:
import getpass

# Fixed settings: the S3-compatible endpoint and the bucket the exports go to.
AWS_S3_ENDPOINT = "minio.lab.sspcloud.fr"
BUCKET_OUT = "projet-hackathon-un-2022"

# Ask for the three credential values; getpass hides what is typed.
AWS_ACCESS_KEY_ID = getpass.getpass("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = getpass.getpass("AWS_SECRET_ACCESS_KEY")
AWS_SESSION_TOKEN = getpass.getpass("AWS_SESSION_TOKEN")

# pyarrow filesystem handle that the parquet exports pass as `filesystem=`.
s3_options = {
    "endpoint_override": AWS_S3_ENDPOINT,
    "access_key": AWS_ACCESS_KEY_ID,
    "secret_key": AWS_SECRET_ACCESS_KEY,
    "session_token": AWS_SESSION_TOKEN,
}
s3 = fs.S3FileSystem(**s3_options)

AWS_ACCESS_KEY_ID ····················
AWS_SECRET_ACCESS_KEY ········································
AWS_SESSION_TOKEN ················································································································································································································································································································································································································································································································································································································································································································································································································································································································

# Using get_ais()
- retrieve data for a single date
- filter data on date and specific columns
- filter data for a range of dates
- filter data based on mmsi (unique ship identifier)
- filter data based on a geographical polygon

In [5]:
# import get_ais() from ais package
from ais import functions as af

### Example 5: Filter data based on geolocation polygon



In [None]:
# Find every ship that was inside the selected polygon during the query window,
# pull the traces of those ships over a slightly wider (buffered) window, and
# export the result to S3 as parquet.
from datetime import timedelta

# Set coordinates of the selected polygons in geojson format
# (closed rings: the first point is repeated as the last point).
# https://boundingbox.klokantech.com/
AREA = "azov_black"

AREA_POLYGONS = {
    "azov": [[32.4143284746,45.0048840974],[40.0827855058,45.0048840974],[40.0827855058,47.9395951189],[32.4143284746,47.9395951189],[32.4143284746,45.0048840974]],
    "azov_black": [[43.3308500839,39.9913666442],[26.1506878922,41.33737686],[27.1872912828,48.4341912681],[44.3674534746,47.2431326615],[43.3308500839,39.9913666442]],
}

# Fail with a clear message instead of a NameError on `bb` for an unknown area.
if AREA not in AREA_POLYGONS:
    raise ValueError(f"Unknown AREA {AREA!r}; expected one of {sorted(AREA_POLYGONS)}")
bb = AREA_POLYGONS[AREA]

polygon = {
    "type": "Polygon",
    "coordinates": [bb],
}

polygon_hex_df = af.polygon_to_hex_df([("Polygon", polygon)])

# Filter boats that were at least 1 time in our polygon

start_date = datetime.fromisoformat("2022-04-01")
end_date = datetime.fromisoformat("2022-04-08")
columns = ["mmsi", "latitude", "longitude", "eeid", "dt_insert_utc", "destination"]

# pass polygon_hex_df to get_ais()
df_ais_polygon = af.get_ais(
    spark,
    start_date,
    end_date=end_date,
    columns=columns,
    polygon_hex_df=polygon_hex_df,
)

# Optional sanity checks:
# df_ais_polygon.count()
# df_ais_polygon.show(n=5)

# Get full traces of boats to get areas of origin and destination

# Get list of boats in our polygon
unique_mmsi_polygon = df_ais_polygon.select(F.col("mmsi")).distinct().toPandas()["mmsi"].tolist()

# Buffers to ensure getting origin and destination.
# They are derived from the query window so they always bracket it; the
# previous hardcoded values pointed at 2019 while the query window is in 2022.
BUFFER_DAYS_BEFORE = 7
BUFFER_DAYS_AFTER = 6
start_date_buffer = start_date - timedelta(days=BUFFER_DAYS_BEFORE)  # 2022-03-25
end_date_buffer = end_date + timedelta(days=BUFFER_DAYS_AFTER)  # 2022-04-14

# Get full traces of all boats that were at least once in our polygon
df_full_traces = af.get_ais(
    spark,
    start_date_buffer,
    end_date=end_date_buffer,
    columns=columns,
    mmsi_list=unique_mmsi_polygon,
)

# Export data to S3
# The file name carries the polygon query window (not the buffered window).
start_date_str = start_date.strftime("%Y%m%d")
end_date_str = end_date.strftime("%Y%m%d")

# toPandas() collects the whole result onto the driver before it is written.
table = pa.Table.from_pandas(df_full_traces.toPandas())
pq.write_table(
    table,
    f"{BUCKET_OUT}/AIS/ais_{AREA}_{start_date_str}_{end_date_str}_full_traces.parquet",
    filesystem=s3,
)

# Accessing IHS Data 
- ship registry data in s3
- includes details about each ship at a very granular level

## Ship Data

In [23]:
# Common S3 prefix of the register CSV files.
basepath = "s3a://ungp-ais-data-historical-backup/register/"

# First file: ShipData.CSV, comma-separated, with a header row;
# column types are inferred by Spark.
ship_data_csv_options = {"sep": ",", "inferSchema": "true", "header": "true"}
df_ship_data = spark.read.load(
    f"{basepath}ShipData.CSV",
    format="csv",
    **ship_data_csv_options,
)

In [24]:
# df_ship_data.printSchema()

In [25]:
# Convert the Spark frame to an Arrow table via pandas and write it as
# parquet into the IHS folder of the output bucket.
ship_data_target = f"{BUCKET_OUT}/IHS/ship_data.parquet"
table = pa.Table.from_pandas(df_ship_data.toPandas())
pq.write_table(table, ship_data_target, filesystem=s3)

## Ship Codes

In [26]:
# second file read ship codes
df_ship_code = spark.read.load(basepath + "tblShipTypeCodes.CSV", 
                     format="csv", sep=",", inferSchema="true", header="true")

In [27]:
# df_ship_code.printSchema()

In [28]:
# Convert the ship codes frame to an Arrow table via pandas and write it as
# parquet into the IHS folder of the output bucket.
ship_codes_target = f"{BUCKET_OUT}/IHS/ship_codes.parquet"
table = pa.Table.from_pandas(df_ship_code.toPandas())
pq.write_table(table, ship_codes_target, filesystem=s3)

In [23]:
# Stop Spark once all reads and exports above have finished.
# NOTE(review): `spark` is never created in this notebook; presumably the kernel provides it. Confirm.
spark.stop()