# Choose a Kernel
- Please wait 4-5 minutes for the kernel to initialize properly
- Keep checking kernel status at the bottom (changes from Initializing to Idle state)
- Rename notebook and start coding ...

# Reading AIS Data
- write your own code to access AIS data (this may be demonstrated in the workshop), or
- Import AIS package from GitLab (recommended) 
    - get_ais()
    - access GitLab using a username and access token (see the install cell below)

In [1]:
!pip install pyarrow==10.0.0 s3fs

Collecting pyarrow==10.0.0
  Downloading pyarrow-10.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[K     |████████████████████████████████| 35.3 MB 18.6 MB/s eta 0:00:01
[?25hCollecting s3fs
  Downloading s3fs-2022.10.0-py3-none-any.whl (27 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1
  Downloading aiohttp-3.8.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 71.2 MB/s eta 0:00:01
[?25hCollecting fsspec==2022.10.0
  Downloading fsspec-2022.10.0-py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 76.6 MB/s eta 0:00:01
[?25hCollecting aiobotocore~=2.4.0
  Downloading aiobotocore-2.4.0-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 59.9 MB/s eta 0:00:01
[?25hCollecting wrapt>=1.10.10
  Downloading wrapt-1.14.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (81 kB)
[K     |████████████████████████

In [2]:
# Allow multiple outputs in one Jupyter cell: with "all", every bare expression
# statement in a cell is displayed, not only the last one. Later cells rely on
# this to show e.g. both `.shape` and `.head()` from a single cell.
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
from datetime import datetime
# to apply aggregation functions on spark df
import pyspark.sql.functions as F
from pyarrow import fs
import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
# This cell installs the `ais` package directly from the GitLab repo.
# pip needs a username and an access token embedded in the repo URL.
import getpass
import os
import subprocess
import sys

GITLAB_USER = "read_aistt"  # read only access
# Never store the token in the notebook itself: take it from the GITLAB_TOKEN
# environment variable when set, otherwise prompt for it (input is not echoed).
GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN") or getpass.getpass("GITLAB_TOKEN")

# clone the repo and install the ais package
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/trade-task-team-phase-1/ais.git"

pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", git_package],
    capture_output=True,
    text=True,
)
std_out = pip_result.stdout
print(std_out)

# pip reports failures on stderr and through its exit code; surface both so a
# bad token or network problem is visible here rather than at `import ais`.
if pip_result.returncode != 0:
    print(pip_result.stderr, file=sys.stderr)
    raise RuntimeError(f"pip install of the ais package failed (exit code {pip_result.returncode})")

Collecting git+https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-dhxote0v
Building wheels for collected packages: ais
  Building wheel for ais (setup.py): started
  Building wheel for ais (setup.py): finished with status 'done'
  Created wheel for ais: filename=ais-2.7.6-py3-none-any.whl size=9267 sha256=9a450da562a289146a720fc0e4ab4abcdfaf86ae63e929115c3e0579c5b47422
  Stored in directory: /tmp/pip-ephem-wheel-cache-dgz0_i43/wheels/49/e0/a2/25d96a62cf626776ab2fd57fcbd822c2b8118049a84b16953d
Successfully built ais
Installing collected packages: ais
Successfully installed ais-2.7.6



In [4]:
import getpass

# Prompt for the three S3 credentials in turn; getpass does not echo the typed
# values, so they never end up in the saved notebook output.
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN = [
    getpass.getpass(credential_name)
    for credential_name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN")
]

# Host name of the S3-compatible object store (no scheme)
AWS_S3_ENDPOINT = "minio.lab.sspcloud.fr"

AWS_ACCESS_KEY_ID ····················
AWS_SECRET_ACCESS_KEY ········································
AWS_SESSION_TOKEN ················································································································································································································································································································································································································································································································································································································································································································································································································································································································

# Using get_ais()
- retrieve data for a single date
- filter data on date and specific columns
- filter data for a range of dates
- filter data based on mmsi (unique ship identifier)
- filter data based on a geographical polygon

In [5]:
# import get_ais() from ais package
from ais import functions as af

### Example 5: Filter data based on geolocation polygon



In [6]:
# Show the signature and docstring of polygon_to_hex_df(). Call this function
# first, then pass its output to get_ais() through the polygon_hex_df argument.
af.polygon_to_hex_df?

[0;31mSignature:[0m
[0maf[0m[0;34m.[0m[0mpolygon_to_hex_df[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpolygons[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mTuple[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mDict[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhex_resolution[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moverfill[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
A wrapper for h3.polyfill that returns integer hex ids for multiple polygons.

Parameters
----------
polygons: list of tuples
    the first element in this tuple is expected to be a (name) string
    identifier for the polygon and the second element is the polygon itself (see example above)
    
hex_resolution: int, default 8
    the resolution of the hex

![alternative text](https://drive.google.com/uc?export=view&id=1PxMJuKiC5Wi1a7WH0yCWXpG9sajLPdXa)


In [7]:

# Build the polygon passed as the 2nd tuple element to polygon_to_hex_df().
# Coordinates can be drawn/exported with https://boundingbox.klokantech.com/
# and are given in GeoJSON format: a closed ring (first point == last point)
# of [longitude, latitude] pairs.

AREA = "azov_black"

# Known areas of interest, keyed by the label used in AREA (and in file names).
BOUNDING_BOXES = {
    "azov": [[32.4143284746,45.0048840974],[40.0827855058,45.0048840974],[40.0827855058,47.9395951189],[32.4143284746,47.9395951189],[32.4143284746,45.0048840974]],
    "azov_black": [[43.3308500839,39.9913666442],[26.1506878922,41.33737686],[27.1872912828,48.4341912681],[44.3674534746,47.2431326615],[43.3308500839,39.9913666442]],
}

# Fail with a clear message instead of a NameError on `bb` further down
# when AREA is mistyped or a new area has not been registered yet.
if AREA not in BOUNDING_BOXES:
    raise ValueError(f"Unknown AREA {AREA!r}; expected one of {sorted(BOUNDING_BOXES)}")

bb = BOUNDING_BOXES[AREA]

polygon = {
        "type": "Polygon",
        "coordinates": [bb]
    }

In [8]:
# Each entry is a (label, polygon) tuple: the label names the polygon and the
# second element is the GeoJSON-style polygon dict itself.
polygon_hex_df = af.polygon_to_hex_df(
    polygons=[("Polygon", polygon)],
)

In [9]:
# Query window and the columns to retrieve
start_date = datetime(2022, 1, 1)
end_date = datetime(2022, 1, 7)
columns = [
    "mmsi",
    "latitude",
    "longitude",
    "eeid",
    "dt_insert_utc",
    "destination",
]

# Restrict the result to the polygon by passing polygon_hex_df to get_ais()
df = af.get_ais(spark, start_date, end_date=end_date, columns=columns, polygon_hex_df=polygon_hex_df)

df.count()

1543295

In [11]:
# Preview the first AIS messages returned by get_ais() for the selected polygon (AREA)
df.show(n=5)

+--------------+-----------+---------+-----------+-------------------+------------------+-----------+-------------------+------------+
|hex_resolution|  longitude|     mmsi|destination|               eeid|    H3_int_index_8|   latitude|      dt_insert_utc|polygon_name|
+--------------+-----------+---------+-----------+-------------------+------------------+-----------+-------------------+------------+
|             8|31.40833333|272157700|   MYKOLAIV|5191743282127358980|613021963599740927|47.52666667|2022-01-01 05:58:49|     Polygon|
|             8|31.33166667|272157700|   MYKOLAIV|5191743282127358980|613021966030340095|     47.535|2022-01-01 18:25:52|     Polygon|
|             8|31.33166667|272157700|   MYKOLAIV|5191743282127358980|613021966030340095|     47.535|2022-01-01 21:14:30|     Polygon|
|             8|31.33333333|272157700|   MYKOLAIV|5191743282127358980|613021966030340095|     47.535|2022-01-01 07:46:57|     Polygon|
|             8|31.33333333|272157700|   MYKOLAIV|51917

In [13]:
# Save the filtered AIS extract to the project bucket as one Parquet file whose
# name encodes the area and the date range.
# NOTE: %m is the zero-padded month. The previous %M is the *minute*, which
# rendered 2022-01-01 as "20220001" in the file name.
start_date_str = start_date.strftime("%Y%m%d")
end_date_str = end_date.strftime("%Y%m%d")

# Refer to the pyarrow filesystem module as pa.fs rather than the bare name
# `fs`: a later cell rebinds `fs` to an s3fs filesystem object, after which
# re-running this cell with `fs.S3FileSystem` would fail.
s3 = pa.fs.S3FileSystem(endpoint_override=AWS_S3_ENDPOINT,
                        access_key=AWS_ACCESS_KEY_ID,
                        secret_key=AWS_SECRET_ACCESS_KEY,
                        session_token=AWS_SESSION_TOKEN)

# Collect the Spark DataFrame to the driver and convert it to an Arrow table
table = pa.Table.from_pandas(df.toPandas())
pq.write_table(table, f"projet-hackathon-un-2022/AIS/ais_{AREA}_{start_date_str}_{end_date_str}.parquet", filesystem=s3)

# Accessing IHS Data 
- ship registry data in s3
- includes very granular details about each ship

In [21]:
import s3fs

# endpoint_url must be a full URL including the scheme, while AWS_S3_ENDPOINT
# holds only the host name; add https:// unless a scheme is already present.
s3_endpoint_url = AWS_S3_ENDPOINT if "://" in AWS_S3_ENDPOINT else f"https://{AWS_S3_ENDPOINT}"

# NOTE: this rebinds `fs` (previously the pyarrow.fs module) to an s3fs
# filesystem object; the cells below use it through fs.open(...).
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': s3_endpoint_url},
                       key=AWS_ACCESS_KEY_ID,
                       secret=AWS_SECRET_ACCESS_KEY,  # the missing comma here was a SyntaxError
                       token=AWS_SESSION_TOKEN)

# Bucket that receives the exported files
BUCKET_OUT = "projet-hackathon-un-2022"

In [9]:
# Folder holding the ship-register CSV files
basepath = "s3a://ungp-ais-data-historical-backup/register/"

# first file: comma-separated, with a header row; column types are inferred
df_ship_data = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load(basepath + "ShipData.CSV")
)

In [24]:
# df_ship_data.printSchema()

In [10]:
# Reduce the Spark frame to the columns of interest and collect it into pandas
print('Loading ShipData.CSV (few cols) .....')
ship_data_columns = [
    "StatCode5",
    "MaritimeMobileServiceIdentityMMSINumber",
    "ShipStatusEffectiveDate",
    "ShiptypeLevel5",
    "LRIMOShipNo",
    "FuelConsumptionTotal",
    "GrossTonnage",
    "NetTonnage",
]
ship_data = df_ship_data.select(*ship_data_columns).toPandas()

# Quick look at the result (both expressions are displayed)
ship_data.shape
ship_data.head()

# Destination object inside the output bucket
FILE_KEY_OUT_S3 = "IHS/ship_data.csv"
FILE_PATH_OUT_S3 = f"{BUCKET_OUT}/{FILE_KEY_OUT_S3}"

# Stream the frame to S3 as CSV, without the pandas index column
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    ship_data.to_csv(file_out, index=False)

Loading ShipData.CSV (few cols) .....


(246724, 8)

Unnamed: 0,StatCode5,MaritimeMobileServiceIdentityMMSINumber,ShipStatusEffectiveDate,ShiptypeLevel5,LRIMOShipNo,FuelConsumptionTotal,GrossTonnage,NetTonnage
0,X11A2YP,,19610000,Yacht,1000019,0.0,551,165
1,X11A2YP,,19951000,Yacht,1000021,0.0,1980,588
2,X11A2YP,234028000.0,19950512,Yacht,1000033,0.0,178,53
3,X11A2YP,239488000.0,19950429,Yacht,1000045,0.0,264,79
4,X11A2YP,,20220601,Yacht,1000057,0.0,234,70


In [None]:
# second file: read the ship type codes
# Announce the load before it starts (as the ShipData cell does), so the
# message describes work in progress rather than work already finished.
print('Loading tblShipTypeCodes.csv (few cols) ....')

df_ship_code = spark.read.load(basepath + "tblShipTypeCodes.CSV", 
                     format="csv", sep=",", inferSchema="true", header="true")

df_ship_code.printSchema()

# select only relevant cols from spark df and collect them into pandas
ship_code = df_ship_code.select("StatCode5", "ShipTypeLevel1", "ShipTypeLevel2", "ShipTypeLevel3", "ShipTypeLevel4", "ShipTypeLevel5", 
                                "SubGroup", "SubType").toPandas()

# Quick look at the result (both expressions are displayed)
ship_code.shape
ship_code.head()

In [23]:
# Destination object for the ship type codes inside the output bucket
FILE_KEY_OUT_S3 = "IHS/ship_codes.csv"
FILE_PATH_OUT_S3 = f"{BUCKET_OUT}/{FILE_KEY_OUT_S3}"

# Write the pandas frame to S3 as CSV, without the index column
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    ship_code.to_csv(file_out, index=False)