In [1]:
# Allow multiple outputs in one Jupyter cell: with "all", every top-level
# expression of a cell is displayed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
from datetime import datetime
# to apply aggregation functions on spark df
import pyspark.sql.functions as F
from pyarrow import fs
import pyarrow as pa
import pyarrow.parquet as pq

# Prepare env for reading AIS data

In [2]:
# This cell installs the `ais` package straight from its GitLab repository.
# The access token is NOT stored in the notebook: it is read from the
# GITLAB_TOKEN environment variable, or asked for interactively when unset.
# (A token that has been committed in a notebook should be revoked and rotated.)
import getpass
import os
import subprocess
import sys

GITLAB_USER = "read_aistt"  # read only access
GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN") or getpass.getpass("GitLab read token: ")

# pip requirement that clones the repo and installs the ais package
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/trade-task-team-phase-1/ais.git"

completed = subprocess.run(
    [sys.executable, "-m", "pip", "install", git_package],
    capture_output=True,
    text=True,
)
std_out = completed.stdout
print(std_out)

# stdout alone does not show why an install failed, so surface stderr and stop
# here instead of failing later on `from ais import ...`.
if completed.returncode != 0:
    print(completed.stderr, file=sys.stderr)
    raise RuntimeError(
        f"pip install of the ais package failed with exit code {completed.returncode}"
    )

Collecting git+https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-8hak41em
Building wheels for collected packages: ais
  Building wheel for ais (setup.py): started
  Building wheel for ais (setup.py): finished with status 'done'
  Created wheel for ais: filename=ais-2.7.6-py3-none-any.whl size=9267 sha256=1c1d60d8465a072d6bfd803f89f1b1444b719ae6e7e0021d3f103ba57f489593
  Stored in directory: /tmp/pip-ephem-wheel-cache-utwu3k21/wheels/49/e0/a2/25d96a62cf626776ab2fd57fcbd822c2b8118049a84b16953d
Successfully built ais
Installing collected packages: ais
Successfully installed ais-2.7.6



In [5]:
# import get_ais() from ais package
from ais import functions as af

# Read IHS data

In [None]:
# Folder that holds the register files
basepath = "s3a://ungp-ais-data-historical-backup/register/"

# Load the first file (ShipData.CSV) as CSV: the header row gives the column
# names and the column types are inferred.
df_ihs = spark.read.load(
    basepath + "ShipData.CSV",
    format="csv",
    sep=",",
    inferSchema="true",
    header="true",
)
df_ihs.show(1)

# scenario 1

We want to know what percentage of all the ships in the data set were seen on a given day.
- get the number of distinct ships in the whole data set
- get the number of distinct ships seen on the given day
- compute the percentage

In [None]:
# AIS data set used as the reference: 2022-01-01 up to 2022-08-30
start_date = datetime.fromisoformat("2022-01-01")
end_date = datetime.fromisoformat("2022-08-30")
columns = ["mmsi", "dt_insert_utc"]

# No polygon_hex_df is passed here (unlike the region queries further down)
df_full_ais_2022 = af.get_ais(
    spark,
    start_date,
    end_date=end_date,
    columns=columns,
)

df_full_ais_2022.count()

In [54]:
def get_ship_nb_percentage_of_day(df_all, target_date):
    """Return the share (in percent) of distinct ships seen on one day.

    Args:
        df_all: Spark data frame, expected to be the result of ``af.get_ais()``
            with some filter condition; it is read through its ``mmsi`` and
            ``dt_insert_utc`` columns.
        target_date: the day of interest, e.g. ``"2022-04-01"``.

    Returns:
        ``100 * (distinct mmsi seen on target_date) / (distinct mmsi in df_all)``.
    """
    id_column = "mmsi"
    timestamp_column = "dt_insert_utc"

    # Denominator: every distinct ship of the data set
    fleet_size = df_all.select(id_column).distinct().count()

    # Numerator: distinct ships whose insert timestamp falls on the target day
    with_day = df_all.withColumn("date", F.to_date(timestamp_column))
    on_target_day = with_day.filter(F.col("date") == F.lit(target_date))
    ships_on_day = on_target_day.select(id_column).distinct().count()

    return (ships_on_day / fleet_size) * 100

In [None]:
def get_ship_nb_percentage_of_period(df_all, start_date, end_date):
    """Return the share (in percent) of distinct ships seen within a period.

    Args:
        df_all: Spark data frame, expected to be the result of ``af.get_ais()``
            with some filter condition; it is read through its ``mmsi`` and
            ``dt_insert_utc`` columns.
        start_date: first day of the period (inclusive), e.g. ``"2022-04-01"``.
        end_date: last day of the period (inclusive), e.g. ``"2022-04-08"``.

    Returns:
        ``100 * (distinct mmsi seen in the period) / (distinct mmsi in df_all)``.
    """
    ship_id_col = "mmsi"
    date_col = "dt_insert_utc"
    total_ship_number = df_all.select(ship_id_col).distinct().count()

    # Both bounds are inclusive (`>=` and `<=`).
    in_period = (F.col("date") >= F.lit(start_date)) & (F.col("date") <= F.lit(end_date))
    ship_number_of_period = (
        df_all.withColumn("date", F.to_date(date_col))
        .filter(in_period)
        .select(ship_id_col)
        .distinct()
        .count()
    )
    return (ship_number_of_period / total_ship_number) * 100

In [55]:
# Share (in percent) of the distinct ships of df_full_ais_2022 that were seen on target_date
target_date="2022-04-01"
res=get_ship_nb_percentage_of_day(df_full_ais_2022,target_date)
print(res)



71.56470949892407


                                                                                

In [None]:
# NOTE: start_date / end_date were datetime objects in the cell that loads
# df_full_ais_2022; they are rebound to ISO date strings here.
start_date="2022-04-01"
end_date="2022-04-08"
# Share (in percent) of the distinct ships seen between start_date and end_date
res=get_ship_nb_percentage_of_period(df_full_ais_2022,start_date,end_date)
print(res)

# scenario 2

We want to know what percentage of the total ship tonnage of the whole data set was seen on a given day.
- get the distinct ships of the whole data set, and sum their tonnage
- get the distinct ships of the given day, and sum their tonnage
- compute the percentage

In [56]:
def get_ship_tonage_percentage_of_day(df_all, df_ihs, target_date):
    """Return the share (in percent) of gross tonnage seen on one day.

    Args:
        df_all: Spark data frame of AIS messages, read through its ``mmsi`` and
            ``dt_insert_utc`` columns.
        df_ihs: Spark data frame of the ship register, read through its
            ``MaritimeMobileServiceIdentityMMSINumber`` and ``GrossTonnage``
            columns.
        target_date: the day of interest, e.g. ``"2022-04-01"``.

    Returns:
        ``100 * (tonnage of the ships seen on target_date) / (tonnage of all
        ships in df_all)``; ``0.0`` when no ship of that day is in the register.

    Raises:
        ValueError: if no ship of ``df_all`` has a tonnage in ``df_ihs``, so
            that the percentage is undefined.
    """
    ship_id_col = "mmsi"
    date_col = "dt_insert_utc"
    ihs_id_col = "MaritimeMobileServiceIdentityMMSINumber"
    tonnage_col = "GrossTonnage"

    def _sum_gross_tonnage(df_ships):
        # Sum the tonnage of the register rows whose MMSI appears in df_ships.
        # The left-semi join does the matching inside Spark, so the MMSI list is
        # never collected to the driver nor turned into a huge IN (...) list.
        ship_ids = df_ships.select(ship_id_col).distinct()
        matched = df_ihs.join(
            ship_ids,
            on=df_ihs[ihs_id_col] == ship_ids[ship_id_col],
            how="left_semi",
        )
        return matched.agg(F.sum(tonnage_col)).collect()[0][0]

    df_of_day = df_all.withColumn("date", F.to_date(date_col)).filter(
        F.col("date") == F.lit(target_date)
    )

    all_ship_tonage = _sum_gross_tonnage(df_all)
    if not all_ship_tonage:
        # F.sum yields None when nothing matched
        raise ValueError("no gross tonnage found for the ships of df_all")

    # None (no matching ship on that day) counts as zero tonnage
    ship_tonage_of_day = _sum_gross_tonnage(df_of_day) or 0
    return (ship_tonage_of_day / all_ship_tonage) * 100

In [None]:
def get_ship_tonage_percentage_of_period(df_all, df_ihs, start_date, end_date):
    """Return the share (in percent) of gross tonnage seen within a period.

    Args:
        df_all: Spark data frame of AIS messages, read through its ``mmsi`` and
            ``dt_insert_utc`` columns.
        df_ihs: Spark data frame of the ship register, read through its
            ``MaritimeMobileServiceIdentityMMSINumber`` and ``GrossTonnage``
            columns.
        start_date: first day of the period (inclusive), e.g. ``"2022-04-01"``.
        end_date: last day of the period (inclusive), e.g. ``"2022-04-08"``.

    Returns:
        ``100 * (tonnage of the ships seen in the period) / (tonnage of all
        ships in df_all)``; ``0.0`` when no ship of the period is in the
        register.

    Raises:
        ValueError: if no ship of ``df_all`` has a tonnage in ``df_ihs``, so
            that the percentage is undefined.
    """
    ship_id_col = "mmsi"
    date_col = "dt_insert_utc"
    ihs_id_col = "MaritimeMobileServiceIdentityMMSINumber"
    tonnage_col = "GrossTonnage"

    def _sum_gross_tonnage(df_ships):
        # Sum the tonnage of the register rows whose MMSI appears in df_ships.
        # The left-semi join does the matching inside Spark, so the MMSI list is
        # never collected to the driver nor turned into a huge IN (...) list.
        ship_ids = df_ships.select(ship_id_col).distinct()
        matched = df_ihs.join(
            ship_ids,
            on=df_ihs[ihs_id_col] == ship_ids[ship_id_col],
            how="left_semi",
        )
        return matched.agg(F.sum(tonnage_col)).collect()[0][0]

    # Both bounds are inclusive (`>=` and `<=`).
    in_period = (F.col("date") >= F.lit(start_date)) & (F.col("date") <= F.lit(end_date))
    df_of_period = df_all.withColumn("date", F.to_date(date_col)).filter(in_period)

    all_ship_tonage = _sum_gross_tonnage(df_all)
    if not all_ship_tonage:
        # F.sum yields None when nothing matched
        raise ValueError("no gross tonnage found for the ships of df_all")

    # None (no matching ship in the period) counts as zero tonnage
    ship_tonage_of_period = _sum_gross_tonnage(df_of_period) or 0
    return (ship_tonage_of_period / all_ship_tonage) * 100

In [58]:
# Tonnage share (in percent) for target_date; target_date is the value set in
# the ship-number cell of scenario 1.
tonage_percentate=get_ship_tonage_percentage_of_day(df_full_ais_2022,df_ihs,target_date)
print(f"the tonage percentage of day {target_date}: {tonage_percentate}")

                                                                                

71.83509193263018


                                                                                

In [None]:
# Tonnage share (in percent) for the period; start_date / end_date are the ISO
# date strings set in the period cell of scenario 1.
tonage_percentate_of_period=get_ship_tonage_percentage_of_period(df_full_ais_2022,df_ihs,start_date,end_date)
print(f"the tonage percentage from {start_date} to {end_date}: {tonage_percentate_of_period}")

# scenario 3

We want to know what percentage of all the ships in the data set were seen in a given area (a given polygon).
- get the number of distinct ships in the whole data set
- get the number of distinct ships in the given zone (we filter a zone by its polygon_name)
- compute the percentage

In [None]:
# Set up the general filter condition shared by the get_ais() calls below:
# a single day (start and end are both 2022-04-01) and only the mmsi column.
# start_date / end_date are rebound to datetime objects here.
start_date = datetime.fromisoformat("2022-04-01")
end_date = datetime.fromisoformat("2022-04-01")
columns = ["mmsi"]

## Calculate percentage for each choke point

We are interested in three choke points for the moment:
- the Black Sea and Sea of Azov area
- the Suez Canal
- the Strait of Gallipoli (Turkish)

### Stats of the black and azov sea area

We build the polygon by using this [site](https://boundingbox.klokantech.com/). The figure below gives you an idea of the shape of the polygon.

![black_azov_sea_polygon.PNG](../images/black_azov_sea_polygon.PNG)

#### Ship numbers in the black and azov sea

First, we are interested in the number of ships in the Black Sea and Sea of Azov area. Then we want its percentage compared to the whole world. We pick two distinct dates:
- 2019-04-03 
- 2022-04-03

In [None]:
def get_region_ship_number_percentage_of_the_world_trafic(df_all, df_region):
    """Return the ship-number share (in percent) of a region versus the world.

    Args:
        df_all: Spark data frame with the world-wide AIS data (``mmsi`` column).
        df_region: Spark data frame with the AIS data of the region (``mmsi`` column).

    Returns:
        ``100 * (distinct mmsi in df_region) / (distinct mmsi in df_all)``.
    """
    id_column = "mmsi"

    def _count_distinct_ships(df):
        # number of different ship identifiers present in the frame
        return df.select(id_column).distinct().count()

    world_ships = _count_distinct_ships(df_all)
    region_ships = _count_distinct_ships(df_region)
    return (region_ships / world_ships) * 100

In [None]:
def get_region_ship_tonage_percentage_of_the_world_trafic(df_all, df_region, df_ihs):
    """Return the tonnage share (in percent) of a region versus the world.

    Args:
        df_all: Spark data frame with the world-wide AIS data (``mmsi`` column).
        df_region: Spark data frame with the AIS data of the region (``mmsi`` column).
        df_ihs: Spark data frame of the ship register, read through its
            ``MaritimeMobileServiceIdentityMMSINumber`` and ``GrossTonnage``
            columns.

    Returns:
        ``100 * (tonnage of the ships in df_region) / (tonnage of the ships in
        df_all)``; ``0.0`` when no ship of the region is in the register.

    Raises:
        ValueError: if no ship of ``df_all`` has a tonnage in ``df_ihs``, so
            that the percentage is undefined.
    """
    ship_id_col = "mmsi"
    ihs_id_col = "MaritimeMobileServiceIdentityMMSINumber"
    tonnage_col = "GrossTonnage"

    def _sum_gross_tonnage(df_ships):
        # Sum the tonnage of the register rows whose MMSI appears in df_ships.
        # The left-semi join does the matching inside Spark, so the MMSI list is
        # never collected to the driver nor turned into a huge IN (...) list.
        ship_ids = df_ships.select(ship_id_col).distinct()
        matched = df_ihs.join(
            ship_ids,
            on=df_ihs[ihs_id_col] == ship_ids[ship_id_col],
            how="left_semi",
        )
        return matched.agg(F.sum(tonnage_col)).collect()[0][0]

    all_ship_tonage = _sum_gross_tonnage(df_all)
    if not all_ship_tonage:
        # F.sum yields None when nothing matched
        raise ValueError("no gross tonnage found for the ships of df_all")

    # None (no matching ship in the region) counts as zero tonnage
    ship_tonage_of_region = _sum_gross_tonnage(df_region) or 0
    return (ship_tonage_of_region / all_ship_tonage) * 100

In [None]:
# Polygon around the area; the ring is closed (last point repeats the first)
azov_black_geoj = [
    [43.3308500839, 39.9913666442],
    [26.1506878922, 41.33737686],
    [27.1872912828, 48.4341912681],
    [44.3674534746, 47.2431326615],
    [43.3308500839, 39.9913666442],
]

# GeoJSON-style geometry wrapping the ring
azov_black_polygon = {"type": "Polygon", "coordinates": [azov_black_geoj]}

# (name, geometry) pairs handed to the ais helper
azov_black_coordinate_df = af.polygon_to_hex_df(
    [("azov_black_polygon", azov_black_polygon)]
)

#### Stats of Black and Azov sea area of 2019-04-03

In [None]:
# Get the AIS data of the date below
# NOTE(review): target_date is an ISO date string here, while the other
# get_ais() calls in this notebook pass datetime objects
# (datetime.fromisoformat(...)) — confirm that get_ais() accepts strings.
target_date="2019-04-03"

# World-wide AIS data for that day (no polygon filter); rebinds df_all
df_all = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns
                           )

# AIS data of the Black Sea and Sea of Azov area for the same day
df_azov_black = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns,
                            polygon_hex_df = azov_black_coordinate_df
                           )



In [None]:
# Ship-number share (in percent) of the Black Sea / Sea of Azov area versus the world-wide traffic
azov_black_ship_number_percentage=get_region_ship_number_percentage_of_the_world_trafic(df_all,df_azov_black)
print(f"The azov black sea area ship number percentage of the world trafic: {azov_black_ship_number_percentage}")

In [None]:
# Tonnage share (in percent) of the Black Sea / Sea of Azov area versus the world-wide traffic
azov_black_ship_tonage_percetage=get_region_ship_tonage_percentage_of_the_world_trafic(df_all,df_azov_black,df_ihs)
print(f"The azov black sea area ship tonage percentage of the world trafic: {azov_black_ship_tonage_percetage}")

In [None]:
# Coordinates of the selected polygons, built with
# https://boundingbox.klokantech.com/

# Same ring as in the earlier Black Sea / Sea of Azov cell (assigned again here)
azov_black_geoj = [
    [43.3308500839, 39.9913666442],
    [26.1506878922, 41.33737686],
    [27.1872912828, 48.4341912681],
    [44.3674534746, 47.2431326615],
    [43.3308500839, 39.9913666442],
]

# Choke point polygon; the ring is closed (last point repeats the first)
turkish_geoj = [
    [27.5492385644, 40.137141233],
    [30.3080307823, 40.137141233],
    [30.3080307823, 41.8347426536],
    [27.5492385644, 41.8347426536],
    [27.5492385644, 40.137141233],
]

# GeoJSON-style geometry wrapping the ring
turkish_polygon = {"type": "Polygon", "coordinates": [turkish_geoj]}

# (name, geometry) pairs handed to the ais helper
turkish_coordinate_df = af.polygon_to_hex_df(
    [("turkish_polygon", turkish_polygon)]
)



In [None]:
# get the Turkish choke point percentage

In [None]:
ship_id_col="mmsi"

# World-wide AIS data for the start_date / end_date / columns defined in the
# scenario 3 setup cell (no polygon filter); rebinds df_all.
df_all = af.get_ais(spark,
                            start_date, 
                            end_date = end_date,
                            columns = columns
                           )
# Number of distinct ships world-wide: denominator of the percentage
total_ship_number=df_all.select(ship_id_col).distinct().count()

In [None]:
# AIS data restricted to the Turkish choke point polygon, for the same
# start_date / end_date / columns as the world-wide query.
df_turkish = af.get_ais(spark,
                            start_date, 
                            end_date = end_date,
                            columns = columns,
                            polygon_hex_df = turkish_coordinate_df
                           )

# Number of distinct ships inside the polygon: numerator of the percentage
turkish_ship_number=df_turkish.select(ship_id_col).distinct().count()

In [None]:
# Share (in percent) of the world-wide distinct ships that were seen inside
# the Turkish choke point polygon.
turkish_percentage=(turkish_ship_number/total_ship_number)*100
print(f"ship number percentage in turkish choke point: {turkish_percentage}")

# scenario 4

We want to know what percentage of the total ship tonnage of the whole data set was seen on a given day.
- get the distinct ships of the whole data set, and sum their tonnage
- get the distinct ships of the given day, and sum their tonnage
- compute the percentage

In [None]:

# NOTE(review): in the visible cells `all_ship_tonage` is only assigned as a
# local variable inside the tonnage functions — confirm where the
# notebook-level value is supposed to come from.
print(all_ship_tonage)

In [None]:

# NOTE(review): `ship_tonage_of_turkish` is not assigned in any visible cell —
# confirm which computation is supposed to define it.
print(ship_tonage_of_turkish)

# helper function get mmsi list by day

We want to get the list of distinct MMSIs of a data set, day by day.

In [68]:
def get_distinct_mmsi_by_day(df_all, date_col="dt_insert_utc"):
    """Return, for each day, the set of distinct MMSIs seen on that day.

    Args:
        df_all: Spark data frame with an ``mmsi`` column and the timestamp
            column named by ``date_col``.
        date_col: name of the timestamp column the day is derived from. It
            defaults to ``"dt_insert_utc"``, the column the other helpers of
            this notebook use, so the function no longer depends on a global
            ``date_col`` variable being defined.

    Returns:
        A Spark data frame with the columns ``date`` and
        ``distinct_mmsi_by_day``, ordered by ``date``.
    """
    return (
        df_all.withColumn("date", F.to_date(date_col))
        .groupBy("date")
        .agg(F.collect_set("mmsi").alias("distinct_mmsi_by_day"))
        .orderBy("date")
    )

In [69]:
# NOTE(review): `df_ais` is not assigned in any visible cell — confirm which
# data frame this cell is supposed to run on.
df_mmsi_by_day=get_distinct_mmsi_by_day(df_ais)
df_mmsi_by_day.show()



+----------+--------------------+
|      date|distinct_mmsi_by_day|
+----------+--------------------+
|2022-01-01|[264163452, 27333...|
|2022-01-02|[273332380, 26416...|
|2022-01-03|[264163452, 27333...|
|2022-01-04|[264163452, 27333...|
|2022-01-05|[271048605, 27333...|
|2022-01-06|[273332380, 21512...|
|2022-01-07|[273332380, 27104...|
+----------+--------------------+



                                                                                