In [7]:
# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
from datetime import datetime
# to apply aggregation functions on spark df
import pyspark.sql.functions as F
from pyarrow import fs
import pyarrow as pa
import pyarrow.parquet as pq

# Prepare env for reading AIS data

In [2]:
# This cell installs the `ais` helper package directly from its GitLab repository.
# The repository is private, so pip needs a (read-only) user name and access token.
import getpass
import os
import sys
import subprocess

GITLAB_USER = os.environ.get("GITLAB_USER", "read_aistt")  # read only access
# Credentials must not live in the notebook: take the token from the environment,
# and fall back to an interactive prompt when the variable is not set.
# NOTE(review): any token that was ever committed with this notebook should be rotated.
GITLAB_TOKEN = os.environ.get("GITLAB_TOKEN") or getpass.getpass("GitLab read-only token: ")

# pip URL of the repo that contains the ais package
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/trade-task-team-phase-1/ais.git"

pip_run = subprocess.run([sys.executable, "-m", "pip", "install", git_package], capture_output=True, text=True)
std_out = pip_run.stdout
print(std_out)
if pip_run.returncode != 0:
    # stdout alone hides why an install failed; show pip's error stream as well.
    # The cell does not raise, so a previously installed copy of `ais` can still be used.
    print(f"pip install exited with code {pip_run.returncode}", file=sys.stderr)
    print(pip_run.stderr, file=sys.stderr)

Collecting git+https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-3gax024q
Building wheels for collected packages: ais
  Building wheel for ais (setup.py): started
  Building wheel for ais (setup.py): finished with status 'done'
  Created wheel for ais: filename=ais-2.7.6-py3-none-any.whl size=9267 sha256=0bfed0f0e6a25e53c91033c7c535420ee4662cd8a1908b96b82c76da01cbc886
  Stored in directory: /tmp/pip-ephem-wheel-cache-dlzlx2sz/wheels/49/e0/a2/25d96a62cf626776ab2fd57fcbd822c2b8118049a84b16953d
Successfully built ais
Installing collected packages: ais
Successfully installed ais-2.7.6



In [3]:
# import get_ais() from ais package
from ais import functions as af

# Read IHS data

In [4]:
# S3 prefix that holds the ship register files
basepath = "s3a://ungp-ais-data-historical-backup/register/"

# Load the ship register CSV: comma separated, first line is the header,
# column types are inferred by Spark from the data.
df_ihs = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load(basepath + "ShipData.CSV")
)
# print a single row to check that the file was parsed
df_ihs.show(1)

+-----------+---------+-------------------------------+------------------+---------+--------------+---------------------------------------+-------------------+------------------------------------+--------------------------------+----------------------+--------------------------------+--------------------------------+-------------------------------+-----------------------------------------+-------------------------------------+-------------------+-----------------------------+-----------------------------+--------+------------------+-------------------------+------------+----------+----------------+------------------------------+----------------+-----------+----------------------+----------+-------------+-------------------------+-------------------------+---------------+-------------+-------------+-----------------+---+-------------------+-----------------+----------+---------------------+------------------+---------------+-----+--------+------------+--------------------+--------+-----

# scenario 1

We want to know what percentage of all the ships in the data set were observed on a given day (or period).
- count the distinct ships in the whole data set
- count the distinct ships observed on the given day
- divide the second count by the first to get the percentage

In [None]:
# get the AIS data set from 2022-01-01 to 2022-08-30 (note: not the full calendar year)
start_date = datetime.fromisoformat("2022-01-01")
end_date = datetime.fromisoformat("2022-08-30")
# only the ship identifier and the insertion timestamp are used by the statistics below
columns = ["mmsi","dt_insert_utc"]

# no polygon_hex_df is passed here, so no region is requested
# (presumably get_ais() then returns worldwide data - confirm in the ais package)
df_full_ais_2022 = af.get_ais(spark,
                            start_date, 
                            end_date = end_date,
                            columns = columns,
                           )

# number of rows in the selection; this runs the Spark query
df_full_ais_2022.count()

In [54]:
# df_all should be a data frame returned by af.get_ais() with certain filter conditions

def get_ship_nb_percentage_of_day(df_all,target_date):
    """Return the percentage of the ships in ``df_all`` that were seen on one day.

    Ships are identified by the ``mmsi`` column; a ship counts for the day when
    at least one of its rows has ``to_date(dt_insert_utc)`` equal to
    ``target_date``.

    Args:
        df_all: Spark DataFrame with ``mmsi`` and ``dt_insert_utc`` columns.
        target_date: the day to evaluate, e.g. ``"2022-04-01"``.

    Returns:
        ``distinct ships of the day / distinct ships of df_all * 100``.
    """
    id_column = "mmsi"
    timestamp_column = "dt_insert_utc"

    # denominator: every distinct ship of the data set
    n_ships_overall = df_all.select(id_column).distinct().count()

    # numerator: distinct ships with at least one row on the requested day
    with_day = df_all.withColumn("date", F.to_date(timestamp_column))
    rows_of_day = with_day.filter(F.col("date") == F.lit(target_date))
    n_ships_of_day = rows_of_day.select(id_column).distinct().count()

    return (n_ships_of_day / n_ships_overall) * 100

In [None]:
# df_all should be a data frame returned by af.get_ais() with certain filter conditions

def get_ship_nb_percentage_of_period(df_all,start_date,end_date):
    """Return the percentage of the ships in ``df_all`` that were seen during a period.

    Ships are identified by the ``mmsi`` column; a ship counts for the period when
    at least one of its rows has ``to_date(dt_insert_utc)`` between ``start_date``
    and ``end_date``, both bounds included.

    Args:
        df_all: Spark DataFrame with ``mmsi`` and ``dt_insert_utc`` columns.
        start_date: first day of the period, e.g. ``"2022-04-01"``.
        end_date: last day of the period (inclusive), e.g. ``"2022-04-08"``.

    Returns:
        ``distinct ships of the period / distinct ships of df_all * 100``.
    """
    ship_id_col="mmsi"
    date_col="dt_insert_utc"
    total_ship_number=df_all.select(ship_id_col).distinct().count()
    # `<=` is the valid comparison operator (`=<` does not parse in Python)
    in_period=(F.col("date") >= F.lit(start_date)) & (F.col("date") <= F.lit(end_date))
    ship_number_of_period=df_all.withColumn("date",F.to_date(date_col)).filter(in_period).select(ship_id_col).distinct().count()
    return (ship_number_of_period/total_ship_number)*100

In [55]:
# share of the ships of the 2022 data set that were seen on this single day
target_date="2022-04-01"
res=get_ship_nb_percentage_of_day(df_full_ais_2022,target_date)
print(res)



71.56470949892407


                                                                                

In [None]:
# share of the ships of the 2022 data set that were seen during this period
# note: start_date / end_date are rebound here to plain strings; the cell that
# loads df_full_ais_2022 assigned datetime objects to the same names
start_date="2022-04-01"
end_date="2022-04-08"
res=get_ship_nb_percentage_of_period(df_full_ais_2022,start_date,end_date)
print(res)

# scenario 2

We want to know what percentage of the total ship tonnage of the data set was observed on a given day.
- get the distinct ships of the whole data set, and sum their tonnage
- get the distinct ships of the given day, and sum their tonnage
- divide the second sum by the first to get the percentage

In [56]:
def get_ship_tonage_percentage_of_day(df_all,df_ihs,target_date):
    """Return the percentage of gross tonnage carried by the ships seen on one day.

    The tonnage of a set of ships is the sum of ``GrossTonnage`` over the
    ``df_ihs`` rows whose ``MaritimeMobileServiceIdentityMMSINumber`` equals one
    of the distinct ``mmsi`` values of that set.

    Args:
        df_all: Spark DataFrame with ``mmsi`` and ``dt_insert_utc`` columns.
        df_ihs: Spark DataFrame of the ship register.
        target_date: the day to evaluate, compared with ``to_date(dt_insert_utc)``.

    Returns:
        ``tonnage of the ships of the day / tonnage of all ships * 100``.

    Raises:
        ValueError: if no tonnage at all can be matched for ``df_all``, in which
            case a percentage is undefined.
    """
    ship_id_col="mmsi"
    date_col="dt_insert_utc"
    ihs_mmsi_col="MaritimeMobileServiceIdentityMMSINumber"

    def _gross_tonnage(ship_ids):
        # A left-semi join keeps the register rows whose MMSI occurs in `ship_ids`
        # and stays inside Spark: the ids are no longer collected to the driver
        # and no huge literal list is embedded in the query.
        matched=df_ihs.join(ship_ids, df_ihs[ihs_mmsi_col] == ship_ids[ship_id_col], "left_semi")
        # sum() over zero rows yields NULL (None); count that as zero tonnage
        return matched.agg(F.sum("GrossTonnage")).collect()[0][0] or 0

    all_ship_tonage=_gross_tonnage(df_all.select(ship_id_col).distinct())
    if not all_ship_tonage:
        raise ValueError("no gross tonnage found in df_ihs for the ships of df_all")

    ships_of_day=df_all.withColumn("date",F.to_date(date_col)).filter(F.col("date") == F.lit(target_date)).select(ship_id_col).distinct()
    ship_tonage_of_day=_gross_tonnage(ships_of_day)
    return (ship_tonage_of_day/all_ship_tonage)*100

In [None]:
def get_ship_tonage_percentage_of_period(df_all,df_ihs,start_date,end_date):
    """Return the percentage of gross tonnage carried by the ships seen during a period.

    The tonnage of a set of ships is the sum of ``GrossTonnage`` over the
    ``df_ihs`` rows whose ``MaritimeMobileServiceIdentityMMSINumber`` equals one
    of the distinct ``mmsi`` values of that set.

    Args:
        df_all: Spark DataFrame with ``mmsi`` and ``dt_insert_utc`` columns.
        df_ihs: Spark DataFrame of the ship register.
        start_date: first day of the period.
        end_date: last day of the period (inclusive).

    Returns:
        ``tonnage of the ships of the period / tonnage of all ships * 100``.

    Raises:
        ValueError: if no tonnage at all can be matched for ``df_all``, in which
            case a percentage is undefined.
    """
    ship_id_col="mmsi"
    date_col="dt_insert_utc"
    ihs_mmsi_col="MaritimeMobileServiceIdentityMMSINumber"

    def _gross_tonnage(ship_ids):
        # A left-semi join keeps the register rows whose MMSI occurs in `ship_ids`
        # and stays inside Spark: the ids are no longer collected to the driver
        # and no huge literal list is embedded in the query.
        matched=df_ihs.join(ship_ids, df_ihs[ihs_mmsi_col] == ship_ids[ship_id_col], "left_semi")
        # sum() over zero rows yields NULL (None); count that as zero tonnage
        return matched.agg(F.sum("GrossTonnage")).collect()[0][0] or 0

    all_ship_tonage=_gross_tonnage(df_all.select(ship_id_col).distinct())
    if not all_ship_tonage:
        raise ValueError("no gross tonnage found in df_ihs for the ships of df_all")

    # `<=` is the valid comparison operator (`=<` does not parse in Python)
    in_period=(F.col("date") >= F.lit(start_date)) & (F.col("date") <= F.lit(end_date))
    ships_of_period=df_all.withColumn("date",F.to_date(date_col)).filter(in_period).select(ship_id_col).distinct()
    ship_tonage_of_period=_gross_tonnage(ships_of_period)
    return (ship_tonage_of_period/all_ship_tonage)*100

In [58]:
# tonnage share of the ships seen on target_date (the value assigned in an earlier cell)
tonage_percentate=get_ship_tonage_percentage_of_day(df_full_ais_2022,df_ihs,target_date)
print(f"the tonage percentage of day {target_date}: {tonage_percentate}")

                                                                                

71.83509193263018


                                                                                

In [None]:
# tonnage share of the ships seen between start_date and end_date (assigned in an earlier cell)
tonage_percentate_of_period=get_ship_tonage_percentage_of_period(df_full_ais_2022,df_ihs,start_date,end_date)
print(f"the tonage percentage from {start_date} to {end_date}: {tonage_percentate_of_period}")

## Calculate percentage for each choke point

We are interested in three choke points for the moment:
- Black Sea and Sea of Azov area
- Suez Canal
- Strait of Gallipoli (Turkish)

### Stats of the black and azov sea area

We build the polygon by using this [site](https://boundingbox.klokantech.com/). The figure below gives an idea of the shape of the polygon.

![black_azov_sea_polygon.PNG](../images/black_azov_sea_polygon.PNG)

#### Ship numbers in the black and azov sea

First, we are interested in the number of ships in the **black and azov sea** area. Then we want to know its percentage compared to the world. We pick two distinct dates:
- 2019-04-03 
- 2022-04-03

In [8]:
# main function to calculate the ship number percentage of a region compared to the world
def get_region_ship_number_percentage_of_the_world_trafic(df_all,df_region):
    """Return the region's share (in percent) of the distinct ships of ``df_all``.

    Both counts are printed before the percentage is returned.

    Args:
        df_all: data frame with an ``mmsi`` column covering the reference traffic.
        df_region: data frame with an ``mmsi`` column restricted to the region.

    Returns:
        ``distinct mmsi of df_region / distinct mmsi of df_all * 100``.
    """
    id_column = "mmsi"

    def _count_ships(frame):
        # number of different ship identifiers present in the frame
        return frame.select(id_column).distinct().count()

    world_count = _count_ships(df_all)
    print(f"total ship number: {world_count}")

    region_count = _count_ships(df_region)
    print(f"region ship number:  {region_count}")

    return (region_count / world_count) * 100

In [9]:
# main function to calculate the ship tonnage percentage of a region compared to the world
def get_region_ship_tonage_percentage_of_the_world_trafic(df_all,df_region,df_ihs):
    """Return the region's share (in percent) of the gross tonnage of ``df_all``.

    The tonnage of a set of ships is the sum of ``GrossTonnage`` over the
    ``df_ihs`` rows whose ``MaritimeMobileServiceIdentityMMSINumber`` equals one
    of the distinct ``mmsi`` values of that set. Both sums are printed.

    Args:
        df_all: Spark DataFrame with an ``mmsi`` column covering the reference traffic.
        df_region: Spark DataFrame with an ``mmsi`` column restricted to the region.
        df_ihs: Spark DataFrame of the ship register.

    Returns:
        ``tonnage of the region's ships / tonnage of all ships * 100``.

    Raises:
        ValueError: if no tonnage at all can be matched for ``df_all``, in which
            case a percentage is undefined.
    """
    ship_id_col="mmsi"
    ihs_mmsi_col="MaritimeMobileServiceIdentityMMSINumber"

    def _gross_tonnage(ship_ids):
        # A left-semi join keeps the register rows whose MMSI occurs in `ship_ids`
        # and stays inside Spark: the ids are no longer collected to the driver
        # and no huge literal list is embedded in the query.
        matched=df_ihs.join(ship_ids, df_ihs[ihs_mmsi_col] == ship_ids[ship_id_col], "left_semi")
        # sum() over zero rows yields NULL (None); count that as zero tonnage
        return matched.agg(F.sum("GrossTonnage")).collect()[0][0] or 0

    total_ship_tonage=_gross_tonnage(df_all.select(ship_id_col).distinct())
    print(f"total ship tonage: {total_ship_tonage}")
    if not total_ship_tonage:
        raise ValueError("no gross tonnage found in df_ihs for the ships of df_all")

    region_ship_tonage=_gross_tonnage(df_region.select(ship_id_col).distinct())
    print(f"region ship tonage:  {region_ship_tonage}")
    return (region_ship_tonage/total_ship_tonage)*100

In [7]:
# Polygon around the Black Sea and the Sea of Azov.
# The ring is closed: the last point repeats the first one.
# NOTE(review): pairs look like [longitude, latitude] (GeoJSON order) - confirm
# against what af.polygon_to_hex_df expects.
azov_black_geoj = [
    [43.3308500839, 39.9913666442],
    [26.1506878922, 41.33737686],
    [27.1872912828, 48.4341912681],
    [44.3674534746, 47.2431326615],
    [43.3308500839, 39.9913666442],
]

# GeoJSON-style geometry with a single ring
azov_black_polygon = {"type": "Polygon", "coordinates": [azov_black_geoj]}

# hex data frame of the named polygon, passed to get_ais() as polygon_hex_df
azov_black_coordinate_df = af.polygon_to_hex_df([("azov_black_polygon", azov_black_polygon)])

#### Stats of Black and Azov sea area of 2019-04-03

In [9]:
# Get the AIS data of the date below (start and end are the same: a single day)
target_date="2019-04-03"

# only the ship identifier is needed to count ships and to look up tonnage
columns = ["mmsi"]

# get world wide AIS (no polygon passed)
df_all = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns
                           )

# get the AIS data of the black sea and azov sea area
df_azov_black = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns,
                            polygon_hex_df = azov_black_coordinate_df
                           )



In [13]:
# calculate the ship number percentage of the black and azov sea area (2019-04-03 data)

azov_black_ship_number_percentage=get_region_ship_number_percentage_of_the_world_trafic(df_all,df_azov_black)
print(f"The azov black sea area ship number percentage of the world trafic: {azov_black_ship_number_percentage} %")

173375
2306
The azov black sea area ship number percentage of the world trafic: 1.3300648882480173


In [17]:
# calculate the ship tonnage percentage of the black and azov sea area (2019-04-03 data)
azov_black_ship_tonage_percetage=get_region_ship_tonage_percentage_of_the_world_trafic(df_all,df_azov_black,df_ihs)
print(f"The azov black sea area ship tonage percentage of the world trafic: {azov_black_ship_tonage_percetage} %")

total ship tonage: 971864623
region ship number:  8636389
The azov black sea area ship tonage percentage of the world trafic: 0.8886411538821904


#### Stats of Black and Azov sea area of 2022-04-03

In [18]:
# Get the AIS data of the date below (start and end are the same: a single day)
target_date="2022-04-03"

# only the ship identifier is needed to count ships and to look up tonnage
columns = ["mmsi"]

# get world wide AIS (no polygon passed)
df_all_2022 = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns
                           )

# get the AIS data of the black sea and azov sea area
df_azov_black_2022 = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns,
                            polygon_hex_df = azov_black_coordinate_df
                           )


In [19]:
# calculate the ship number percentage of the black and azov sea area (2022-04-03 data)

azov_black_ship_number_percentage_2022=get_region_ship_number_percentage_of_the_world_trafic(df_all_2022,df_azov_black_2022)
print(f"The azov black sea area ship number percentage of the world trafic: {azov_black_ship_number_percentage_2022}%")

total ship number: 192344
region ship number:  1793
The azov black sea area ship number percentage of the world trafic: 0.932184003660109 %


In [20]:
# calculate the ship tonnage percentage of the black and azov sea area (2022-04-03 data)
azov_black_ship_tonage_percetage_2022=get_region_ship_tonage_percentage_of_the_world_trafic(df_all_2022,df_azov_black_2022,df_ihs)
print(f"The azov black sea area ship tonage percentage of the world trafic: {azov_black_ship_tonage_percetage_2022}%")

total ship tonage: 1330424475
region ship number:  11572739
The azov black sea area ship tonage percentage of the world trafic: 0.8698531346546372%


### Stats of the suez canal area

We build the polygon by using this [site](https://boundingbox.klokantech.com/). The figure below gives an idea of the shape of the polygon.

![suez_channel_polygon.PNG](../images/suez_channel_polygon.PNG)

#### Ship numbers in suez canal area

First, we are interested in the number of ships in the **suez canal** area. Then we want to know its percentage compared to the world. We pick two distinct dates:
- 2019-04-03
- 2021-03-26 (incident date)

In [10]:
# Polygon around the Suez canal area.
# The ring is closed: the last point repeats the first one.
# NOTE(review): pairs look like [longitude, latitude] (GeoJSON order) - confirm
# against what af.polygon_to_hex_df expects.
suez_geoj = [
    [33.979966859, 27.3216575046],
    [31.4640977184, 27.3314178101],
    [31.4860703747, 30.787762868],
    [33.9689805309, 30.7972004213],
    [33.979966859, 27.3216575046],
]

# GeoJSON-style geometry with a single ring
suez_polygon = {"type": "Polygon", "coordinates": [suez_geoj]}

# hex data frame of the named polygon, passed to get_ais() as polygon_hex_df
suez_coordinate_df = af.polygon_to_hex_df([("suez_polygon", suez_polygon)])


#### Stats of the suez canal area of 2019-04-03

In [9]:
# Get the AIS data of the date below (start and end are the same: a single day)
target_date="2019-04-03"

# only the ship identifier is needed to count ships and to look up tonnage
columns = ["mmsi"]

# get world wide AIS (no polygon passed)
df_all_2019 = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns
                           )


# get the AIS data of the suez canal area
df_suez_2019 = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns,
                            polygon_hex_df = suez_coordinate_df
                           )

In [10]:
# calculate the ship number percentage of the suez canal area (2019-04-03 data)

suez_ship_number_percentage_2019=get_region_ship_number_percentage_of_the_world_trafic(df_all_2019,df_suez_2019)
print(f"The suez area ship number percentage of the world trafic: {suez_ship_number_percentage_2019}%")

total ship number: 173375
region ship number:  257
The azov black sea area ship number percentage of the world trafic: 0.1482335976928623%


In [11]:
# calculate the ship tonnage percentage of the suez canal area (2019-04-03 data)
suez_ship_tonage_percetage_2019=get_region_ship_tonage_percentage_of_the_world_trafic(df_all_2019,df_suez_2019,df_ihs)
print(f"The suez area ship tonage percentage of the world trafic: {suez_ship_tonage_percetage_2019}%")

total ship tonage: 971864623
region ship tonage:  6784417
The azov black sea area ship tonage percentage of the world trafic: 0.6980825147290087%


#### Stats of Suez area of 2021-03-26

In [11]:
# Get the AIS data of the date below (start and end are the same: a single day)
target_date="2021-03-26"

# only the ship identifier is needed to count ships and to look up tonnage
columns = ["mmsi"]

# get world wide AIS (no polygon passed)
df_all_2021 = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns
                           )

# get the AIS data of the suez canal area (the polygon passed is suez_coordinate_df)
df_suez_2021 = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns,
                            polygon_hex_df = suez_coordinate_df
                           )

In [13]:
# calculate the ship number percentage of the suez canal area (2021-03-26 data)

suez_ship_number_percentage_2021=get_region_ship_number_percentage_of_the_world_trafic(df_all_2021,df_suez_2021)
print(f"The suez area ship number percentage of the world trafic: {suez_ship_number_percentage_2021}%")

total ship number: 190735
region ship number:  303
The azov black sea area ship number percentage of the world trafic: 0.15885915012976118%


In [None]:
# calculate the ship tonnage percentage of the suez canal area (2021-03-26 data)
suez_ship_tonage_percetage_2021=get_region_ship_tonage_percentage_of_the_world_trafic(df_all_2021,df_suez_2021,df_ihs)
# the value printed is the tonnage share, so the message says "tonage" (not "number"),
# matching the wording of the 2019 Suez cell
print(f"The suez area ship tonage percentage of the world trafic: {suez_ship_tonage_percetage_2021}%")

total ship tonage: 1169911474
region ship tonage:  10278325


NameError: name 'suez_ship_tonage_percetage_2019' is not defined

### Stats of the Gallipoli (Turkish) area

We build the polygon by using this [site](https://boundingbox.klokantech.com/). The figure below gives an idea of the shape of the polygon.

![turkish.PNG](../images/turkish.PNG)

#### Ship numbers in the Gallipoli area

First, we are interested in the number of ships in the **Gallipoli** area. Then we want to know its percentage compared to the world. We pick two distinct dates:
- 2019-04-03
- 2022-04-03

In [None]:

# Choke point polygon of the Gallipoli (Turkish) area: an axis-aligned box.
# The ring is closed: the last point repeats the first one.
# NOTE(review): pairs look like [longitude, latitude] (GeoJSON order) - confirm
# against what af.polygon_to_hex_df expects.
turkish_geoj = [
    [27.5492385644, 40.137141233],
    [30.3080307823, 40.137141233],
    [30.3080307823, 41.8347426536],
    [27.5492385644, 41.8347426536],
    [27.5492385644, 40.137141233],
]

# GeoJSON-style geometry with a single ring
turkish_polygon = {"type": "Polygon", "coordinates": [turkish_geoj]}

# hex data frame of the named polygon, passed to get_ais() as polygon_hex_df
turkish_coordinate_df = af.polygon_to_hex_df([("turkish_polygon", turkish_polygon)])



#### Stats of the Gallipoli area of 2019-04-03

In [None]:
# Get the AIS data of the date below (start and end are the same: a single day)
target_date="2019-04-03"

# only the ship identifier is needed to count ships
columns = ["mmsi"]

# get world wide AIS (no polygon passed)
# note: rebinds df_all_2019, which the Suez section assigned with the same date and columns
df_all_2019 = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns
                           )


# get the AIS data of the Gallipoli (Turkish) area (the polygon passed is turkish_coordinate_df)
df_turk_2019 = af.get_ais(spark,
                            start_date = target_date, 
                            end_date = target_date,
                            columns = columns,
                            polygon_hex_df = turkish_coordinate_df
                           )

In [None]:
# calculate the ship number percentage of the Gallipoli (Turkish) area (2019-04-03 data)

turk_ship_number_percentage_2019=get_region_ship_number_percentage_of_the_world_trafic(df_all_2019,df_turk_2019)
# the region queried is the Gallipoli polygon, so the message names that area
# (not the azov black sea area)
print(f"The Gallipoli area ship number percentage of the world trafic: {turk_ship_number_percentage_2019}%")

In [None]:
# stop the Spark session at the end of the notebook; cells above cannot be re-run afterwards
spark.stop()