In [1]:
import sys
import subprocess

GITLAB_USER = "read_aistt"  #For use of members of AIS Task Team, read only access
GITLAB_TOKEN = "J1Kk8tArfyXB6dZvFcWW"
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/trade-task-team-phase-1/ais.git"

std_out = subprocess.run([sys.executable, "-m", "pip", "install",git_package], capture_output=True, text=True).stdout
print(std_out)

Collecting git+https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://read_aistt:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-dpz3y1ju
Building wheels for collected packages: ais
  Building wheel for ais (setup.py): started
  Building wheel for ais (setup.py): finished with status 'done'
  Created wheel for ais: filename=ais-2.7.5-py3-none-any.whl size=9195 sha256=5a36072de7569266ce9f2ce570995af1aeabbd7ac995347d8f385064cb559aef
  Stored in directory: /tmp/pip-ephem-wheel-cache-gpqzdmk6/wheels/6d/8c/5e/19898a2b930f8efa2ef2e6ecc8ef48797422e3ec7e0114b312
Successfully built ais
Installing collected packages: ais
Successfully installed ais-2.7.5



In [2]:
from ais import functions as af

#still need to register Sedona even with template configuration. need to check why
from sedona.register import SedonaRegistrator
SedonaRegistrator.registerAll(spark)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from IPython.display import display
pd.set_option('display.max_columns', None)

import h3
import h3.api.numpy_int as h3int
import pyspark.sql.functions as F

import os
import getpass

In [3]:
# Get clone the repo
try:
    std_out = subprocess.run(["git", "clone", "https://github.com/CSBP-CPSE/ais-data.git"],
                             capture_output=True,
                             text=True)
    print(std_out)
    
    # Create Data folder if it doesn't exist
    if os.path.exists("./ais-data/Data/") is False:
        os.mkdir("./ais-data/Data/")
    
    print(os.listdir("./ais-data/"))
except:
    print("Cloning repo failed")

CompletedProcess(args=['git', 'clone', 'https://github.com/CSBP-CPSE/ais-data.git'], returncode=0, stdout='', stderr="Cloning into 'ais-data'...\n")
['.git', '.ipynb_checkpoints', 'ODI_Marine_Ports_v0.1.csv', 'README.md', 'ais_analysis.ipynb', 'export_data.ipynb', 'move_data.ipynb', 'vancouver_port.ipynb', 'Data']


In [4]:
# Load port coordinates
ports = pd.read_csv("./ais-data/ODI_Marine_Ports_v0.1.csv",
                    usecols=['MunicipalityName', 'ERNAME', 'Latitude', 'Longitude'])

# Filter for top 20 ports according to https://www144.statcan.gc.ca/nats-stna/tables-tableaux/tbl11-4a/tbl11-4a-CAN-eng.htm
# St Romuald is the only one in Lévis
top20 = ['Saint John', 'Montréal', 'Hamilton', 'Halifax', 'Windsor']
ports = ports[(ports['ERNAME'].str.contains('|'.join(top20), case=False)) | 
              (ports['MunicipalityName'].isin(['Come-by-Chance', 'St Romuald', 'Victoria'
                                               'Strait of Canso Port(Formerly Port Hawkesbury)',
                                               'Port of Sorel', 'Nanticoke', 'Baie-Comeau',
                                               'Sault-Ste-Marie', 'Port Alfred']))]

# Reset index
ports.reset_index(drop=True, inplace=True)

display(ports)

# Recommended to try resolution sizes of 6-9 from Port Indicators Demo
# Use resolution 8 with radius of 2
ports['H3_int_index_8'] = ports[['Latitude','Longitude']].apply(lambda x: h3.geo_to_h3(x[0],x[1], 8), axis=1)
ports['h8_rings'] = ports['H3_int_index_8'].apply(lambda x: list(h3.k_ring(x, 2)))
ports = ports.explode('h8_rings', ignore_index=True)
ports['h8_rings_decimal'] = ports['h8_rings'].apply(lambda x: int(x, 16))
ports = ports['h8_rings_decimal'].unique().tolist()

print(len(ports))

Unnamed: 0,MunicipalityName,ERNAME,Longitude,Latitude
115,Eastern Passage,Halifax,-63.052389,44.632972
122,Ingram River,Halifax,-63.966704,44.67266
161,Sambro,Halifax,-63.59908,44.477386
162,Shad Bay,Halifax,-63.79043,44.5233
165,Ship Habour,Halifax,-62.865669,44.804014
169,St. Margarets Bay,Halifax,-64.057864,44.637992
207,Bayside,Saint John--St. Stephen,-67.139197,45.158732
208,Beaver Harbour,Saint John--St. Stephen,-66.741763,45.068701
210,Blacks Harbour,Saint John--St. Stephen,-66.807212,45.046521
215,Campobello,Saint John--St. Stephen,-66.958063,44.893597


884


In [5]:
%%time
# Get data from earliest date to 2022-08-01
start_date = datetime.fromisoformat("2020-01-01 00:00:00") # Earliest date (2018-12-01)
end_date = start_date + pd.DateOffset(months=1, seconds=-1) # Last day of month at 23:59:59

df = af.get_ais(spark,
                start_date=start_date,
                end_date=end_date,
                h3_list=ports)

df.show()

+------------------+------------+---------+-------------------+------------------+------------------+-------+------------+--------+-----------+----------------+--------------------+------------+------+-----+------------+---------+-----------------+--------+-------+---+-----+------------------+-------+--------------------+---------------+------+-------------------+-------------------+----------------+--------------------+----+--------------------+---------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|    H3_int_index_8|message_type|     mmsi|      dt_insert_utc|         longitude|          latitude|    imo| vessel_name|callsign|vessel_type|vessel_type_code|   vessel_type_cargo|vessel_class|length|width|flag_country|flag_code|      destination|

In [6]:
# Get unique values in vessel_type_cargo and vessel_class
# df.select('vessel_type').distinct().show(n=50, truncate=False)

# Output:
# +------------------------------------+
# |vessel_type                         |
# +------------------------------------+
# |Sailing                             |
# |Tanker                              |
# |Military                            |
# |Towing                              |
# |Reserved                            |
# |SAR                                 |
# |Unknown                             |
# |UNAVAILABLE                         |
# |Other                               |
# |Tug                                 |
# |Law Enforcement                     |
# |Pleasure Craft                      |
# |Passenger                           |
# |Diving                              |
# |Fishing                             |
# |Port Tender                         |
# |Spare                               |
# |Pilot                               |
# |WIG                                 |
# |Dredging                            |
# |Not Available                       |
# |Cargo                               |
# |Vessel With Anti-Pollution Equipment|
# |HSC                                 |
# +------------------------------------+

In [7]:
# Filter by vessel type
# vessel_list = ['Cargo', 'Tanker']

In [8]:
%%time

# Try to convert it to pandas df
try:
    df = df.toPandas()
    display(df)
    print(df['dt_pos_utc'].min())
    print(df['dt_pos_utc'].max())
except:
    print("The dataframe was too large to convert to Pandas")

Unnamed: 0,H3_int_index_8,message_type,mmsi,dt_insert_utc,longitude,latitude,imo,vessel_name,callsign,vessel_type,vessel_type_code,vessel_type_cargo,vessel_class,length,width,flag_country,flag_code,destination,eta,draught,sog,cog,rot,heading,nav_status,nav_status_code,source,dt_pos_utc,dt_static_utc,vessel_type_main,vessel_type_sub,eeid,source_filename,H3index_0,H3_int_index_0,H3_int_index_1,H3_int_index_2,H3_int_index_3,H3_int_index_4,H3_int_index_5,H3_int_index_6,H3_int_index_7,H3_int_index_9,H3_int_index_10,H3_int_index_11,H3_int_index_12,H3_int_index_13,H3_int_index_14,H3_int_index_15
0,613182494623137791,1,316001696,2020-01-01 13:35:39,-84.360567,46.502700,6800919.0,TIM S DOOL,VGPY,Cargo,,No Additional Information,A,223.0,23.0,Canada,,HAMILTON,1031000,8.1,0.0,308.0,0.000000,224.0,Under Way Using Engine,0,S-AIS,2020-01-01 13:35:39,2020-01-01 13:35:39,Bulk Carrier,,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295393562623,608678895004155903,617686094248935423,622189693876174847,626693293503516671,631196893130885119,635700492758255551,640204092385626031,644707692012996526
1,613182494618943487,27,316002238,2020-01-01 00:29:38,-84.350000,46.503333,5141483.0,ANGLIAN LADY,VOLP,Other,,No Additional Information,A,80.0,20.0,Canada,,SAULT SAINT MARIE,12260700,4.5,1.0,86.0,0.000000,0.0,Under Way Using Engine,0,S-AIS,2020-01-01 00:29:38,2020-01-01 00:29:38,Tug,,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295393562623,608678895004155903,617686094244478975,622189693872537599,626693293499895807,631196893127265791,635700492754636159,640204092382006615,644707692009377105
2,613182494618943487,1,316004370,2020-01-01 03:56:59,-84.345167,46.503333,9613941.0,G3 MARQUIS,XJBO,Cargo,,No Additional Information,A,226.0,24.0,Canada,,WINDSOR,12301700,5.7,1.6,266.0,0.000000,266.0,Under Way Using Engine,0,S-AIS,2020-01-01 03:56:59,2020-01-01 03:56:59,Bulk Carrier,,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295393562623,608678895004155903,617686094244741119,622189693872013311,626693293499355135,631196893126724607,635700492754094911,640204092381465399,644707692008835888
3,613182494870601727,3,316004370,2020-01-01 04:50:15,-84.376500,46.501667,9613941.0,G3 MARQUIS,XJBO,Cargo,,No Additional Information,A,226.0,24.0,Canada,,WINDSOR,12301700,5.7,7.8,261.0,-7.544209,257.0,Under Way Using Engine,0,S-AIS,2020-01-01 04:50:15,2020-01-01 04:50:15,Bulk Carrier,,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295661998079,608678895255814143,617686094497447935,622189694124752895,626693293752111103,631196893379488767,635700493006859135,640204092634229615,644707692261600104
4,613182494623137791,1,316004910,2020-01-01 09:46:51,-84.359923,46.502780,5166392.0,CUYAHOGA,CFG6460,Cargo,,,A,189.0,18.0,Canada,,THUNDER BAY,12300800,4.8,0.0,195.2,0.000000,87.0,Under Way Using Engine,0,S-AIS,2020-01-01 09:46:51,2020-01-01 09:46:51,Bulk Carrier,Self Discharging Bulk Carrier,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295393562623,608678895004155903,617686094248935423,622189693876174847,626693293503537151,631196893130905599,635700492758276031,640204092385646471,644707692013016966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301934,613182494618943487,1,366999402,2020-01-14 15:25:32,-84.348133,46.503767,,BILLMAIER,AEHT,Tug,,,A,0.0,0.0,USA,,SOO LOCK,2460,0.0,1.3,69.7,0.000000,0.0,Under Way Using Engine,0,S-AIS,2020-01-14 15:25:32,2020-01-14 15:25:32,,,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295393562623,608678895004155903,617686094244478975,622189693871652863,626693293499015167,631196893126384639,635700492753754943,640204092381125551,644707692008495898
301935,613182494618943487,1,366999409,2020-01-14 17:55:16,-84.347333,46.503833,,OWEN M FREDRICK,AEKJ,Other,,,A,21.0,5.0,USA,,,2460,0.0,0.0,235.7,0.000000,245.0,Under Way Using Engine,0,S-AIS,2020-01-14 17:55:16,2020-01-14 17:55:16,,,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295393562623,608678895004155903,617686094244741119,622189693872078847,626693293499437055,631196893126814719,635700492754185087,640204092381555559,644707692008926050
301936,613182494618943487,1,366999409,2020-01-14 21:04:35,-84.355167,46.502833,,OWEN M FREDRICK,AEKJ,Other,,,A,21.0,5.0,USA,,,2460,0.0,0.2,118.9,0.000000,269.0,Under Way Using Engine,0,S-AIS,2020-01-14 21:04:35,2020-01-14 21:04:35,,,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295393562623,608678895004155903,617686094245003263,622189693872242687,626693293499588607,631196893126956031,635700492754326463,640204092381696927,644707692009067419
301937,613182494618943487,1,366999409,2020-01-14 17:27:35,-84.347333,46.504000,,OWEN M FREDRICK,AEKJ,Other,,,A,21.0,5.0,USA,,,2460,0.0,0.0,235.7,0.000000,245.0,Under Way Using Engine,0,S-AIS,2020-01-14 17:27:35,2020-01-14 17:27:35,,,,s3a://ungp-ais-data-historical-backup/exact-ea...,8027fffffffffff,577164439745200127,581659243279548415,586161193639477247,590664518388940799,595168100836442111,599671696168845311,604175295393562623,608678895004155903,617686094244741119,622189693872078847,626693293499437055,631196893126804991,635700492754175423,640204092381545895,644707692008916387


2020-01-01 00:00:28
2020-01-31 23:59:59
CPU times: user 36.6 s, sys: 601 ms, total: 37.2 s
Wall time: 9min 23s


In [None]:
%%time
# Split DF into smaller files and save them to csv
# Each file will have 100000 rows
list_df = np.array_split(df, (len(df.index) // 100000) + 1)

for i in range(len(list_df)):
    list_df[i].to_csv("./ais-data/Data/top20_ports_{}_to_{}_chunk_{}.csv".format(datetime.date(start_date),
                                                                                 datetime.date(end_date),
                                                                                 i), index=False)

In [None]:
# Change directory
os.chdir("./ais-data")
print(os.listdir("./Data/"))

In [None]:
# Git Config
std_out = subprocess.run(["git", "config", "--global", "user.email", '"dennishuynh3@gmail.com"'], capture_output=True, text=True)
print(std_out)
std_out = subprocess.run(["git", "config", "--global", "user.name", '"DennisH3"'], capture_output=True, text=True)
print(std_out)

# Git add and commit
std_out = subprocess.run(["git", "add", "."], capture_output=True, text=True)
print(std_out)
std_out = subprocess.run(["git", "commit", "-m", "Upload data"], capture_output=True, text=True).stdout
print(std_out)

In [None]:
# Get credentials
un = "DennisH3"
pw = getpass.getpass(prompt="Please enter PAT")

In [None]:
push = "git push --repo https://{}:{}@github.com/CSBP-CPSE/ais-data.git".format(un, pw)

std_out = subprocess.run([push], capture_output=True, text=True, shell=True).stderr
print(std_out)