# AIS Data Exploration

## Data Download

In [5]:
from pathlib import Path

import geopandas as gpd
import httpx

In [9]:
# Remote source and local destination for one day of AIS data.
# The local file name is taken from the last URL segment so the two cannot
# drift apart when the date in the URL is changed.
url = "https://marinecadastre.gov/downloads/ais2024/ais-2024-01-01.parquet"
output_path = Path("../../data/raw/ais") / url.rsplit("/", 1)[-1]
# Only the destination directory is created here; the transfer itself is
# performed by a later cell.
output_path.parent.mkdir(parents=True, exist_ok=True)

In [10]:
# Fetch the file only when it is not already on disk, so re-running the
# notebook does not repeat a large transfer. Delete the local file to force
# a fresh download.
if not output_path.exists():
    # Stream into a sibling ".part" file and rename it once the transfer has
    # finished: an interrupted or failed download then never leaves a
    # truncated parquet file at output_path.
    partial_path = output_path.with_name(output_path.name + ".part")
    with httpx.stream("GET", url, follow_redirects=True) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as data.
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)
    partial_path.replace(output_path)

# Report the file now present at output_path and its size in megabytes.
print(f"Downloaded: {output_path} ({output_path.stat().st_size / 1e6:.1f} MB)")

Downloaded: ../../data/raw/ais/ais-2024-01-01.parquet (227.2 MB)


## Data Exploration

In [11]:
# Load the downloaded AIS parquet file into a GeoDataFrame and preview the
# first 20 rows. geopandas is imported together with the other dependencies
# in the notebook's import cell rather than mid-notebook.
df = gpd.read_parquet(output_path)
df.head(20)

In [25]:
# Plain-text overview of the frame: shape, dtypes, descriptive statistics,
# per-column null counts and column names, each section separated by a
# 100-character divider line.
divider = "x" * 100
print(f"Df Shape: {df.shape}", divider, sep="\n")
print(f" Data types: {df.dtypes}", divider, sep="\n")
print(f" Df Description: {df.describe()}", divider, sep="\n")
print(f" Null values: {df.isnull().sum()}", divider, sep="\n")
print(f" Columns: {df.columns.tolist()}")

Df Shape: (7293408, 16)
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 Data types: mmsi                     int32[pyarrow]
base_date_time    timestamp[s][pyarrow]
sog                      float[pyarrow]
cog                      float[pyarrow]
heading                  int32[pyarrow]
vessel_name                      string
imo                              string
call_sign                        string
vessel_type              int32[pyarrow]
status                   int32[pyarrow]
length                   float[pyarrow]
width                    int32[pyarrow]
draft                    float[pyarrow]
cargo                    int32[pyarrow]
transceiver                      string
geometry                       geometry
dtype: object
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 Df Description:                    mmsi       base_date_time        sog         cog  \
count         72934

In [28]:
# Number of rows recorded for each vessel_type code.
df["vessel_type"].value_counts()

vessel_type
31     1441365
37     1387385
52     1209413
30      545293
36      465413
        ...   
94         387
255        164
78          17
19           5
16           5
Name: count, Length: 72, dtype: int64[pyarrow]

In [29]:
# Ten most frequent values of each text column. Empty-string entries, if
# present, appear in the printed counts as a row with a blank label.
for column_name in ("vessel_name", "imo", "transceiver"):
    print(df[column_name].value_counts().head(10))

vessel_name
FREEDOM         13936
LIBERTY          8746
                 6942
INDEPENDENCE     6275
MAVERICK         5585
AURORA           5247
TEXAS            4712
DAUNTLESS        4702
ENDEAVOR         4650
DESTINY          4363
Name: count, dtype: int64[pyarrow]
imo
              4319862
IMO0000001      16704
IMO0000101       3725
IMO0000004       2757
IMO8851273       2557
IMO0000526       2490
IMO8644383       2483
IMO0000032       2452
IMO8967632       2437
IMO9813644       2432
Name: count, dtype: int64[pyarrow]
transceiver
A    5283787
B    2009621
Name: count, dtype: int64[pyarrow]


In [30]:
# Total footprint of the frame in gigabytes; deep=True is passed so that
# object/string contents are measured rather than only their references.
total_bytes = df.memory_usage(deep=True).sum()
print(f"Memory usage: {total_bytes / 1e9:.2f} GB")

Memory usage: 0.81 GB
