## AIS_logs Daten

In [1]:
import duckdb
import pandas as pd

In [None]:
# Load the AIS log parquet file fully into a pandas DataFrame and preview
# the first rows.
# NOTE(review): the path is absolute and user-specific; consider reading it
# from a configurable data directory so the notebook runs on other machines.
file = pd.read_parquet(
    path="/Users/jakobschneider/Downloads/2024_NOAA_AIS_logs_01.parquet"
)
file.head(n=5)

In [None]:
# Show the column labels of the AIS log frame.
file.columns

Index(['MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading',
       'Status'],
      dtype='object')

In [None]:
# Earliest BaseDateTime value present in the AIS log frame.
file["BaseDateTime"].min()

Timestamp('2024-01-01 00:00:00')

In [None]:
# Latest BaseDateTime value present in the AIS log frame.
file["BaseDateTime"].max()

Timestamp('2024-01-31 23:59:59')

In [None]:
# Number of distinct MMSI values in the AIS log frame.
file["MMSI"].nunique()

33635

In [None]:
# (rows, columns) of the AIS log frame.
file.shape

(221952082, 8)

In [None]:
# Count NULLs per column directly on the parquet file with DuckDB.
# NOTE(review): the path is absolute and user-specific; consider reading it
# from a configurable data directory so the notebook runs on other machines.
file_path = "/Users/jakobschneider/Downloads/2024_NOAA_AIS_logs_01.parquet"

columns = ['MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading', 'Status']

con = duckdb.connect()

# One aggregate per column: COUNT(*) counts every row while COUNT(col) skips
# NULLs, so their difference is the number of NULLs in that column.
# Column names are spliced into the SQL text, so they are emitted as
# double-quoted identifiers (embedded quotes doubled) to keep the statement
# valid for names that contain spaces, punctuation, or reserved words.
exprs = ",\n    ".join(
    'COUNT(*) - COUNT("{ident}") AS "missing_{ident}"'.format(
        ident=col.replace('"', '""')
    )
    for col in columns
)

# The file path is passed as a bound parameter rather than interpolated.
query = f"""
SELECT
    {exprs}
FROM read_parquet(?)
"""

result_wide = con.execute(query, [file_path]).df()
print(result_wide)


# Reshape the single-row wide result to long format: one row per aggregate.
# After the transpose the former column labels become the index and the single
# data row is labelled 0, which is why those two keys are renamed.
missing_long = (
    result_wide
    .T
    .reset_index()
    .rename(columns={"index": "column_name", 0: "missing_values"})
)

print(missing_long)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   missing_MMSI  missing_BaseDateTime  missing_LAT  missing_LON  missing_SOG  \
0             0                     0            0            0            0   

   missing_COG  missing_Heading  missing_Status  
0            0                0        56716142  
            column_name  missing_values
0          missing_MMSI               0
1  missing_BaseDateTime               0
2           missing_LAT               0
3           missing_LON               0
4           missing_SOG               0
5           missing_COG               0
6       missing_Heading               0
7        missing_Status        56716142


In [None]:
# Total number of distinct ships (distinct MMSI values) in the dataset.
# Reuses the DuckDB connection `con` and the `file_path` defined in the
# missing-values cell above instead of re-importing duckdb and opening a
# second connection that would replace the first without closing it.
query_total = """
SELECT COUNT(DISTINCT MMSI) AS distinct_ships
FROM read_parquet(?)
"""

# The query yields exactly one row with one column; unpack the scalar.
distinct_ships = con.execute(query_total, [file_path]).fetchone()[0]
print("Distinct ships in dataset:", distinct_ships)

Distinct ships in dataset: 33635


In [None]:
# Number of distinct ships (distinct MMSI values) per calendar day.
# The timestamp is truncated to its date, rows are grouped by that date,
# and the result is ordered chronologically.
query_per_day = """
SELECT 
    DATE(BaseDateTime) AS day,
    COUNT(DISTINCT MMSI) AS distinct_ships
FROM read_parquet(?)
GROUP BY day
ORDER BY day
"""

# Materialise the result as a pandas DataFrame and print up to 31 rows
# (enough for every day of a month).
ships_per_day = con.execute(query_per_day, [file_path]).fetchdf()
print(ships_per_day.head(n=31))

          day  distinct_ships
0  2024-01-01           14868
1  2024-01-02           15130
2  2024-01-03           15043
3  2024-01-04           14990
4  2024-01-05           14825
5  2024-01-06           14813
6  2024-01-07           14995
7  2024-01-08           15008
8  2024-01-09           14283
9  2024-01-10           15082
10 2024-01-11           15247
11 2024-01-12           15391
12 2024-01-13           15208
13 2024-01-14           14581
14 2024-01-15           15135
15 2024-01-16           15414
16 2024-01-17           14976
17 2024-01-18           14794
18 2024-01-19           15329
19 2024-01-20           15155
20 2024-01-21           14047
21 2024-01-22           14659
22 2024-01-23           14909
23 2024-01-24           15570
24 2024-01-25           14949
25 2024-01-26           15458
26 2024-01-27           15517
27 2024-01-28           15494
28 2024-01-29           15138
29 2024-01-30           15844
30 2024-01-31           15826


: 

## AIS_ships Daten

In [7]:
# Load the ships parquet file and turn the "<Unknown>" placeholder into a
# real missing value (pd.NA) so that NA-aware methods such as isna() see it.
# NOTE(review): the path is absolute and user-specific; consider reading it
# from a configurable data directory so the notebook runs on other machines.
file_ships = (
    pd.read_parquet("/Users/jakobschneider/Downloads/2024_NOAA_AIS_ships_01.parquet")
    .replace(to_replace="<Unknown>", value=pd.NA)
)
file_ships.head(n=5)

Unnamed: 0,MMSI,VesselName,IMO,CallSign,VesselType,Length,Width,Draft,Cargo,TransceiverClass,BaseDateTime
0,0,CG49420,,NWHE,51.0,82.0,12.0,1.6,51.0,B,2024-01-01 00:01:59
1,1,EVENING STAR,IMO0000000,WCV5672,30.0,21.0,6.0,2.0,90.0,B,2024-01-27 23:10:45
2,8,29,,,70.0,78.0,0.0,1.1,0.0,A,2024-01-31 16:57:25
3,11,CONSTITUTION,IMO0000007,GC 680,90.0,53.0,38.0,0.0,90.0,A,2024-01-06 02:26:19
4,111,BOOMVANG,IMO0000001,EB643,90.0,47.0,35.0,0.0,90.0,A,2024-01-06 01:35:07


In [8]:
# (rows, columns) of the ships frame.
file_ships.shape

(33635, 11)

In [9]:
# Show the column labels of the ships frame.
file_ships.columns

Index(['MMSI', 'VesselName', 'IMO', 'CallSign', 'VesselType', 'Length',
       'Width', 'Draft', 'Cargo', 'TransceiverClass', 'BaseDateTime'],
      dtype='object')

In [10]:
# Number of distinct MMSI values in the ships frame.
file_ships["MMSI"].nunique()

33635

In [11]:
# Missing values per column; "<Unknown>" entries were mapped to NA when the
# ships frame was loaded, so they are included in these counts.
file_ships.isna().sum()

MMSI                    0
VesselName           1215
IMO                  6674
CallSign             8561
VesselType           1138
Length               1213
Width                1268
Draft               19376
Cargo               19376
TransceiverClass        0
BaseDateTime            0
dtype: int64

In [14]:
# Check whether ships with TransceiverClass "A" provide more complete data:
# first show how many ships fall into each class, then count the missing
# values per column for the class "A" subset only.
print(file_ships["TransceiverClass"].value_counts())
# .copy() makes the subset an independent frame rather than a view of file_ships.
df_a = file_ships.loc[file_ships["TransceiverClass"].eq("A")].copy()
df_a.isna().sum()

TransceiverClass
B    18315
A    15320
Name: count, dtype: int64


MMSI                   0
VesselName           939
IMO                 6179
CallSign            1701
VesselType           925
Length               947
Width                963
Draft               1095
Cargo               1095
TransceiverClass       0
BaseDateTime           0
dtype: int64