## Time wise Filtration
Each unique vessel (MMSI) has multiple records within any one-minute window, which inflates the data size considerably. To make the analysis easier and reduce the file size further, I kept at most one record per MMSI within any 2-minute window, based on the timestamps: a record is kept only if at least 120 seconds have passed since the previously kept record of the same vessel.
## Length of vessels
I dropped all vessels shorter than 65 meters, as these are most likely not actual cargo vessels. Records whose length is missing or non-numeric are dropped as well.

In [1]:
import pandas as pd
from pathlib import Path

# --- CONFIGURATION ---
# Folder scanned for the per-port "*.csv" input files.
INPUT_DIR = Path(r'D:\Thesis Work MLS\Denmark AIS data\processed_files\ports_filtered_flat')
# Folder receiving the filtered files (same file names as the inputs).
OUTPUT_DIR = Path(r'D:\Thesis Work MLS\Denmark AIS data\processed_files\timefiltered_flat')
# parents=True: also create missing intermediate folders instead of failing
# with FileNotFoundError on a fresh machine / fresh data directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Column names as they appear in the input CSV headers.
TIMESTAMP_COL = '# Timestamp'
MMSI_COL = 'MMSI'
LENGTH_COL = 'Length'
SHIPTYPE_COL = 'Ship type'

# --- Smart per-vessel thinning function ---
def thin_per_vessel(df, time_col, min_seconds=120):
    """Keep only rows at least ``min_seconds`` after the previously kept row.

    The first row is always kept; every later row is kept only when its
    timestamp is at least ``min_seconds`` after the timestamp of the last
    *kept* row (not the last seen row).

    Args:
        df: Rows of a single vessel, already ordered by ``time_col``.
        time_col: Name of the column holding datetime-like timestamps.
        min_seconds: Minimum gap, in seconds, between two kept rows.

    Returns:
        The kept rows of ``df`` in their original order, with their original
        index labels.
    """
    keep_pos = []
    last_time = None
    # Iterate over the timestamp column only: iterrows() would build a full
    # Series for every row just to read a single value from it.
    for pos, curr_time in enumerate(df[time_col]):
        if last_time is None or (curr_time - last_time).total_seconds() >= min_seconds:
            keep_pos.append(pos)
            last_time = curr_time
    # Positional selection: label-based .loc would return duplicated rows
    # whenever the incoming index contains repeated labels.
    return df.iloc[keep_pos]

# --- Summary to show in notebook
summaries = []


def _count_vessels(frame, keyword=None):
    """Count unique MMSIs in ``frame``.

    When ``keyword`` is given, only rows whose ship type contains it
    (case-insensitive) are counted.
    """
    if keyword is not None:
        frame = frame[frame[SHIPTYPE_COL].str.lower().str.contains(keyword)]
    return frame[MMSI_COL].nunique()


for file in sorted(INPUT_DIR.glob("*.csv")):
    print(f"\nProcessing {file.name}...")
    df = pd.read_csv(file, low_memory=False)

    # --- Vessel Length Filtering ---
    # Non-numeric lengths become NaN and therefore fail the >= 65 test too.
    df[LENGTH_COL] = pd.to_numeric(df[LENGTH_COL], errors='coerce')
    # Explicit copy so the column assignments below operate on an independent
    # frame rather than on a slice of the raw data (avoids chained-assignment
    # warnings and ambiguity).
    df = df[df[LENGTH_COL] >= 65].copy()   # Only vessels >= 65 meters

    # --- Ship type as string (if needed) ---
    df[SHIPTYPE_COL] = df[SHIPTYPE_COL].astype(str)

    # --- Timestamp parsing (robust) ---
    df[TIMESTAMP_COL] = pd.to_datetime(df[TIMESTAMP_COL], errors='coerce', utc=True, dayfirst=True)
    # Warn only when more than 1% of the timestamps failed to parse.
    if df[TIMESTAMP_COL].isnull().mean() > 0.01:
        print(f"WARNING: {df[TIMESTAMP_COL].isnull().sum()} timestamps could not be parsed in {file.name}. Check format or missing values.")

    df = df.dropna(subset=[TIMESTAMP_COL, MMSI_COL])

    # --- Sort by vessel and time ---
    df = df.sort_values([MMSI_COL, TIMESTAMP_COL]).reset_index(drop=True)

    # --- PER-VESSEL 2-minute thinning (always from last kept) ---
    # groupby keeps the row order inside each group, so every vessel's rows
    # are already time-ordered by the sort above; no per-vessel re-sort needed.
    thinned_list = [
        thin_per_vessel(vessel_df, TIMESTAMP_COL, min_seconds=120)
        for _, vessel_df in df.groupby(MMSI_COL)
    ]
    if thinned_list:
        filtered = pd.concat(thinned_list, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; this happens when no
        # row survives the filters above. Keep an empty frame with the same
        # columns so the output file and the summary row are still produced.
        filtered = df.iloc[0:0]

    # --- Stats before and after ---
    before_total = _count_vessels(df)
    before_cargo = _count_vessels(df, 'cargo')
    before_tanker = _count_vessels(df, 'tanker')

    after_total = _count_vessels(filtered)
    after_cargo = _count_vessels(filtered, 'cargo')
    after_tanker = _count_vessels(filtered, 'tanker')

    # --- Save filtered file ---
    output_path = OUTPUT_DIR / file.name
    filtered.to_csv(output_path, index=False)

    # --- Append summary ---
    summaries.append({
        'File': file.name,
        'Unique vessels (before)': before_total,
        'Unique vessels (after)': after_total,
        'Cargo vessels (before)': before_cargo,
        'Cargo vessels (after)': after_cargo,
        'Tanker vessels (before)': before_tanker,
        'Tanker vessels (after)': after_tanker,
        'Rows (before)': len(df),
        'Rows (after)': len(filtered),
    })

# --- Display summary in notebook ---
summary_df = pd.DataFrame(summaries)
display(summary_df)



Processing Aabenraa.csv...

Processing Aalborg.csv...

Processing Aarhaus.csv...

Processing Copenhagen.csv...

Processing Esbjerg.csv...

Processing Fredericia.csv...

Processing Kalundborg.csv...


Unnamed: 0,File,Unique vessels (before),Unique vessels (after),Cargo vessels (before),Cargo vessels (after),Tanker vessels (before),Tanker vessels (after),Rows (before),Rows (after)
0,Aabenraa.csv,73,73,57,57,16,16,181905,55672
1,Aalborg.csv,164,164,139,139,25,25,546154,116853
2,Aarhaus.csv,224,224,177,177,47,47,1464292,219066
3,Copenhagen.csv,51,51,47,47,4,4,531825,74301
4,Esbjerg.csv,133,133,107,107,26,26,1898603,348250
5,Fredericia.csv,201,201,132,132,69,69,786654,132143
6,Kalundborg.csv,123,123,53,53,70,70,848706,166365
