## Remove vessels below 65 meters and keep one record per 2-minute timeframe
Since a vessel below 65 meters in length will not be a true cargo or tanker vessel, we can drop those vessels. Also, the timestamps in the data show that there are too many records for each unique vessel within a one-minute timeframe. We can therefore reduce the data size by keeping only one record per unique vessel in each two-minute timeframe, based on the timestamps in the data.

In [1]:
import os
import pandas as pd

from datetime import timedelta

# --- Folders ---
input_folder = r'D:\Thesis Work MLS\Norway Data Filtered\Port_Split_Result'
output_folder = os.path.join(input_folder, 'Cleaned')

# --- Filter settings ---
MIN_LENGTH_M = 65                  # vessels shorter than this are dropped
MIN_GAP = timedelta(minutes=2)     # minimum spacing between kept rows of one vessel
# Strict format: requires the 'T' separator, fractional seconds and a trailing 'Z'.
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'


def thin_positions(vessel_ids, times, min_gap):
    """Return the positions of the rows to keep after time-based thinning.

    The inputs must already be sorted by (vessel, time). For each vessel the
    first row is kept; after that a row is kept only when its time is at
    least ``min_gap`` later than the previously *kept* row of that vessel.

    Args:
        vessel_ids: Sequence of vessel identifiers, one per row.
        times: Sequence of row times, parallel to ``vessel_ids``. Any type
            supporting ``time + min_gap`` and ``>=`` works (timestamps with
            a timedelta gap, or plain numbers with a numeric gap).
        min_gap: Minimum distance between two kept rows of one vessel.

    Returns:
        List of 0-based positions, in input order, of the rows to keep.
    """
    kept = []
    current_vessel = object()   # sentinel: never equal to a real identifier
    next_allowed = None         # earliest time the current vessel may emit again
    for pos, (vessel, stamp) in enumerate(zip(vessel_ids, times)):
        # A change of vessel always starts a new sequence with its first row.
        if vessel != current_vessel or stamp >= next_allowed:
            kept.append(pos)
            current_vessel = vessel
            next_allowed = stamp + min_gap
    return kept


def clean_port_file(in_path, out_path):
    """Filter one port CSV and write the reduced file.

    Steps: drop rows missing 'mmsi', 'date_time_utc' or 'length'; drop rows
    whose length is non-numeric or below ``MIN_LENGTH_M``; drop rows whose
    timestamp does not match ``TIMESTAMP_FORMAT``; then keep, per vessel,
    only rows at least ``MIN_GAP`` apart. All columns are written back; the
    'length' and 'date_time_utc' columns are written in their converted
    (numeric / datetime) form, every other column as the string read.

    Args:
        in_path: Path of the CSV to read.
        out_path: Path of the CSV to write.

    Returns:
        Tuple ``(vessels_before, vessels_after, rows_written)`` where the
        vessel counts are unique 'mmsi' values before and after thinning.

    Raises:
        KeyError: If one of the required columns is missing from the file.
    """
    # Read everything as string, then convert length and date explicitly.
    df = pd.read_csv(in_path, dtype=str)
    df = df.dropna(subset=['mmsi', 'date_time_utc', 'length'])

    df['length'] = pd.to_numeric(df['length'], errors='coerce')
    df = df.dropna(subset=['length'])
    # .copy() so the column assignment below does not write into a slice.
    df = df[df['length'] >= MIN_LENGTH_M].copy()

    # Strict parsing: anything not matching the format becomes NaT and is dropped.
    df['date_time_utc'] = pd.to_datetime(
        df['date_time_utc'], format=TIMESTAMP_FORMAT, errors='coerce'
    )
    df = df.dropna(subset=['date_time_utc'])

    before_vessels = df['mmsi'].nunique()

    # Sorting makes each vessel's rows contiguous and chronological, so a
    # single pass over plain lists replaces the per-row iterrows() loop.
    df = df.sort_values(['mmsi', 'date_time_utc'])
    keep = thin_positions(
        df['mmsi'].tolist(), df['date_time_utc'].tolist(), MIN_GAP
    )
    filtered_df = df.iloc[keep]
    after_vessels = filtered_df['mmsi'].nunique()

    filtered_df.to_csv(out_path, index=False)
    return before_vessels, after_vessels, len(filtered_df)


def main():
    """Clean every CSV in ``input_folder`` and write it to ``output_folder``."""
    os.makedirs(output_folder, exist_ok=True)
    files = [f for f in os.listdir(input_folder) if f.lower().endswith('.csv')]

    for fname in files:
        print(f"\nProcessing {fname} ...")
        in_path = os.path.join(input_folder, fname)
        out_path = os.path.join(output_folder, fname)

        before_vessels, after_vessels, rows_written = clean_port_file(in_path, out_path)

        print(f" • Unique vessels before: {before_vessels}")
        print(f" • Unique vessels after:  {after_vessels}")
        print(f" • Rows written: {rows_written} to {out_path}")

    print("\nALL PORTS CLEANED AND SAVED.")


if __name__ == "__main__":
    main()



Processing Bergen Terminal.csv ...
 • Unique vessels before: 129
 • Unique vessels after:  129
 • Rows written: 428016 to D:\Thesis Work MLS\Norway Data Filtered\Port_Split_Result\Cleaned\Bergen Terminal.csv

Processing Drammen Port.csv ...
 • Unique vessels before: 77
 • Unique vessels after:  77
 • Rows written: 122155 to D:\Thesis Work MLS\Norway Data Filtered\Port_Split_Result\Cleaned\Drammen Port.csv

Processing Kristiansand Terminal.csv ...
 • Unique vessels before: 81
 • Unique vessels after:  81
 • Rows written: 119692 to D:\Thesis Work MLS\Norway Data Filtered\Port_Split_Result\Cleaned\Kristiansand Terminal.csv

Processing Oslo Port Area.csv ...
 • Unique vessels before: 152
 • Unique vessels after:  152
 • Rows written: 177779 to D:\Thesis Work MLS\Norway Data Filtered\Port_Split_Result\Cleaned\Oslo Port Area.csv

Processing Stavanger Westport Terminal.csv ...
 • Unique vessels before: 138
 • Unique vessels after:  138
 • Rows written: 336406 to D:\Thesis Work MLS\Norway Dat