## Initial data filtering is carried out here
Unnecessary columns are removed from each raw data file to reduce its size; rows with a zero longitude or an unwanted ship type are dropped as well.

In [1]:
import pandas as pd
from pathlib import Path

# --- CONFIG ---
# Placeholder written in place of every cell whose value is 'Unknown'.
UNKNOWN_REPLACEMENT = '.'

# Only rows whose 'Ship type' is one of these values are kept.
KEEP_SHIP_TYPES = {'Cargo', 'Tanker', 'Undefined'}

# Columns removed from every chunk (names missing from a file are ignored).
DROP_COLS = [
    'Data source type',
    'A',
    'B',
    'C',
    'D',
    'Type of position fixing device',
    'Type of mobile',
    'Cargo type',
]

# Number of rows pandas reads per chunk, so whole files never sit in memory.
CHUNK_SIZE = 300_000

# --- PATHS ---
# Raw CSV files are read from input_folder; the filtered copies (and the
# summary) are written to a 'processed_files' sub-folder inside it.
input_folder = Path(r"D:\Thesis Work MLS\Denmark AIS data")
output_folder = input_folder.joinpath('processed_files')
# exist_ok=True makes re-runs safe; parents are not created, so a missing
# input folder raises here rather than being silently created.
output_folder.mkdir(exist_ok=True)

# --- SUMMARY STORAGE ---
# One dict per input file: its name and its row counts before/after filtering.
summary = []

# --- PROCESS ---
# Only CSVs directly inside input_folder are picked up (glob is not
# recursive), so files already written to output_folder are not re-read.
csv_files = sorted(input_folder.glob("*.csv"))
total_files = len(csv_files)
print(f"Found {total_files} CSV files in {input_folder}")

for idx, csv_path in enumerate(csv_files, 1):
    print(f"[{idx}/{total_files}] Starting ▶ {csv_path.name}")
    out_file = output_folder / csv_path.name
    orig_rows = kept_rows = 0

    # Stream the file in CHUNK_SIZE-row pieces instead of loading it whole.
    reader = pd.read_csv(csv_path, chunksize=CHUNK_SIZE)
    for chunk_no, chunk in enumerate(reader):
        orig_rows += len(chunk)

        # Drop unwanted columns; names absent from this file are ignored.
        chunk = chunk.drop(columns=DROP_COLS, errors='ignore')

        # Keep rows with a non-zero longitude AND a wanted ship type.
        wanted = (chunk['Longitude'] != 0) & chunk['Ship type'].isin(KEEP_SHIP_TYPES)
        chunk = chunk[wanted]

        # Swap every cell equal to 'Unknown' for the configured placeholder.
        chunk = chunk.replace('Unknown', UNKNOWN_REPLACEMENT)
        kept_rows += len(chunk)

        # The first chunk creates/overwrites the output and writes the header;
        # every later chunk is appended without one.
        is_first = chunk_no == 0
        chunk.to_csv(
            out_file,
            index=False,
            header=is_first,
            mode='w' if is_first else 'a',
        )

    file_stats = {
        'file': csv_path.name,
        'initial_rows': orig_rows,
        'processed_rows': kept_rows
    }
    summary.append(file_stats)
    print(f"[{idx}/{total_files}] Finished ✓ kept {kept_rows}/{orig_rows} rows\n")

# write summary
summary_path = output_folder / 'summary.csv'
summary_df = pd.DataFrame(summary)
summary_df.to_csv(summary_path, index=False)
print("All done!  Summary written to", summary_path)


Found 110 CSV files in D:\Thesis Work MLS\Denmark AIS data
[1/110] Starting ▶ aisdk-2025-01-01.csv
[1/110] Finished ✓ kept 5972735/14166456 rows

[2/110] Starting ▶ aisdk-2025-01-02.csv
[2/110] Finished ✓ kept 5654470/14517030 rows

[3/110] Starting ▶ aisdk-2025-01-03.csv
[3/110] Finished ✓ kept 6204511/15264341 rows

[4/110] Starting ▶ aisdk-2025-01-04.csv
[4/110] Finished ✓ kept 6558978/15820358 rows

[5/110] Starting ▶ aisdk-2025-01-05.csv
[5/110] Finished ✓ kept 6651933/15945213 rows

[6/110] Starting ▶ aisdk-2025-01-06.csv
[6/110] Finished ✓ kept 6236250/15081905 rows

[7/110] Starting ▶ aisdk-2025-01-07.csv
[7/110] Finished ✓ kept 6338535/14813237 rows

[8/110] Starting ▶ aisdk-2025-01-08.csv
[8/110] Finished ✓ kept 6434965/15121354 rows

[9/110] Starting ▶ aisdk-2025-01-09.csv
[9/110] Finished ✓ kept 6539890/15930906 rows

[10/110] Starting ▶ aisdk-2025-01-10.csv
[10/110] Finished ✓ kept 6986470/16423838 rows

[11/110] Starting ▶ aisdk-2025-01-11.csv
[11/110] Finished ✓ kept 678