## Squeeze the data based on Polygon
Provided the polygons, the code creates one result file per port name. Each file from the previous step is filtered so that any data point that falls within one of the provided polygons is saved to the respective port file in the defined output location.

In [5]:
import os
import pandas as pd
from shapely.geometry import Point, Polygon

# Number of CSV rows read into memory per chunk.
CHUNK_SIZE = 1_000_000

# Folder holding the CSV files to split; results go to a sub-folder of it.
input_folder = r'D:\Thesis Work MLS\Norway Data Filtered'
output_folder = os.path.join(input_folder, "Port_Split_Result")
os.makedirs(output_folder, exist_ok=True)

# Port name -> closed polygon ring given as (longitude, latitude) vertices.
PORT_POLYGONS = {
    "Bergen Terminal": Polygon([
        (5.3119966, 60.4048807),
        (5.2689096, 60.3941969),
        (5.3233262, 60.3710367),
        (5.3526803, 60.3832551),
        (5.3119966, 60.4048807),
    ]),
    "Stavanger Westport Terminal": Polygon([
        (5.5596949, 58.9367236),
        (5.5461337, 58.9152819),
        (5.5955721, 58.9039355),
        (5.6122233, 58.9324718),
        (5.5596949, 58.9367236),
    ]),
    "Kristiansand Terminal": Polygon([
        (8.0021276, 58.157283),
        (7.9581823, 58.1337279),
        (8.0220403, 58.1192246),
        (8.0388631, 58.1449638),
        (8.0021276, 58.157283),
    ]),
    "Drammen Port": Polygon([
        (10.2470522, 59.7538991),
        (10.1914339, 59.7347841),
        (10.2879076, 59.7134939),
        (10.3028421, 59.7476728),
        (10.2470522, 59.7538991),
    ]),
    "Oslo Port Area": Polygon([
        (10.7220254, 59.9127971),
        (10.6775651, 59.9016079),
        (10.7091508, 59.8693961),
        (10.7781586, 59.8841278),
        (10.7587609, 59.9135716),
        (10.7220254, 59.9127971),
    ]),
}

# Per-port lookup tables, all keyed by port name.
# Bounding box (minx, miny, maxx, maxy) of each polygon, used as a cheap pre-filter.
port_bounds = {name: shape.bounds for name, shape in PORT_POLYGONS.items()}
# Output CSV path of each port.
port_paths = {
    name: os.path.join(output_folder, f"{name}.csv") for name in PORT_POLYGONS
}
# Whether the CSV header has already been written to a port's file.
header_written = dict.fromkeys(PORT_POLYGONS, False)
# Running count of rows retained per port.
summary = dict.fromkeys(PORT_POLYGONS, 0)

# Only CSV files sitting directly in the input folder are processed
# (extension match is case-insensitive).
input_files = [
    entry for entry in os.listdir(input_folder) if entry.lower().endswith('.csv')
]

print(f"Processing {len(input_files)} files...")

# Stream every input CSV in chunks, keep only the rows whose position lies
# inside one of the port polygons, and write them to that port's CSV file.
# A failure in one file is reported and the remaining files are still
# processed; chunks of that file written before the failure stay on disk.
for fname in input_files:
    file_path = os.path.join(input_folder, fname)
    print(f"▶ {fname}")
    try:
        # Read everything as text so malformed values cannot break parsing;
        # the coordinates are converted to numbers explicitly below.
        chunk_iter = pd.read_csv(
            file_path,
            chunksize=CHUNK_SIZE,
            dtype=str,
            engine='python',
            escapechar='\\'
        )
        for chunk in chunk_iter:
            # Drop rows without coordinates, then drop repeated header lines
            # (rows whose value is the column name itself).
            chunk = chunk.dropna(subset=['latitude', 'longitude'])
            is_header_row = (
                chunk['longitude'].str.lower().eq('longitude')
                | chunk['latitude'].str.lower().eq('latitude')
            )
            # Copy AFTER filtering so the column assignments below operate on
            # an independent frame instead of a slice of the original chunk.
            chunk = chunk[~is_header_row].copy()

            # Convert to float; non-numeric values become NaN and are dropped.
            chunk['longitude'] = pd.to_numeric(chunk['longitude'], errors='coerce')
            chunk['latitude'] = pd.to_numeric(chunk['latitude'], errors='coerce')
            chunk = chunk.dropna(subset=['longitude', 'latitude'])

            for port, poly in PORT_POLYGONS.items():
                # Cheap bounding-box pre-filter so the exact polygon test only
                # runs on rows that can possibly be inside.
                minx, miny, maxx, maxy = port_bounds[port]
                bb_mask = (
                    (chunk['longitude'] >= minx) & (chunk['longitude'] <= maxx) &
                    (chunk['latitude'] >= miny) & (chunk['latitude'] <= maxy)
                )
                candidate = chunk[bb_mask]
                if candidate.empty:
                    continue

                # Exact point-in-polygon test. Iterating the two coordinate
                # columns directly avoids building a Series per row, which is
                # what DataFrame.apply(axis=1) does.
                inside = [
                    poly.contains(Point(lon, lat))
                    for lon, lat in zip(candidate['longitude'], candidate['latitude'])
                ]
                filtered = candidate.loc[inside]
                if filtered.empty:
                    continue

                # The first write of this run truncates the port file, so a
                # re-run does not append duplicate rows (and a second header)
                # to output left over from a previous run. Later writes append.
                first_write = not header_written[port]
                filtered.to_csv(
                    port_paths[port],
                    mode='w' if first_write else 'a',
                    index=False,
                    header=first_write
                )
                header_written[port] = True
                summary[port] += len(filtered)
    except Exception as e:
        # Per-file boundary: report the problem and continue with the next file.
        print(f"  ❌ Error in file {fname}: {e}")

print("\nRows retained per port:")
for port, cnt in summary.items():
    print(f" • {port}: {cnt:,} rows → {port_paths[port]}")
print("\nDone!")


Processing 121 files...
▶ hais_2024-01-01.csv
▶ hais_2024-01-02.csv
▶ hais_2024-01-03.csv
▶ hais_2024-01-04.csv
▶ hais_2024-01-05.csv
▶ hais_2024-01-06.csv
▶ hais_2024-01-07.csv
▶ hais_2024-01-08.csv
▶ hais_2024-01-09.csv
▶ hais_2024-01-10.csv
▶ hais_2024-01-11.csv
▶ hais_2024-01-12.csv
▶ hais_2024-01-13.csv
▶ hais_2024-01-14.csv
▶ hais_2024-01-15.csv
▶ hais_2024-01-16.csv
▶ hais_2024-01-17.csv
▶ hais_2024-01-18.csv
▶ hais_2024-01-19.csv
▶ hais_2024-01-20.csv
▶ hais_2024-01-21.csv
▶ hais_2024-01-22.csv
▶ hais_2024-01-23.csv
▶ hais_2024-01-24.csv
▶ hais_2024-01-25.csv
▶ hais_2024-01-26.csv
▶ hais_2024-01-27.csv
▶ hais_2024-01-28.csv
▶ hais_2024-01-29.csv
▶ hais_2024-01-30.csv
▶ hais_2024-01-31.csv
▶ hais_2024-02-01.csv
▶ hais_2024-02-02.csv
▶ hais_2024-02-03.csv
▶ hais_2024-02-04.csv
▶ hais_2024-02-05.csv
▶ hais_2024-02-06.csv
▶ hais_2024-02-07.csv
▶ hais_2024-02-08.csv
▶ hais_2024-02-09.csv
▶ hais_2024-02-10.csv
▶ hais_2024-02-11.csv
▶ hais_2024-02-12.csv
▶ hais_2024-02-13.csv
▶ hais_2