# Define the Oslo Port polygon and process the files one by one, reading each file in chunks of 100,000 rows so that the system is not overloaded by loading everything at once

In [None]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon

# Raw CSV files are read from the current working directory, so the
# notebook has to be started from the folder that contains them.
raw_data_folder = os.getcwd()

# Filtered results go into a dedicated sub-folder (created when missing).
oslo_folder = os.path.join(raw_data_folder, "processed_oslo_port")
os.makedirs(oslo_folder, exist_ok=True)

# Oslo Port area of interest as a ring of (longitude, latitude) vertices;
# the last vertex repeats the first one to close the ring.
oslo_polygon = Polygon(
    [
        (10.6624659, 59.8957689),
        (10.7074412, 59.8788027),
        (10.7706126, 59.898524),
        (10.7548197, 59.9150499),
        (10.6986865, 59.9118659),
        (10.6624659, 59.8957689),
    ]
)

# Columns loaded from every raw CSV; any other column is skipped on read.
use_columns = [
    "date_time_utc",
    "mmsi",
    "longitude",
    "latitude",
    "status",
    "course_over_ground",
    "speed_over_ground",
    "rate_of_turn",
    "maneuvre",
    "imo",
    "callsign",
    "ship_name",
    "ship_type",
    "length",
    "draught",
]

# Filter every raw "hais_*.csv" file in `raw_data_folder` down to the rows
# whose position lies inside `oslo_polygon`, writing one filtered CSV per
# input file into `oslo_folder`. Files are handled one at a time and read in
# chunks so that a whole file is never held in memory.
chunk_size = 100000  # Rows per read; bounds peak memory use

for file in sorted(os.listdir(raw_data_folder)):
    # Process only relevant CSV files
    if not (file.startswith("hais_") and file.endswith(".csv")):
        continue

    file_path = os.path.join(raw_data_folder, file)
    output_path = os.path.join(
        oslo_folder, os.path.splitext(file)[0] + "_oslo_filtered.csv"
    )

    print(f"🔄 Processing: {file}...")

    # A failure in one file is reported and must not stop the remaining files.
    try:
        total_rows = 0
        filtered_rows = 0

        # True until the first matching rows have been written. The first
        # write truncates the output (so a re-run does not duplicate rows from
        # an earlier run) and emits the header exactly once; later writes
        # append data only. Writing the header on every append would embed
        # header lines in the middle of the data.
        first_write = True

        # The chunked reader is a context manager; using it guarantees the
        # input file handle is closed even if a chunk raises.
        with pd.read_csv(
            file_path,
            usecols=use_columns,
            chunksize=chunk_size,
            delimiter=",",  # Explicitly set separator
            dtype={"longitude": float, "latitude": float},  # Force correct data types
            on_bad_lines="warn",  # Skip malformed lines, warning about each one
            encoding_errors="ignore",  # Drop undecodable bytes instead of failing
        ) as chunks:
            for chunk in chunks:
                # Counted before any filtering: rows as read from the file
                total_rows += len(chunk)

                # Rows without a position cannot be tested against the polygon
                chunk = chunk.dropna(subset=["longitude", "latitude"])

                # Build all points in one vectorised call, aligned on the
                # chunk's index so the boolean mask can index the chunk itself.
                points = gpd.GeoSeries(
                    gpd.points_from_xy(chunk["longitude"], chunk["latitude"]),
                    index=chunk.index,
                    crs="EPSG:4326",
                )

                # Keep rows that fall within the Oslo Port polygon
                inside = chunk[points.within(oslo_polygon)]
                if inside.empty:
                    continue

                filtered_rows += len(inside)

                # Save filtered data incrementally
                inside.to_csv(
                    output_path,
                    mode="w" if first_write else "a",
                    header=first_write,
                    index=False,
                )
                first_write = False

        print(f"✔ Completed: {file} | Total Rows: {total_rows} | Kept: {filtered_rows}")

    except Exception as e:
        print(f"❌ Error processing {file}: {e}")

print("\n✅ All files processed for Oslo Port! Check the 'processed_oslo_port' folder.")


In [7]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
import csv

# Raw CSV files are read from the current working directory, so the
# notebook has to be started from the folder that contains them.
raw_data_folder = os.getcwd()

# Output of this re-processing pass goes into its own sub-folder
# (created when missing).
remaining_oslo_folder = os.path.join(raw_data_folder, "remaining_processed_oslo_files")
os.makedirs(remaining_oslo_folder, exist_ok=True)

# Oslo Port area of interest as a ring of (longitude, latitude) vertices;
# the last vertex repeats the first one to close the ring.
oslo_polygon = Polygon(
    [
        (10.6624659, 59.8957689),
        (10.7074412, 59.8788027),
        (10.7706126, 59.898524),
        (10.7548197, 59.9150499),
        (10.6986865, 59.9118659),
        (10.6624659, 59.8957689),
    ]
)

# Files to run through this re-processing pass, in processing order.
problematic_files = [
    "hais_2024-01-05.csv",
    "hais_2024-02-13.csv",
    "hais_2024-02-14.csv",
    "hais_2024-02-15.csv",
    "hais_2024-02-16.csv",
    "hais_2024-02-19.csv",
    "hais_2024-02-22.csv",
    "hais_2024-04-03.csv",
    "hais_2024-04-04.csv",
    "hais_2024-04-05.csv",
    "hais_2024-04-26.csv",
    "hais_2024-04-30.csv",
]

# Columns loaded from every raw CSV; any other column is skipped on read.
use_columns = [
    "date_time_utc",
    "mmsi",
    "longitude",
    "latitude",
    "status",
    "course_over_ground",
    "speed_over_ground",
    "rate_of_turn",
    "maneuvre",
    "imo",
    "callsign",
    "ship_name",
    "ship_type",
    "length",
    "draught",
]

# Re-run the Oslo Port filter for the files listed in `problematic_files`,
# this time reading them with quoting disabled, and write one filtered CSV per
# input file into `remaining_oslo_folder`. Files are read in chunks so that a
# whole file is never held in memory.
chunk_size = 100000  # Rows per read; bounds peak memory use

for file in problematic_files:
    file_path = os.path.join(raw_data_folder, file)
    output_path = os.path.join(
        remaining_oslo_folder, os.path.splitext(file)[0] + "_oslo_filtered.csv"
    )

    # Report listed files that are absent instead of skipping them silently,
    # so a typo in the list or a missing download does not go unnoticed.
    if not os.path.exists(file_path):
        print(f"⚠ Skipping {file}: file not found in {raw_data_folder}")
        continue

    print(f"🔄 Re-processing: {file}...")

    # A failure in one file is reported and must not stop the remaining files.
    try:
        total_rows = 0
        filtered_rows = 0

        # True until the first matching rows have been written. The first
        # write truncates the output (so a re-run does not duplicate rows from
        # an earlier run) and emits the header exactly once; later writes
        # append data only. Writing the header on every append would embed
        # header lines in the middle of the data.
        first_write = True

        # The chunked reader is a context manager; using it guarantees the
        # input file handle is closed even if a chunk raises.
        with pd.read_csv(
            file_path,
            usecols=use_columns,
            chunksize=chunk_size,
            delimiter=",",  # Explicit separator
            dtype={"longitude": float, "latitude": float},  # Force correct data types
            on_bad_lines="warn",  # Skip malformed lines, warning about each one
            encoding_errors="ignore",  # Drop undecodable bytes instead of failing
            # Treat quotes as regular characters.
            # NOTE(review): presumably a workaround for unbalanced quotes in
            # these files; quote characters then stay in the text fields.
            quoting=csv.QUOTE_NONE,
        ) as chunks:
            for chunk in chunks:
                # Counted before any filtering: rows as read from the file
                total_rows += len(chunk)

                # Rows without a position cannot be tested against the polygon
                chunk = chunk.dropna(subset=["longitude", "latitude"])

                # Build all points in one vectorised call, aligned on the
                # chunk's index so the boolean mask can index the chunk itself.
                points = gpd.GeoSeries(
                    gpd.points_from_xy(chunk["longitude"], chunk["latitude"]),
                    index=chunk.index,
                    crs="EPSG:4326",
                )

                # Keep rows that fall within the Oslo Port polygon
                inside = chunk[points.within(oslo_polygon)]
                if inside.empty:
                    continue

                filtered_rows += len(inside)

                # Save filtered data incrementally
                inside.to_csv(
                    output_path,
                    mode="w" if first_write else "a",
                    header=first_write,
                    index=False,
                )
                first_write = False

        print(f"✔ Completed: {file} | Total Rows: {total_rows} | Kept: {filtered_rows}")

    except Exception as e:
        print(f"❌ Error processing {file}: {e}")

print("\n✅ All remaining problematic files processed! Check the 'remaining_processed_oslo_files' folder.")


🔄 Re-processing: hais_2024-01-05.csv...
✔ Completed: hais_2024-01-05.csv | Total Rows: 1340991 | Kept: 8761
🔄 Re-processing: hais_2024-02-13.csv...
✔ Completed: hais_2024-02-13.csv | Total Rows: 1293102 | Kept: 6441
🔄 Re-processing: hais_2024-02-14.csv...
✔ Completed: hais_2024-02-14.csv | Total Rows: 1275139 | Kept: 7096
🔄 Re-processing: hais_2024-02-15.csv...
✔ Completed: hais_2024-02-15.csv | Total Rows: 1350220 | Kept: 9271
🔄 Re-processing: hais_2024-02-16.csv...
✔ Completed: hais_2024-02-16.csv | Total Rows: 1355142 | Kept: 8852
🔄 Re-processing: hais_2024-02-19.csv...
✔ Completed: hais_2024-02-19.csv | Total Rows: 1152611 | Kept: 5291
🔄 Re-processing: hais_2024-02-22.csv...
✔ Completed: hais_2024-02-22.csv | Total Rows: 1364070 | Kept: 2150
🔄 Re-processing: hais_2024-04-03.csv...
✔ Completed: hais_2024-04-03.csv | Total Rows: 1145778 | Kept: 2394
🔄 Re-processing: hais_2024-04-04.csv...
✔ Completed: hais_2024-04-04.csv | Total Rows: 1261808 | Kept: 7046
🔄 Re-processing: hais_2024-0