In [1]:
# First remove some unnecessary columns. this gives some errors in some files. in this code I also got parquet files but I do not pursured those files later.

In [1]:
import pandas as pd
from pathlib import Path

# CONFIGURATION
input_dir = Path(".")  # assumes notebook is in same folder as raw CSVs
separate_csv_dir = Path("output_csv_separate")
separate_parquet_dir = Path("output_parquet_separate")
compiled_csv_path = Path("output_csv_compiled/compiled.csv")
compiled_parquet_path = Path("output_parquet_compiled/compiled.parquet")

# Create output directories
separate_csv_dir.mkdir(parents=True, exist_ok=True)
separate_parquet_dir.mkdir(parents=True, exist_ok=True)
compiled_csv_path.parent.mkdir(parents=True, exist_ok=True)
compiled_parquet_path.parent.mkdir(parents=True, exist_ok=True)

# Columns to drop
columns_to_drop = ["data_source", "ais_class", "hex_7", "hex_14", "geometry"]

# Init
compiled_csv = []
compiled_parquet = []
log = []

# PROCESS EACH FILE
for file in sorted(input_dir.glob("*.csv")):
    try:
        print(f"Processing {file.name} ...")
        df = pd.read_csv(file)
        rows_before = len(df)

        df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True, errors='ignore')
        rows_after = len(df)

        # Save as individual CSV and Parquet
        filename = file.stem
        df.to_csv(separate_csv_dir / f"{filename}.csv", index=False)
        df.to_parquet(separate_parquet_dir / f"{filename}.parquet", index=False)

        # Append to master list
        compiled_csv.append(df)
        compiled_parquet.append(df)
        log.append((file.name, rows_before, rows_after))

    except Exception as e:
        log.append((file.name, "ERROR", str(e)))

# COMPILE ALL TO ONE FILE
if compiled_csv:
    pd.concat(compiled_csv, ignore_index=True).to_csv(compiled_csv_path, index=False)
if compiled_parquet:
    pd.concat(compiled_parquet, ignore_index=True).to_parquet(compiled_parquet_path, index=False)

# LOG OUTPUT
print("\n=== Processing Summary ===")
for fname, before, after in log:
    print(f"{fname}: rows before = {before}, rows after = {after}")
print(f"\nTotal rows compiled: {sum([x[2] for x in log if isinstance(x[2], int)])}")


Processing hais_2024-01-01.csv ...
Processing hais_2024-01-02.csv ...
Processing hais_2024-01-03.csv ...
Processing hais_2024-01-04.csv ...
Processing hais_2024-01-05.csv ...
Processing hais_2024-01-06.csv ...
Processing hais_2024-01-07.csv ...
Processing hais_2024-01-08.csv ...
Processing hais_2024-01-09.csv ...
Processing hais_2024-01-10.csv ...
Processing hais_2024-01-11.csv ...
Processing hais_2024-01-12.csv ...
Processing hais_2024-01-13.csv ...
Processing hais_2024-01-14.csv ...
Processing hais_2024-01-15.csv ...
Processing hais_2024-01-16.csv ...
Processing hais_2024-01-17.csv ...
Processing hais_2024-01-18.csv ...
Processing hais_2024-01-19.csv ...
Processing hais_2024-01-20.csv ...
Processing hais_2024-01-21.csv ...
Processing hais_2024-01-22.csv ...
Processing hais_2024-01-23.csv ...
Processing hais_2024-01-24.csv ...
Processing hais_2024-01-25.csv ...
Processing hais_2024-01-26.csv ...
Processing hais_2024-01-27.csv ...
Processing hais_2024-01-28.csv ...
Processing hais_2024

MemoryError: Unable to allocate 2.11 GiB for an array with shape (2, 141658434) and data type float64

In [5]:
import pandas as pd
from pathlib import Path

# --- INPUT SETTINGS ---
missing_files = [
    "hais_2024-01-05.csv", "hais_2024-02-13.csv", "hais_2024-02-14.csv",
    "hais_2024-02-15.csv", "hais_2024-02-16.csv", "hais_2024-02-19.csv",
    "hais_2024-02-22.csv", "hais_2024-04-03.csv", "hais_2024-04-04.csv",
    "hais_2024-04-05.csv", "hais_2024-04-26.csv", "hais_2024-04-30.csv"
]

columns_to_drop = ["data_source", "ais_class", "hex_7", "hex_14", "geometry"]

# --- OUTPUT FOLDERS ---
csv_output_dir = Path("output_csv_missing_files")
parquet_output_dir = Path("output_parquet_missing_files")

csv_output_dir.mkdir(exist_ok=True)
parquet_output_dir.mkdir(exist_ok=True)

# --- PROCESS FILES ---
print("🔁 Processing missing files...\n")

for file_name in missing_files:
    try:
        input_path = Path(file_name)
        if not input_path.exists():
            print(f"❌ File not found: {file_name}")
            continue

        print(f"📄 Reading {file_name} ...")
        df = pd.read_csv(input_path, on_bad_lines='skip')
        rows_before = len(df)

        df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True, errors='ignore')
        rows_after = len(df)

        # Output paths
        csv_output_path = csv_output_dir / file_name
        parquet_output_path = parquet_output_dir / file_name.replace(".csv", ".parquet")

        # Save filtered files
        df.to_csv(csv_output_path, index=False)
        df.to_parquet(parquet_output_path, index=False)

        print(f"✅ {file_name} - Rows before: {rows_before}, after: {rows_after}")
        print(f"   Saved CSV → {csv_output_path}")
        print(f"   Saved Parquet → {parquet_output_path}\n")

    except Exception as e:
        print(f"❌ Error processing {file_name}: {e}\n")


🔁 Processing missing files...

📄 Reading hais_2024-01-05.csv ...
❌ Error processing hais_2024-01-05.csv: Error tokenizing data. C error: EOF inside string starting at row 175732

📄 Reading hais_2024-02-13.csv ...
❌ Error processing hais_2024-02-13.csv: Error tokenizing data. C error: EOF inside string starting at row 1085892

📄 Reading hais_2024-02-14.csv ...
❌ Error processing hais_2024-02-14.csv: Error tokenizing data. C error: EOF inside string starting at row 738231

📄 Reading hais_2024-02-15.csv ...
❌ Error processing hais_2024-02-15.csv: Error tokenizing data. C error: EOF inside string starting at row 1341694

📄 Reading hais_2024-02-16.csv ...
❌ Error processing hais_2024-02-16.csv: Error tokenizing data. C error: EOF inside string starting at row 116773

📄 Reading hais_2024-02-19.csv ...
❌ Error processing hais_2024-02-19.csv: Error tokenizing data. C error: EOF inside string starting at row 475130

📄 Reading hais_2024-02-22.csv ...
❌ Error processing hais_2024-02-22.csv: Error

In [8]:
import os
import csv
import pandas as pd
from pathlib import Path

# Target files
missing_files = [
    "hais_2024-01-05.csv", "hais_2024-02-13.csv", "hais_2024-02-14.csv",
    "hais_2024-02-15.csv", "hais_2024-02-16.csv", "hais_2024-02-19.csv",
    "hais_2024-02-22.csv", "hais_2024-04-03.csv", "hais_2024-04-04.csv",
    "hais_2024-04-05.csv", "hais_2024-04-26.csv", "hais_2024-04-30.csv"
]

# Columns to drop
drop_cols = set(["data_source", "ais_class", "hex_7", "hex_14", "geometry"])

# Output dirs
base_dir = Path(".")
csv_output_dir = base_dir / "output_csv_missing_files_safe_v2"
parquet_output_dir = base_dir / "output_parquet_missing_files_safe_v2"
csv_output_dir.mkdir(exist_ok=True)
parquet_output_dir.mkdir(exist_ok=True)

# Process each file line-by-line
for file in missing_files:
    raw_path = base_dir / file
    csv_out_path = csv_output_dir / file
    parquet_out_path = parquet_output_dir / file.replace(".csv", ".parquet")

    print(f"🔄 Processing {file} ...")
    try:
        with open(raw_path, "r", encoding="utf-8", errors="ignore") as f_in, open(csv_out_path, "w", newline='', encoding="utf-8") as f_out:
            reader = csv.DictReader(f_in)
            # Remove unwanted columns from header
            clean_fields = [field for field in reader.fieldnames if field not in drop_cols]

            writer = csv.DictWriter(f_out, fieldnames=clean_fields)
            writer.writeheader()

            total = 0
            kept = 0
            for row in reader:
                total += 1
                try:
                    clean_row = {k: row[k] for k in clean_fields}
                    writer.writerow(clean_row)
                    kept += 1
                except Exception:
                    # Skip broken rows
                    continue

        # Save to Parquet
        print(f"📦 Reading filtered {file} for Parquet export...")
        df = pd.read_csv(csv_out_path)
        df.to_parquet(parquet_out_path, index=False)

        print(f"✅ Done: {file} | Rows Kept: {kept} / {total}\n")

    except Exception as e:
        print(f"❌ Failed to process {file} | {e}\n")


🔄 Processing hais_2024-01-05.csv ...
❌ Failed to process hais_2024-01-05.csv | field larger than field limit (131072)

🔄 Processing hais_2024-02-13.csv ...
❌ Failed to process hais_2024-02-13.csv | field larger than field limit (131072)

🔄 Processing hais_2024-02-14.csv ...
❌ Failed to process hais_2024-02-14.csv | field larger than field limit (131072)

🔄 Processing hais_2024-02-15.csv ...
❌ Failed to process hais_2024-02-15.csv | field larger than field limit (131072)

🔄 Processing hais_2024-02-16.csv ...
❌ Failed to process hais_2024-02-16.csv | field larger than field limit (131072)

🔄 Processing hais_2024-02-19.csv ...
❌ Failed to process hais_2024-02-19.csv | field larger than field limit (131072)

🔄 Processing hais_2024-02-22.csv ...
❌ Failed to process hais_2024-02-22.csv | field larger than field limit (131072)

🔄 Processing hais_2024-04-03.csv ...
❌ Failed to process hais_2024-04-03.csv | field larger than field limit (131072)

🔄 Processing hais_2024-04-04.csv ...
❌ Failed to

In [16]:
import os
import csv
import pandas as pd

# Folders
raw_data_folder = os.getcwd()
cleaned_csv_folder = "final_cleaned_csv"
cleaned_parquet_folder = "final_cleaned_parquet"
os.makedirs(cleaned_csv_folder, exist_ok=True)
os.makedirs(cleaned_parquet_folder, exist_ok=True)

# Bad files list
bad_files = [
    "hais_2024-01-05.csv", "hais_2024-02-13.csv", "hais_2024-02-14.csv",
    "hais_2024-02-15.csv", "hais_2024-02-16.csv", "hais_2024-02-19.csv",
    "hais_2024-02-22.csv", "hais_2024-04-03.csv", "hais_2024-04-04.csv",
    "hais_2024-04-05.csv", "hais_2024-04-26.csv", "hais_2024-04-30.csv"
]

columns_to_drop = ['data_source', 'ais_class', 'hex_7', 'hex_14', 'geometry']

# Process each bad file
for file_name in bad_files:
    input_path = os.path.join(raw_data_folder, file_name)
    cleaned_csv_path = os.path.join(cleaned_csv_folder, file_name)
    cleaned_parquet_path = os.path.join(cleaned_parquet_folder, file_name.replace(".csv", ".parquet"))

    print(f"🔧 Cleaning {file_name} ...")

    try:
        # Open and clean manually
        with open(input_path, 'r', encoding='utf-8', errors='replace') as infile, \
             open(cleaned_csv_path, 'w', newline='', encoding='utf-8') as outfile:

            reader = csv.reader(infile)
            writer = csv.writer(outfile)

            # Try to read/write header
            try:
                headers = next(reader)
                writer.writerow(headers)
            except Exception as e:
                print(f"❌ Couldn't read header for {file_name}: {e}")
                continue

            # Process lines safely
            good_lines = 0
            bad_lines = 0
            for line in reader:
                try:
                    if len(line) == len(headers):  # Basic line length check
                        writer.writerow(line)
                        good_lines += 1
                    else:
                        bad_lines += 1
                except:
                    bad_lines += 1

        print(f"✅ Cleaned CSV saved ({good_lines} good lines, {bad_lines} bad lines)")

        # Now load with pandas and save parquet
        df = pd.read_csv(cleaned_csv_path, low_memory=False)

        # Drop specified columns if they exist
        df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True, errors='ignore')

        # Save parquet
        df.to_parquet(cleaned_parquet_path, index=False)

        print(f"✅ Parquet saved: {file_name}\n")

    except Exception as e:
        print(f"❌ Failed on {file_name} with error: {e}\n")


🔧 Cleaning hais_2024-01-05.csv ...
❌ Failed on hais_2024-01-05.csv with error: field larger than field limit (67108864)

🔧 Cleaning hais_2024-02-13.csv ...
✅ Cleaned CSV saved (1085891 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-02-13.csv

🔧 Cleaning hais_2024-02-14.csv ...
❌ Failed on hais_2024-02-14.csv with error: field larger than field limit (67108864)

🔧 Cleaning hais_2024-02-15.csv ...
✅ Cleaned CSV saved (1341693 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-02-15.csv

🔧 Cleaning hais_2024-02-16.csv ...
❌ Failed on hais_2024-02-16.csv with error: field larger than field limit (67108864)

🔧 Cleaning hais_2024-02-19.csv ...
❌ Failed on hais_2024-02-19.csv with error: field larger than field limit (67108864)

🔧 Cleaning hais_2024-02-22.csv ...
❌ Failed on hais_2024-02-22.csv with error: field larger than field limit (67108864)

🔧 Cleaning hais_2024-04-03.csv ...
❌ Failed on hais_2024-04-03.csv with error: field larger than field limit (67108864)

🔧 Cleaning hais_2024

In [18]:
import os
import pandas as pd
import csv
import sys

# Set safe maximum CSV field size
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        max_int = int(max_int / 10)

# Define folder paths
raw_data_folder = os.getcwd()
csv_output_folder = os.path.join(raw_data_folder, "cleaned_remaining_csv")
parquet_output_folder = os.path.join(raw_data_folder, "cleaned_remaining_parquet")

# Ensure output folders exist
os.makedirs(csv_output_folder, exist_ok=True)
os.makedirs(parquet_output_folder, exist_ok=True)

# Files that previously failed
remaining_files = [
    "hais_2024-01-05.csv", "hais_2024-02-13.csv", "hais_2024-02-14.csv",
    "hais_2024-02-15.csv", "hais_2024-02-16.csv", "hais_2024-02-19.csv",
    "hais_2024-02-22.csv", "hais_2024-04-03.csv", "hais_2024-04-04.csv",
    "hais_2024-04-05.csv", "hais_2024-04-26.csv", "hais_2024-04-30.csv"
]

# Columns to drop
columns_to_drop = ['data_source', 'ais_class', 'hex_7', 'hex_14', 'geometry']

def clean_and_process_file(file_name):
    file_path = os.path.join(raw_data_folder, file_name)
    output_csv_path = os.path.join(csv_output_folder, file_name)
    output_parquet_path = os.path.join(parquet_output_folder, file_name.replace(".csv", ".parquet"))

    print(f"\n🔧 Cleaning {file_name} ...")

    good_lines = []
    header = None
    bad_lines = 0

    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as infile:
            reader = csv.reader(infile)
            header = next(reader)
            for row_num, row in enumerate(reader, start=2):
                if len(row) == len(header):
                    good_lines.append(row)
                else:
                    bad_lines += 1

        if not good_lines:
            print(f"❌ No good lines found in {file_name}")
            return

        df = pd.DataFrame(good_lines, columns=header)

        # Drop columns if they exist
        df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True, errors='ignore')

        df.to_csv(output_csv_path, index=False)
        df.to_parquet(output_parquet_path, index=False)

        print(f"✅ Cleaned CSV saved ({len(good_lines)} good lines, {bad_lines} bad lines)")
        print(f"✅ Parquet saved: {file_name}")

    except Exception as e:
        print(f"❌ Failed on {file_name} with error: {e}")

# Run cleaning
for file in remaining_files:
    clean_and_process_file(file)



🔧 Cleaning hais_2024-01-05.csv ...
✅ Cleaned CSV saved (175731 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-01-05.csv

🔧 Cleaning hais_2024-02-13.csv ...
✅ Cleaned CSV saved (1085891 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-02-13.csv

🔧 Cleaning hais_2024-02-14.csv ...
✅ Cleaned CSV saved (738230 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-02-14.csv

🔧 Cleaning hais_2024-02-15.csv ...
✅ Cleaned CSV saved (1341693 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-02-15.csv

🔧 Cleaning hais_2024-02-16.csv ...
✅ Cleaned CSV saved (116772 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-02-16.csv

🔧 Cleaning hais_2024-02-19.csv ...
✅ Cleaned CSV saved (475129 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-02-19.csv

🔧 Cleaning hais_2024-02-22.csv ...
✅ Cleaned CSV saved (388655 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-02-22.csv

🔧 Cleaning hais_2024-04-03.csv ...
✅ Cleaned CSV saved (508156 good lines, 1 bad lines)
✅ Parquet saved: hais_2024-

In [20]:
import os
import csv
import pandas as pd

# Folders
raw_folder = os.getcwd()
csv_processed_folder = os.path.join(raw_folder, "output_csv_separate")
parquet_processed_folder = os.path.join(raw_folder, "output_parquet_separate")

# Only include .csv files from raw
raw_files = [f for f in os.listdir(raw_folder) if f.endswith(".csv") and f.startswith("hais_")]

# Summary list
summary = []

def count_csv_rows(filepath):
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        return sum(1 for line in f) - 1  # exclude header

def count_parquet_rows(filepath):
    try:
        df = pd.read_parquet(filepath)
        return len(df)
    except:
        return None

for filename in sorted(raw_files):
    raw_path = os.path.join(raw_folder, filename)
    csv_proc_path = os.path.join(csv_processed_folder, filename)
    parquet_proc_path = os.path.join(parquet_processed_folder, filename.replace(".csv", ".parquet"))

    raw_rows = count_csv_rows(raw_path)
    csv_rows = count_csv_rows(csv_proc_path) if os.path.exists(csv_proc_path) else None
    parquet_rows = count_parquet_rows(parquet_proc_path) if os.path.exists(parquet_proc_path) else None

    summary.append({
        "File": filename,
        "Raw Rows": raw_rows,
        "Processed CSV Rows": csv_rows,
        "Processed Parquet Rows": parquet_rows,
        "Removed Rows": raw_rows - csv_rows if csv_rows is not None else None
    })

# Convert to DataFrame for display/export
summary_df = pd.DataFrame(summary)
summary_df.to_csv("row_comparison_summary.csv", index=False)

print("✅ Summary created: row_comparison_summary.csv")
print(summary_df)


✅ Summary created: row_comparison_summary.csv
                    File  Raw Rows  Processed CSV Rows  \
0    hais_2024-01-01.csv   1264740             1264740   
1    hais_2024-01-02.csv   1168608             1168608   
2    hais_2024-01-03.csv   1213077             1213077   
3    hais_2024-01-04.csv   1294664             1294664   
4    hais_2024-01-05.csv   1340991              175778   
..                   ...       ...                 ...   
116  hais_2024-04-26.csv   1196245             1155577   
117  hais_2024-04-27.csv   1290112             1290112   
118  hais_2024-04-28.csv   1366811             1366811   
119  hais_2024-04-29.csv   1438768             1438768   
120  hais_2024-04-30.csv   1358923              574995   

     Processed Parquet Rows  Removed Rows  
0                   1264740             0  
1                   1168608             0  
2                   1213077             0  
3                   1294664             0  
4                    175731       116

In [1]:
import os
import csv
import pandas as pd
import sys
from pathlib import Path

# Fix field limit
csv.field_size_limit(2**31 - 1)

# Files to reprocess
problematic_files = [
    "hais_2024-01-05.csv", "hais_2024-02-13.csv", "hais_2024-02-14.csv",
    "hais_2024-02-15.csv", "hais_2024-02-16.csv", "hais_2024-02-19.csv",
    "hais_2024-02-22.csv", "hais_2024-04-03.csv", "hais_2024-04-04.csv",
    "hais_2024-04-05.csv", "hais_2024-04-26.csv", "hais_2024-04-30.csv"
]

# Columns to drop
columns_to_drop = {'data_source', 'ais_class', 'hex_7', 'hex_14', 'geometry'}

# Output folders
raw_folder = Path.cwd()
csv_out = raw_folder / "output_csv_cleaned_final"
parquet_out = raw_folder / "output_parquet_cleaned_final"
csv_out.mkdir(exist_ok=True)
parquet_out.mkdir(exist_ok=True)

summary = []
preview_bad_rows = []

def is_valid_row(row, expected_len):
    return len(row) == expected_len

def fix_row(row, expected_len):
    if len(row) > expected_len:
        return row[:expected_len - 1] + [' '.join(row[expected_len - 1:])]
    elif len(row) < expected_len:
        return row + [''] * (expected_len - len(row))
    return row

for fname in problematic_files:
    print(f"🔧 Processing {fname} ...")
    fpath = raw_folder / fname
    if not fpath.exists():
        print(f"❌ Missing: {fname}")
        continue

    try:
        with open(fpath, "r", encoding="utf-8", errors="replace", newline="") as f:
            reader = csv.reader(f)
            headers = next(reader)
            expected_len = len(headers)
            cleaned_rows = [headers]
            total = 0
            fixed = 0
            previewed = 0

            for row in reader:
                total += 1
                if is_valid_row(row, expected_len):
                    cleaned_rows.append(row)
                else:
                    fixed_row = fix_row(row, expected_len)
                    cleaned_rows.append(fixed_row)
                    fixed += 1
                    if previewed < 3:
                        preview_bad_rows.append(fixed_row)
                        previewed += 1

        # Convert to DataFrame and drop unwanted columns
        df = pd.DataFrame(cleaned_rows[1:], columns=cleaned_rows[0])
        dropped = [col for col in columns_to_drop if col in df.columns]
        df.drop(columns=dropped, inplace=True)

        # Save to CSV
        csv_path = csv_out / fname
        df.to_csv(csv_path, index=False)

        # Save to Parquet
        parquet_path = parquet_out / fname.replace(".csv", ".parquet")
        df.to_parquet(parquet_path, index=False)

        summary.append({
            "File": fname,
            "Original Rows": total,
            "Cleaned Rows": len(df),
            "Fixed Rows": fixed,
            "Dropped Cols": dropped
        })

        print(f"✅ Done: {fname} | Rows: {total} → {len(df)} | Fixed: {fixed} | Dropped: {dropped}")

    except Exception as e:
        print(f"❌ Error: {fname} | {e}")

# Final concise summary
print("\n📊 SUMMARY:")
print(f"{'File':<25}{'Original':>10}{'Cleaned':>10}{'Fixed':>10}{'Dropped':>20}")
for s in summary:
    print(f"{s['File']:<25}{s['Original Rows']:>10,}{s['Cleaned Rows']:>10,}{s['Fixed Rows']:>10,}{str(s['Dropped Cols']):>20}")

# Show sample of fixed bad rows (limit to 3)
print("\n🧪 EXAMPLE FIXED ROWS (max 3):")
for i, row in enumerate(preview_bad_rows):
    print(f"{i+1:02d}: {row[:5]} ... [len={len(row)}]")


🔧 Processing hais_2024-01-05.csv ...
✅ Done: hais_2024-01-05.csv | Rows: 175732 → 175732 | Fixed: 1 | Dropped: ['data_source', 'hex_14', 'ais_class', 'geometry', 'hex_7']
🔧 Processing hais_2024-02-13.csv ...
✅ Done: hais_2024-02-13.csv | Rows: 1085892 → 1085892 | Fixed: 1 | Dropped: ['data_source', 'hex_14', 'ais_class', 'geometry', 'hex_7']
🔧 Processing hais_2024-02-14.csv ...
✅ Done: hais_2024-02-14.csv | Rows: 738231 → 738231 | Fixed: 1 | Dropped: ['data_source', 'hex_14', 'ais_class', 'geometry', 'hex_7']
🔧 Processing hais_2024-02-15.csv ...
✅ Done: hais_2024-02-15.csv | Rows: 1341694 → 1341694 | Fixed: 1 | Dropped: ['data_source', 'hex_14', 'ais_class', 'geometry', 'hex_7']
🔧 Processing hais_2024-02-16.csv ...
✅ Done: hais_2024-02-16.csv | Rows: 116773 → 116773 | Fixed: 1 | Dropped: ['data_source', 'hex_14', 'ais_class', 'geometry', 'hex_7']
🔧 Processing hais_2024-02-19.csv ...
✅ Done: hais_2024-02-19.csv | Rows: 475130 → 475130 | Fixed: 1 | Dropped: ['data_source', 'hex_14', 'ais

In [1]:
import os
import pandas as pd
from pathlib import Path

# Paths
base_path = Path.cwd()
raw_files = list(base_path.glob("hais_*.csv"))
filtered_csv_path = base_path / "output_csv_separate"
filtered_parquet_path = base_path / "output_parquet_separate"

# Summary data
summary = []

print("🔍 Comparing row counts...")

for raw_file in raw_files:
    filename = raw_file.name
    raw_count = 0
    csv_count = None
    parquet_count = None

    try:
        raw_count = sum(1 for _ in open(raw_file, encoding="utf-8", errors="replace")) - 1
    except Exception as e:
        print(f"❌ Error reading raw file: {filename} | {e}")

    filtered_csv_file = filtered_csv_path / filename
    filtered_parquet_file = filtered_parquet_path / filename.replace(".csv", ".parquet")

    if filtered_csv_file.exists():
        try:
            csv_count = sum(1 for _ in open(filtered_csv_file, encoding="utf-8", errors="replace")) - 1
        except Exception as e:
            print(f"❌ Error reading filtered CSV: {filtered_csv_file.name} | {e}")

    if filtered_parquet_file.exists():
        try:
            df_parquet = pd.read_parquet(filtered_parquet_file)
            parquet_count = len(df_parquet)
        except Exception as e:
            print(f"❌ Error reading Parquet: {filtered_parquet_file.name} | {e}")

    summary.append({
        "File": filename,
        "Raw Rows": raw_count,
        "Filtered CSV Rows": csv_count if csv_count is not None else "Missing",
        "Filtered Parquet Rows": parquet_count if parquet_count is not None else "Missing",
        "CSV Loss": raw_count - csv_count if csv_count is not None else "N/A",
        "Parquet Loss": raw_count - parquet_count if parquet_count is not None else "N/A"
    })

# Create summary DataFrame
summary_df = pd.DataFrame(summary)

# Save summary
summary_file = base_path / "row_comparison_summary.csv"
summary_df.to_csv(summary_file, index=False)

# Display small preview
print("\n📊 Row Comparison Summary (top 10):")
display(summary_df.head(10))

print(f"\n✅ Summary saved to: {summary_file}")


🔍 Comparing row counts...

📊 Row Comparison Summary (top 10):


Unnamed: 0,File,Raw Rows,Filtered CSV Rows,Filtered Parquet Rows,CSV Loss,Parquet Loss
0,hais_2024-01-01.csv,1264740,1264740,1264740,0,0
1,hais_2024-01-02.csv,1168608,1168608,1168608,0,0
2,hais_2024-01-03.csv,1213077,1213077,1213077,0,0
3,hais_2024-01-04.csv,1294664,1294664,1294664,0,0
4,hais_2024-01-05.csv,1340991,1340992,175732,-1,1165259
5,hais_2024-01-06.csv,1221781,1221781,1221781,0,0
6,hais_2024-01-07.csv,1123060,1123060,1123060,0,0
7,hais_2024-01-08.csv,1208519,1208519,1208519,0,0
8,hais_2024-01-09.csv,1198969,1198969,1198969,0,0
9,hais_2024-01-10.csv,1238576,1238576,1238576,0,0



✅ Summary saved to: C:\Users\herox\Documents\Thesis Work\Norway Ports Data Filtered\row_comparison_summary.csv
