In [13]:
import os
import pandas as pd

# Columns stripped from every daily file before it is written back out.
columns_to_remove = [
    'data_source',
    'ais_class',
    'hex_7',
    'hex_14',
    'geometry',
]

# Source directory holding the raw daily CSVs, and the destination directory
# for the filtered copies (created up front if it does not exist yet).
input_folder = r'D:\Thesis Work MLS\Norwagian ports data'
output_folder = r'D:\Thesis Work MLS\Norway Data Filtered'
os.makedirs(output_folder, exist_ok=True)

# Inclusive date window: only files whose embedded date lies between these two
# timestamps are selected.
start_date = pd.to_datetime('2024-01-01')
end_date = pd.to_datetime('2024-04-30')

def file_in_date_range(filename):
    """Return whether *filename* is a ``hais_*.csv`` file dated inside the window.

    The ten characters after the ``hais_`` prefix (``filename[5:15]``) are
    parsed as a date and compared against ``start_date`` and ``end_date``
    (both bounds inclusive).

    Args:
        filename: Bare file name (no directory part) to test.

    Returns:
        True when the name has the expected prefix and suffix and its embedded
        date parses to a value within the window; False otherwise, including
        when the embedded text cannot be parsed as a date.
    """
    if not (filename.startswith('hais_') and filename.endswith('.csv')):
        return False
    try:
        file_date = pd.to_datetime(filename[5:15])
        # The comparison stays inside the try so that a value that parses but
        # cannot be compared with the bounds is still treated as "no match".
        return start_date <= file_date <= end_date
    except (ValueError, TypeError, OverflowError):
        # Only parse/compare failures mean "not a match"; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return False

# Names of the in-window daily files found in the input folder, in sorted
# (lexicographic) order.
files = sorted(
    name for name in os.listdir(input_folder) if file_in_date_range(name)
)

# Stream each selected file through pandas in chunks, drop the unwanted
# columns, and write the result under the same file name in output_folder.
# A failure on one file is reported and the loop moves on to the next file.
for file in files:
    file_path = os.path.join(input_folder, file)
    out_path = os.path.join(output_folder, file)
    row_count = 0
    write_header = True  # True only until the first chunk has been written

    try:
        # The context manager closes the input handle even if parsing or
        # writing fails part-way through the file.
        with pd.read_csv(
            file_path,
            chunksize=900000,
            dtype=str,            # keep every column as text; no type inference
            engine='python',      # <---- IMPORTANT FIX
            escapechar='\\',      # <---- IMPORTANT FIX
        ) as chunk_iter:
            for chunk in chunk_iter:
                # errors='ignore' tolerates files that lack some of the columns.
                chunk = chunk.drop(columns=columns_to_remove, errors='ignore')
                row_count += len(chunk)
                # The first chunk truncates any output left by an earlier run;
                # appending it would duplicate the header and rows on re-run.
                chunk.to_csv(
                    out_path,
                    mode='w' if write_header else 'a',
                    index=False,
                    header=write_header,
                )
                write_header = False
        print(f"{file}: {row_count} rows ✓")
    except Exception as e:
        print(f"❌ Error processing {file}: {e}")

print("All done!")


hais_2024-01-01.csv: 1264740 rows ✓
hais_2024-01-02.csv: 1168608 rows ✓
hais_2024-01-03.csv: 1213077 rows ✓
hais_2024-01-04.csv: 1294664 rows ✓
hais_2024-01-05.csv: 1340991 rows ✓
hais_2024-01-06.csv: 1221781 rows ✓
hais_2024-01-07.csv: 1123060 rows ✓
hais_2024-01-08.csv: 1208519 rows ✓
hais_2024-01-09.csv: 1198969 rows ✓
hais_2024-01-10.csv: 1238576 rows ✓
hais_2024-01-11.csv: 1295277 rows ✓
hais_2024-01-12.csv: 1317674 rows ✓
hais_2024-01-13.csv: 1455110 rows ✓
hais_2024-01-14.csv: 1351421 rows ✓
hais_2024-01-15.csv: 1400088 rows ✓
hais_2024-01-16.csv: 1190779 rows ✓
hais_2024-01-17.csv: 1253710 rows ✓
hais_2024-01-18.csv: 1377242 rows ✓
hais_2024-01-19.csv: 1412424 rows ✓
hais_2024-01-20.csv: 1424547 rows ✓
hais_2024-01-21.csv: 1276551 rows ✓
hais_2024-01-22.csv: 1250247 rows ✓
hais_2024-01-23.csv: 1221990 rows ✓
hais_2024-01-24.csv: 1230476 rows ✓
hais_2024-01-25.csv: 1288063 rows ✓
hais_2024-01-26.csv: 1274393 rows ✓
hais_2024-01-27.csv: 1381276 rows ✓
hais_2024-01-28.csv: 1319085