In [1]:
import geopandas as gpd
import pandas as pd 
import glob
import os


In [None]:
# Merge every per-ship-type CSV into one table. Each file's base name (without
# its extension) is recorded in a new 'shiptype' column.
csv_files = glob.glob(r'D:\Datasets\Illegal Fishing\Original Data\Ship data\*.csv')
if not csv_files:
    # Fail with a clear message instead of the opaque "No objects to
    # concatenate" error that pd.concat raises on an empty list.
    raise FileNotFoundError(
        r'No CSV files found in D:\Datasets\Illegal Fishing\Original Data\Ship data'
    )

df_list = []
# Loop through each file and add the shiptype column
for file in csv_files:
    df = pd.read_csv(file)

    # splitext strips only the trailing extension (whatever its case), whereas
    # str.replace('.csv', '') would also remove '.csv' from the middle of a name.
    shiptype = os.path.splitext(os.path.basename(file))[0]

    # Drop rows where is_fishing = -1. The explicit copy makes the filtered
    # frame independent of the one read from disk, so adding a column below
    # does not trigger pandas' chained-assignment warning.
    df = df[df['is_fishing'] != -1].copy()

    df['shiptype'] = shiptype
    df_list.append(df)

ship_data = pd.concat(df_list, ignore_index=True)

# 'timestamp' is interpreted as a number of seconds (unit='s') and converted to datetimes.
ship_data['timestamp'] = pd.to_datetime(ship_data['timestamp'], unit='s')

# Remove the leftover index column if any input file carried one.
ship_data = ship_data.drop(columns=['Unnamed: 0'], errors='ignore')

# NOTE(review): the index is written as the first (unnamed) column, exactly as
# before; pass index=False here if downstream readers do not rely on it.
ship_data.to_csv(r'D:\Datasets\Illegal Fishing\Processed Data\Complete_ship_data.csv')

In [None]:

# Base folders shared by every input and output path in this cell
original_dir = r'D:\Datasets\Illegal Fishing\Original Data'
processed_dir = r'D:\Datasets\Illegal Fishing\Processed Data'

# Inputs: the WDPA geodatabase and the ocean / rivers-lake-centerlines shapefiles
gdb_path = rf'{original_dir}\WDPA_Jun2024_Public.gdb'
gdb_path_ocean = rf'{original_dir}\ne_110m_ocean\ne_110m_ocean.shp'
gdb_path_lakes = rf'{original_dir}\ne_110m_rivers_lake_centerlines\ne_110m_rivers_lake_centerlines.shp'

# Outputs: where the filtered polygon and point layers are written
poly_filtered_path = rf'{processed_dir}\Filtered_Poly.shp'
point_filtered_path = rf'{processed_dir}\Filtered_Point.shp'

# Load both water layers, then stack their rows into a single GeoDataFrame.
# This is a row-wise concatenation; the geometries are not dissolved or unioned.
ocean_data, lake_data = (
    gpd.read_file(shp_path) for shp_path in (gdb_path_ocean, gdb_path_lakes)
)
water_bodies = pd.concat([ocean_data, lake_data])

# Load the polygon layer and the point layer from the geodatabase
poly_data, point_data = (
    gpd.read_file(gdb_path, layer=layer_name)
    for layer_name in ('WDPA_poly_Jun2024', 'WDPA_point_Jun2024')
)

# Manually chunk the poly_data and point_data
def process_in_chunks(data, water_bodies, output_path, chunk_size=1000):
    """Spatially join ``data`` against ``water_bodies`` in chunks and save the matches.

    ``data`` is sliced into consecutive runs of ``chunk_size`` rows. Each slice
    is inner-joined to ``water_bodies`` with the 'intersects' predicate, and
    the joined rows are written to ``output_path``.

    Parameters:
        data: GeoDataFrame holding the features to filter.
        water_bodies: GeoDataFrame holding the geometries to test against.
        output_path: Destination file. The first non-empty result is written
            with mode 'w'; later results are appended with mode 'a'.
        chunk_size: Number of rows of ``data`` joined per iteration.

    Returns:
        The total number of joined rows written. If nothing intersects, no file
        is written and 0 is returned; a file left at ``output_path`` by an
        earlier run is not removed in that case.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would give an empty range and silently do nothing.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Reproject the water layer once, up front, when both frames declare a CRS
    # and they differ; joining mismatched coordinates gives meaningless results.
    data_crs = getattr(data, 'crs', None)
    water_crs = getattr(water_bodies, 'crs', None)
    if data_crs is not None and water_crs is not None and data_crs != water_crs:
        water_bodies = water_bodies.to_crs(data_crs)

    rows_written = 0
    file_started = False
    total_len = len(data)
    for start in range(0, total_len, chunk_size):
        # iloc slicing clamps at the end of the frame, so no explicit min() is needed.
        chunk = data.iloc[start:start + chunk_size]

        # Spatial join for each chunk.
        # NOTE(review): a feature that intersects several water-body rows appears
        # once per match in the result; de-duplicate downstream if that matters.
        chunk_within_water = gpd.sjoin(chunk, water_bodies, how='inner', predicate='intersects')

        # Skip chunks with no matches so that an empty frame is never the first
        # write that the later append calls have to build on.
        if chunk_within_water.empty:
            continue

        # Decide create vs. append by whether anything was written yet, not by
        # the chunk position: the first chunk may well have been empty.
        chunk_within_water.to_file(output_path, mode='a' if file_started else 'w')
        file_started = True
        rows_written += len(chunk_within_water)

    return rows_written

# Run the chunked water-body filter on each layer in turn: polygons first, then points.
for layer_data, layer_output_path in (
    (poly_data, poly_filtered_path),
    (point_data, point_filtered_path),
):
    process_in_chunks(layer_data, water_bodies, layer_output_path)

print("Filtered polygon and point data saved.")


In [None]:

# File paths of the filtered shapefiles produced by the chunked spatial join
poly_file_path = r'D:\Datasets\Illegal Fishing\Processed Data\Filtered_Poly.shp'
point_file_path = r'D:\Datasets\Illegal Fishing\Processed Data\Filtered_Point.shp'

# Load the shapefiles
gdf_poly = gpd.read_file(poly_file_path)
gdf_point = gpd.read_file(point_file_path)

# Keep only the 'geometry' and 'STATUS_YR' columns
gdf_poly_reduced = gdf_poly[['geometry', 'STATUS_YR']]
gdf_point_reduced = gdf_point[['geometry', 'STATUS_YR']]

# Reproject to EPSG:4326.
# NOTE(review): to_crs needs the loaded layers to carry a CRS already; if a
# file was saved without one, assign it with set_crs before reprojecting.
gdf_poly_reduced = gdf_poly_reduced.to_crs(epsg=4326)
gdf_point_reduced = gdf_point_reduced.to_crs(epsg=4326)

# Destination paths for the reduced shapefiles. These live in a separate
# 'Filtered MPA data' folder, so the files loaded above are not overwritten.
Updated_poly_file_path = r'D:\Datasets\Illegal Fishing\Processed Data\Filtered MPA data\Filtered_Poly.shp'
Updated_point_file_path = r'D:\Datasets\Illegal Fishing\Processed Data\Filtered MPA data\Filtered_Point.shp'

# Make sure the destination folder exists before writing into it.
for updated_path in (Updated_poly_file_path, Updated_point_file_path):
    updated_dir = os.path.dirname(updated_path)
    if updated_dir:
        os.makedirs(updated_dir, exist_ok=True)

gdf_poly_reduced.to_file(Updated_poly_file_path)
gdf_point_reduced.to_file(Updated_point_file_path)

print("Shapefiles edited and saved successfully.")

