In [32]:
import geopandas as gpd
import pandas as pd 
import glob
import os
from shapely.geometry import Point



In [33]:
# Collect the per-ship-type CSV exports. Sorting makes the concatenated row
# order independent of the order in which the filesystem lists the files.
ship_csv_pattern = rf'D:\Datasets\Illegal Fishing\Original Data\Ship data\*.csv'
csv_files = sorted(glob.glob(ship_csv_pattern))
if not csv_files:
    # Without this guard pd.concat([]) fails with "No objects to concatenate",
    # which says nothing about the missing input files.
    raise FileNotFoundError(f'No CSV files matched {ship_csv_pattern!r}')

df_list = []

# Load each file, drop the unlabeled rows and tag the rest with the ship type
for file in csv_files:
    # The file name without its extension is the ship type label.
    # splitext removes only the trailing extension, whatever its letter case.
    shiptype = os.path.splitext(os.path.basename(file))[0]

    df = pd.read_csv(file)

    # Drop rows where is_fishing = -1. assign() returns a new frame, so the
    # shiptype column is never written into a filtered slice
    # (avoids SettingWithCopyWarning).
    df = df[df['is_fishing'] != -1].assign(shiptype=shiptype)
    df_list.append(df)

ship_data = pd.concat(df_list, ignore_index=True)

# The timestamp column is interpreted as seconds since the Unix epoch.
ship_data['timestamp'] = pd.to_datetime(ship_data['timestamp'], unit='s')

# Remove the 'Unnamed: 0' column when present; errors='ignore' makes this a
# no-op when it is absent.
ship_data = ship_data.drop(columns=['Unnamed: 0'], errors='ignore')


In [34]:
# Threshold helper used to binarize a value in [0, 1]
def map_range(x):
    """Map a value in [0, 1] to a binary label.

    Returns 0 for 0 <= x < 0.5 and 1 for 0.5 <= x <= 1.
    Any value outside [0, 1], including NaN, yields None.
    """
    # NaN fails every comparison, so it is rejected here as well.
    if not (0 <= x <= 1):
        return None
    return 0 if x < 0.5 else 1
     
# Replace every is_fishing value with the label returned by map_range
ship_data['is_fishing'] = ship_data['is_fishing'].apply(map_range)

# Integer code per gear type: each name's code is its position in this list
gear_type_names = [
    'drifting_longlines',
    'fixed_gear',
    'pole_and_line',
    'purse_seines',
    'trawlers',
    'trollers',
    'unknown',
]
gear_type_mapping = {name: code for code, name in enumerate(gear_type_names)}

# Series.map leaves NaN for any shiptype value that is not a key of the mapping
ship_data['gear_type_encoded'] = ship_data['shiptype'].map(gear_type_mapping)

# Persist the combined table
complete_ship_data_path = rf'D:\Datasets\Illegal Fishing\Processed Data\Complete_ship_data.csv'
ship_data.to_csv(complete_ship_data_path, index=False)

# Last expression of the cell: displays the label counts
ship_data['is_fishing'].value_counts()

is_fishing
0    300766
1    253089
Name: count, dtype: int64

In [None]:
# Load the ocean layer used to filter ship positions
gdb_path_ocean = rf'D:\Datasets\Illegal Fishing\Original Data\ne_110m_ocean\ne_110m_ocean.shp'
ocean_data = gpd.read_file(gdb_path_ocean)

# Build the point geometries from the lon/lat columns in one vectorized call
# instead of constructing one shapely Point per row in a Python loop.
geometry = gpd.points_from_xy(ship_data['lon'], ship_data['lat'])
ship_gdf = gpd.GeoDataFrame(ship_data, geometry=geometry, crs='EPSG:4326')

# Inner join with predicate 'within': only ship rows whose point lies within
# an ocean geometry are kept.
filtered_ships = gpd.sjoin(ship_gdf, ocean_data, how='inner', predicate='within')



In [None]:

# Keep only the ship columns plus the point geometry, dropping whatever the
# ocean spatial join appended. Every name listed here must exist in the frame
# (.loc raises KeyError otherwise), and 'is_fishing' has to stay because the
# illegal-status step reads it.
keep_cols = ['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
             'speed', 'course', 'lat', 'lon', 'is_fishing', 'source', 'shiptype',
             'gear_type_encoded', 'geometry']

filtered_ships = filtered_ships.loc[:, keep_cols]

# Save the filtered DataFrame to CSV
filtered_ships_path = rf'D:\Datasets\Illegal Fishing\Processed Data\Filtered_Ships_in_Ocean.csv'
filtered_ships.to_csv(filtered_ships_path, index=False)

print("Filtered ship data saved as CSV.")

# Check the columns after dropping
print(filtered_ships.columns)


Filtered ship data saved as CSV.
Index(['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon', 'is_fishing', 'source', 'shiptype',
       'gear_type_encoded', 'geometry'],
      dtype='object')


In [None]:

# Read the 'zoneassessment_geom' layer of the MPAtlas export and reproject it
# to EPSG:4326, the CRS assigned to the ship points.
mpa_gdb_path = r"D:\Datasets\Illegal Fishing\Original Data\mpatlas_export_geo_202503_BhMDKpV\mpatlas_export_geo_202503\mpatlas_export_geo_202503\mpatlas_export_geo_202503\mpatlas_export_mar2025.gdb"
mpa_layer = gpd.read_file(mpa_gdb_path, layer='zoneassessment_geom')
mpa_data = mpa_layer.to_crs(epsg=4326)

# Ship points that lie within one of the MPA geometries
intersections = gpd.sjoin(filtered_ships, mpa_data, how='inner', predicate='within')

# Distinct index labels of the matched ship rows
intersecting_indices = intersections.index.unique()

# Row-wise rule producing the 'illegal' label
def assign_illegal_status(row):
    """Return the 'illegal' label for one ship row.

    Reads the cell-level ``intersecting_indices``:
    - 'no'    when the row's index label is not in intersecting_indices;
    - 'yes'   when it is and row['is_fishing'] equals 1;
    - 'maybe' when it is and row['is_fishing'] is anything else.
    """
    if row.name not in intersecting_indices:
        return 'no'
    if row['is_fishing'] == 1:
        return 'yes'
    return 'maybe'

# Label every ship row by calling assign_illegal_status once per row.
# NOTE(review): this is a Python-level call per row; Index.isin could
# vectorize the membership test if this cell becomes slow.
filtered_ships['illegal'] = filtered_ships.apply(assign_illegal_status, axis=1)

# Write the labelled table without the geometry column
illegal_status_path = rf'D:\Datasets\Illegal Fishing\Processed Data\Filtered_Ships_with_Illegal_Status.csv'
ships_without_geometry = filtered_ships.drop(columns='geometry')
ships_without_geometry.to_csv(illegal_status_path, index=False)
